aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig61
-rw-r--r--fs/Kconfig.binfmt4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/dir.c9
-rw-r--r--fs/afs/inode.c3
-rw-r--r--fs/aio.c2
-rw-r--r--fs/binfmt_aout.c1
-rw-r--r--fs/binfmt_elf.c681
-rw-r--r--fs/bio.c8
-rw-r--r--fs/block_dev.c8
-rw-r--r--fs/buffer.c44
-rw-r--r--fs/char_dev.c6
-rw-r--r--fs/cifs/CHANGES5
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README28
-rw-r--r--fs/cifs/TODO14
-rw-r--r--fs/cifs/cifs_dfs_ref.c377
-rw-r--r--fs/cifs/cifs_fs_sb.h5
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsacl.c240
-rw-r--r--fs/cifs/cifsfs.c56
-rw-r--r--fs/cifs/cifsfs.h5
-rw-r--r--fs/cifs/cifsglob.h41
-rw-r--r--fs/cifs/cifspdu.h3
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c67
-rw-r--r--fs/cifs/connect.c24
-rw-r--r--fs/cifs/dir.c10
-rw-r--r--fs/cifs/dns_resolve.c124
-rw-r--r--fs/cifs/dns_resolve.h32
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/inode.c26
-rw-r--r--fs/cifs/link.c16
-rw-r--r--fs/cifs/sess.c10
-rw-r--r--fs/coda/psdev.c8
-rw-r--r--fs/compat.c4
-rw-r--r--fs/compat_binfmt_elf.c131
-rw-r--r--fs/compat_ioctl.c10
-rw-r--r--fs/configfs/dir.c5
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/configfs/mount.c13
-rw-r--r--fs/debugfs/inode.c13
-rw-r--r--fs/dlm/dir.c76
-rw-r--r--fs/dlm/dlm_internal.h16
-rw-r--r--fs/dlm/lock.c249
-rw-r--r--fs/dlm/lock.h2
-rw-r--r--fs/dlm/lockspace.c66
-rw-r--r--fs/dlm/lowcomms.c15
-rw-r--r--fs/dlm/main.c10
-rw-r--r--fs/dlm/member.c4
-rw-r--r--fs/dlm/member.h3
-rw-r--r--fs/dlm/memory.c32
-rw-r--r--fs/dlm/memory.h16
-rw-r--r--fs/dlm/midcomms.c15
-rw-r--r--fs/dlm/rcom.c25
-rw-r--r--fs/dlm/recover.c27
-rw-r--r--fs/dlm/recoverd.c11
-rw-r--r--fs/dlm/user.c29
-rw-r--r--fs/dlm/util.c82
-rw-r--r--fs/dquot.c36
-rw-r--r--fs/ecryptfs/crypto.c9
-rw-r--r--fs/ecryptfs/inode.c20
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/main.c135
-rw-r--r--fs/ecryptfs/messaging.c1
-rw-r--r--fs/ecryptfs/mmap.c31
-rw-r--r--fs/ecryptfs/netlink.c3
-rw-r--r--fs/ecryptfs/read_write.c27
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/ext2/super.c32
-rw-r--r--fs/ext3/super.c34
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/balloc.c247
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/extents.c481
-rw-r--r--fs/ext4/file.c23
-rw-r--r--fs/ext4/group.h8
-rw-r--r--fs/ext4/ialloc.c141
-rw-r--r--fs/ext4/inode.c360
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c4552
-rw-r--r--fs/ext4/migrate.c560
-rw-r--r--fs/ext4/namei.c135
-rw-r--r--fs/ext4/resize.c28
-rw-r--r--fs/ext4/super.c381
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--fs/fat/fatent.c28
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/inode.c26
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/bmap.c37
-rw-r--r--fs/gfs2/bmap.h2
-rw-r--r--fs/gfs2/daemon.c50
-rw-r--r--fs/gfs2/daemon.h1
-rw-r--r--fs/gfs2/dir.c4
-rw-r--r--fs/gfs2/eaops.c84
-rw-r--r--fs/gfs2/eattr.c2
-rw-r--r--fs/gfs2/glock.c83
-rw-r--r--fs/gfs2/glops.c110
-rw-r--r--fs/gfs2/incore.h47
-rw-r--r--fs/gfs2/inode.c41
-rw-r--r--fs/gfs2/inode.h12
-rw-r--r--fs/gfs2/locking/dlm/mount.c5
-rw-r--r--fs/gfs2/locking/dlm/plock.c18
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c38
-rw-r--r--fs/gfs2/locking/dlm/thread.c9
-rw-r--r--fs/gfs2/log.c119
-rw-r--r--fs/gfs2/log.h14
-rw-r--r--fs/gfs2/lops.c71
-rw-r--r--fs/gfs2/main.c3
-rw-r--r--fs/gfs2/meta_io.c97
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/ops_address.c649
-rw-r--r--fs/gfs2/ops_address.h7
-rw-r--r--fs/gfs2/ops_file.c229
-rw-r--r--fs/gfs2/ops_file.h24
-rw-r--r--fs/gfs2/ops_fstype.c73
-rw-r--r--fs/gfs2/ops_inode.c20
-rw-r--r--fs/gfs2/ops_inode.h6
-rw-r--r--fs/gfs2/ops_super.c1
-rw-r--r--fs/gfs2/ops_vm.c169
-rw-r--r--fs/gfs2/ops_vm.h18
-rw-r--r--fs/gfs2/quota.c29
-rw-r--r--fs/gfs2/recovery.c18
-rw-r--r--fs/gfs2/rgrp.c104
-rw-r--r--fs/gfs2/rgrp.h4
-rw-r--r--fs/gfs2/super.c25
-rw-r--r--fs/gfs2/sys.c36
-rw-r--r--fs/gfs2/trans.c5
-rw-r--r--fs/gfs2/trans.h1
-rw-r--r--fs/hfs/bfind.c12
-rw-r--r--fs/hfs/brec.c15
-rw-r--r--fs/hfs/btree.c20
-rw-r--r--fs/hfs/hfs.h5
-rw-r--r--fs/inode.c5
-rw-r--r--fs/ioprio.c30
-rw-r--r--fs/jbd/checkpoint.c3
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/jbd2/checkpoint.c25
-rw-r--r--fs/jbd2/commit.c257
-rw-r--r--fs/jbd2/journal.c368
-rw-r--r--fs/jbd2/recovery.c151
-rw-r--r--fs/jbd2/revoke.c6
-rw-r--r--fs/jbd2/transaction.c34
-rw-r--r--fs/jffs2/background.c2
-rw-r--r--fs/jfs/jfs_dtree.c27
-rw-r--r--fs/jfs/jfs_dtree.h4
-rw-r--r--fs/jfs/jfs_imap.c4
-rw-r--r--fs/jfs/jfs_logmgr.c34
-rw-r--r--fs/jfs/jfs_metapage.c43
-rw-r--r--fs/jfs/jfs_mount.c2
-rw-r--r--fs/jfs/jfs_umount.c4
-rw-r--r--fs/jfs/namei.c4
-rw-r--r--fs/jfs/resize.c2
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/lockd/clntlock.c42
-rw-r--r--fs/lockd/clntproc.c35
-rw-r--r--fs/lockd/xdr.c3
-rw-r--r--fs/namei.c4
-rw-r--r--fs/namespace.c11
-rw-r--r--fs/nfs/callback.c7
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c51
-rw-r--r--fs/nfs/callback_xdr.c6
-rw-r--r--fs/nfs/client.c358
-rw-r--r--fs/nfs/delegation.c110
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c63
-rw-r--r--fs/nfs/direct.c113
-rw-r--r--fs/nfs/file.c40
-rw-r--r--fs/nfs/getroot.c11
-rw-r--r--fs/nfs/idmap.c99
-rw-r--r--fs/nfs/inode.c34
-rw-r--r--fs/nfs/internal.h16
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs2xdr.c24
-rw-r--r--fs/nfs/nfs3proc.c45
-rw-r--r--fs/nfs/nfs3xdr.c27
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4namespace.c20
-rw-r--r--fs/nfs/nfs4proc.c286
-rw-r--r--fs/nfs/nfs4renewd.c2
-rw-r--r--fs/nfs/nfs4state.c46
-rw-r--r--fs/nfs/nfs4xdr.c40
-rw-r--r--fs/nfs/pagelist.c14
-rw-r--r--fs/nfs/proc.c30
-rw-r--r--fs/nfs/read.c45
-rw-r--r--fs/nfs/super.c309
-rw-r--r--fs/nfs/unlink.c45
-rw-r--r--fs/nfs/write.c102
-rw-r--r--fs/nfsd/nfs3xdr.c5
-rw-r--r--fs/nfsd/nfsxdr.c5
-rw-r--r--fs/ocfs2/Makefile5
-rw-r--r--fs/ocfs2/alloc.c76
-rw-r--r--fs/ocfs2/aops.c137
-rw-r--r--fs/ocfs2/buffer_head_io.c65
-rw-r--r--fs/ocfs2/buffer_head_io.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/masklog.c4
-rw-r--r--fs/ocfs2/cluster/sys.c83
-rw-r--r--fs/ocfs2/cluster/tcp.h4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h8
-rw-r--r--fs/ocfs2/cluster/ver.c2
-rw-r--r--fs/ocfs2/dcache.c8
-rw-r--r--fs/ocfs2/dir.c8
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c2
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c19
-rw-r--r--fs/ocfs2/dlm/dlmver.c2
-rw-r--r--fs/ocfs2/dlmglue.c546
-rw-r--r--fs/ocfs2/dlmglue.h31
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/file.c163
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/heartbeat.c80
-rw-r--r--fs/ocfs2/heartbeat.h2
-rw-r--r--fs/ocfs2/inode.c84
-rw-r--r--fs/ocfs2/inode.h10
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/journal.c64
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/localalloc.c50
-rw-r--r--fs/ocfs2/locks.c125
-rw-r--r--fs/ocfs2/locks.h (renamed from fs/ocfs2/vote.h)29
-rw-r--r--fs/ocfs2/mmap.c17
-rw-r--r--fs/ocfs2/namei.c66
-rw-r--r--fs/ocfs2/ocfs2.h35
-rw-r--r--fs/ocfs2/ocfs2_fs.h22
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/resize.c634
-rw-r--r--fs/ocfs2/resize.h32
-rw-r--r--fs/ocfs2/slot_map.c19
-rw-r--r--fs/ocfs2/slot_map.h2
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/ocfs2/suballoc.h8
-rw-r--r--fs/ocfs2/super.c140
-rw-r--r--fs/ocfs2/sysfile.c2
-rw-r--r--fs/ocfs2/ver.c2
-rw-r--r--fs/ocfs2/vote.c756
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/partitions/check.c327
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c98
-rw-r--r--fs/proc/generic.c7
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/proc_misc.c4
-rw-r--r--fs/proc/proc_net.c55
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/proc/task_nommu.c4
-rw-r--r--fs/read_write.c64
-rw-r--r--fs/smbfs/Makefile20
-rw-r--r--fs/splice.c32
-rw-r--r--fs/sysfs/dir.c10
-rw-r--r--fs/sysfs/file.c78
-rw-r--r--fs/sysfs/group.c26
-rw-r--r--fs/sysfs/symlink.c88
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c37
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c129
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c8
-rw-r--r--fs/xfs/quota/xfs_qm.c3
-rw-r--r--fs/xfs/xfs_dir2_block.c6
-rw-r--r--fs/xfs/xfs_dir2_leaf.c2
-rw-r--r--fs/xfs/xfs_dir2_sf.c9
-rw-r--r--fs/xfs/xfs_iget.c2
-rw-r--r--fs/xfs/xfs_inode.c6
-rw-r--r--fs/xfs/xfs_itable.c43
269 files changed, 14497 insertions, 5547 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 635f3e286ad8..219ec06a8c7e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -236,6 +236,7 @@ config JBD_DEBUG
236 236
237config JBD2 237config JBD2
238 tristate 238 tristate
239 select CRC32
239 help 240 help
240 This is a generic journaling layer for block devices that support 241 This is a generic journaling layer for block devices that support
241 both 32-bit and 64-bit block numbers. It is currently used by 242 both 32-bit and 64-bit block numbers. It is currently used by
@@ -440,14 +441,8 @@ config OCFS2_FS
440 Tools web page: http://oss.oracle.com/projects/ocfs2-tools 441 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
441 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ 442 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
442 443
443 Note: Features which OCFS2 does not support yet: 444 For more information on OCFS2, see the file
444 - extended attributes 445 <file:Documentation/filesystems/ocfs2.txt>.
445 - quotas
446 - cluster aware flock
447 - Directory change notification (F_NOTIFY)
448 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
449 - POSIX ACLs
450 - readpages / writepages (not user visible)
451 446
452config OCFS2_DEBUG_MASKLOG 447config OCFS2_DEBUG_MASKLOG
453 bool "OCFS2 logging support" 448 bool "OCFS2 logging support"
@@ -1028,8 +1023,8 @@ config HUGETLB_PAGE
1028 def_bool HUGETLBFS 1023 def_bool HUGETLBFS
1029 1024
1030config CONFIGFS_FS 1025config CONFIGFS_FS
1031 tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" 1026 tristate "Userspace-driven configuration filesystem"
1032 depends on SYSFS && EXPERIMENTAL 1027 depends on SYSFS
1033 help 1028 help
1034 configfs is a ram-based filesystem that provides the converse 1029 configfs is a ram-based filesystem that provides the converse
1035 of sysfs's functionality. Where sysfs is a filesystem-based 1030 of sysfs's functionality. Where sysfs is a filesystem-based
@@ -1112,8 +1107,8 @@ config HFS_FS
1112 help 1107 help
1113 If you say Y here, you will be able to mount Macintosh-formatted 1108 If you say Y here, you will be able to mount Macintosh-formatted
1114 floppy disks and hard drive partitions with full read-write access. 1109 floppy disks and hard drive partitions with full read-write access.
1115 Please read <file:fs/hfs/HFS.txt> to learn about the available mount 1110 Please read <file:Documentation/filesystems/hfs.txt> to learn about
1116 options. 1111 the available mount options.
1117 1112
1118 To compile this file system support as a module, choose M here: the 1113 To compile this file system support as a module, choose M here: the
1119 module will be called hfs. 1114 module will be called hfs.
@@ -1305,7 +1300,7 @@ config JFFS2_COMPRESSION_OPTIONS
1305 help 1300 help
1306 Enabling this option allows you to explicitly choose which 1301 Enabling this option allows you to explicitly choose which
1307 compression modules, if any, are enabled in JFFS2. Removing 1302 compression modules, if any, are enabled in JFFS2. Removing
1308 compressors and mean you cannot read existing file systems, 1303 compressors can mean you cannot read existing file systems,
1309 and enabling experimental compressors can mean that you 1304 and enabling experimental compressors can mean that you
1310 write a file system which cannot be read by a standard kernel. 1305 write a file system which cannot be read by a standard kernel.
1311 1306
@@ -1905,13 +1900,15 @@ config CIFS
1905 file servers such as Windows 2000 (including Windows 2003, NT 4 1900 file servers such as Windows 2000 (including Windows 2003, NT 4
1906 and Windows XP) as well by Samba (which provides excellent CIFS 1901 and Windows XP) as well by Samba (which provides excellent CIFS
1907 server support for Linux and many other operating systems). Limited 1902 server support for Linux and many other operating systems). Limited
1908 support for OS/2 and Windows ME and similar servers is provided as well. 1903 support for OS/2 and Windows ME and similar servers is provided as
1909 1904 well.
1910 The intent of the cifs module is to provide an advanced 1905
1911 network file system client for mounting to CIFS compliant servers, 1906 The cifs module provides an advanced network file system
1912 including support for dfs (hierarchical name space), secure per-user 1907 client for mounting to CIFS compliant servers. It includes
1913 session establishment, safe distributed caching (oplock), optional 1908 support for DFS (hierarchical name space), secure per-user
1914 packet signing, Unicode and other internationalization improvements. 1909 session establishment via Kerberos or NTLM or NTLMv2,
1910 safe distributed caching (oplock), optional packet
1911 signing, Unicode and other internationalization improvements.
1915 If you need to mount to Samba or Windows from this machine, say Y. 1912 If you need to mount to Samba or Windows from this machine, say Y.
1916 1913
1917config CIFS_STATS 1914config CIFS_STATS
@@ -1943,7 +1940,8 @@ config CIFS_WEAK_PW_HASH
1943 (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) 1940 (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
1944 security mechanisms. These hash the password more securely 1941 security mechanisms. These hash the password more securely
1945 than the mechanisms used in the older LANMAN version of the 1942 than the mechanisms used in the older LANMAN version of the
1946 SMB protocol needed to establish sessions with old SMB servers. 1943 SMB protocol but LANMAN based authentication is needed to
1944 establish sessions with some old SMB servers.
1947 1945
1948 Enabling this option allows the cifs module to mount to older 1946 Enabling this option allows the cifs module to mount to older
1949 LANMAN based servers such as OS/2 and Windows 95, but such 1947 LANMAN based servers such as OS/2 and Windows 95, but such
@@ -1951,8 +1949,8 @@ config CIFS_WEAK_PW_HASH
1951 security mechanisms if you are on a public network. Unless you 1949 security mechanisms if you are on a public network. Unless you
1952 have a need to access old SMB servers (and are on a private 1950 have a need to access old SMB servers (and are on a private
1953 network) you probably want to say N. Even if this support 1951 network) you probably want to say N. Even if this support
1954 is enabled in the kernel build, they will not be used 1952 is enabled in the kernel build, LANMAN authentication will not be
1955 automatically. At runtime LANMAN mounts are disabled but 1953 used automatically. At runtime LANMAN mounts are disabled but
1956 can be set to required (or optional) either in 1954 can be set to required (or optional) either in
1957 /proc/fs/cifs (see fs/cifs/README for more detail) or via an 1955 /proc/fs/cifs (see fs/cifs/README for more detail) or via an
1958 option on the mount command. This support is disabled by 1956 option on the mount command. This support is disabled by
@@ -2018,12 +2016,22 @@ config CIFS_UPCALL
2018 depends on CIFS_EXPERIMENTAL 2016 depends on CIFS_EXPERIMENTAL
2019 depends on KEYS 2017 depends on KEYS
2020 help 2018 help
2021 Enables an upcall mechanism for CIFS which will be used to contact 2019 Enables an upcall mechanism for CIFS which accesses
2022 userspace helper utilities to provide SPNEGO packaged Kerberos 2020 userspace helper utilities to provide SPNEGO packaged (RFC 4178)
2023 tickets which are needed to mount to certain secure servers 2021 Kerberos tickets which are needed to mount to certain secure servers
2024 (for which more secure Kerberos authentication is required). If 2022 (for which more secure Kerberos authentication is required). If
2025 unsure, say N. 2023 unsure, say N.
2026 2024
2025config CIFS_DFS_UPCALL
2026 bool "DFS feature support (EXPERIMENTAL)"
2027 depends on CIFS_EXPERIMENTAL
2028 depends on KEYS
2029 help
2030 Enables an upcall mechanism for CIFS which contacts userspace
2031 helper utilities to provide server name resolution (host names to
2032 IP addresses) which is needed for implicit mounts of DFS junction
2033 points. If unsure, say N.
2034
2027config NCP_FS 2035config NCP_FS
2028 tristate "NCP file system support (to mount NetWare volumes)" 2036 tristate "NCP file system support (to mount NetWare volumes)"
2029 depends on IPX!=n || INET 2037 depends on IPX!=n || INET
@@ -2130,4 +2138,3 @@ source "fs/nls/Kconfig"
2130source "fs/dlm/Kconfig" 2138source "fs/dlm/Kconfig"
2131 2139
2132endmenu 2140endmenu
2133
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index d4fc6095466d..7c3d5f923da1 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -23,6 +23,10 @@ config BINFMT_ELF
23 ld.so (check the file <file:Documentation/Changes> for location and 23 ld.so (check the file <file:Documentation/Changes> for location and
24 latest version). 24 latest version).
25 25
26config COMPAT_BINFMT_ELF
27 bool
28 depends on COMPAT && MMU
29
26config BINFMT_ELF_FDPIC 30config BINFMT_ELF_FDPIC
27 bool "Kernel support for FDPIC ELF binaries" 31 bool "Kernel support for FDPIC ELF binaries"
28 default y 32 default y
diff --git a/fs/Makefile b/fs/Makefile
index 500cf15cdb4b..1e7a11bd4da1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
39obj-y += binfmt_script.o 39obj-y += binfmt_script.o
40 40
41obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o 41obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
42obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
42obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o 43obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
43obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o 44obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
44obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o 45obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 33fe39ad4e03..0cc3597c1197 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -546,11 +546,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
546 dentry->d_op = &afs_fs_dentry_operations; 546 dentry->d_op = &afs_fs_dentry_operations;
547 547
548 d_add(dentry, inode); 548 d_add(dentry, inode);
549 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }", 549 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
550 fid.vnode, 550 fid.vnode,
551 fid.unique, 551 fid.unique,
552 dentry->d_inode->i_ino, 552 dentry->d_inode->i_ino,
553 dentry->d_inode->i_version); 553 (unsigned long long)dentry->d_inode->i_version);
554 554
555 return NULL; 555 return NULL;
556} 556}
@@ -630,9 +630,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
630 * been deleted and replaced, and the original vnode ID has 630 * been deleted and replaced, and the original vnode ID has
631 * been reused */ 631 * been reused */
632 if (fid.unique != vnode->fid.unique) { 632 if (fid.unique != vnode->fid.unique) {
633 _debug("%s: file deleted (uq %u -> %u I:%lu)", 633 _debug("%s: file deleted (uq %u -> %u I:%llu)",
634 dentry->d_name.name, fid.unique, 634 dentry->d_name.name, fid.unique,
635 vnode->fid.unique, dentry->d_inode->i_version); 635 vnode->fid.unique,
636 (unsigned long long)dentry->d_inode->i_version);
636 spin_lock(&vnode->lock); 637 spin_lock(&vnode->lock);
637 set_bit(AFS_VNODE_DELETED, &vnode->flags); 638 set_bit(AFS_VNODE_DELETED, &vnode->flags);
638 spin_unlock(&vnode->lock); 639 spin_unlock(&vnode->lock);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d196840127c6..84750c8e9f95 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -301,7 +301,8 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
301 301
302 inode = dentry->d_inode; 302 inode = dentry->d_inode;
303 303
304 _enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version); 304 _enter("{ ino=%lu v=%llu }", inode->i_ino,
305 (unsigned long long)inode->i_version);
305 306
306 generic_fillattr(inode, stat); 307 generic_fillattr(inode, stat);
307 return 0; 308 return 0;
diff --git a/fs/aio.c b/fs/aio.c
index 9dec7d2d546e..8a37dbbf3437 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -397,7 +397,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
397 * This prevents races between the aio code path referencing the 397 * This prevents races between the aio code path referencing the
398 * req (after submitting it) and aio_complete() freeing the req. 398 * req (after submitting it) and aio_complete() freeing the req.
399 */ 399 */
400static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx)); 400static struct kiocb *__aio_get_req(struct kioctx *ctx);
401static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx) 401static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
402{ 402{
403 struct kiocb *req = NULL; 403 struct kiocb *req = NULL;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index e176d195e7e5..7596e1e94cde 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -319,7 +319,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
319 current->mm->free_area_cache = current->mm->mmap_base; 319 current->mm->free_area_cache = current->mm->mmap_base;
320 current->mm->cached_hole_size = 0; 320 current->mm->cached_hole_size = 0;
321 321
322 current->mm->mmap = NULL;
323 compute_creds(bprm); 322 compute_creds(bprm);
324 current->flags &= ~PF_FORKNOEXEC; 323 current->flags &= ~PF_FORKNOEXEC;
325#ifdef __sparc__ 324#ifdef __sparc__
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ba8de7ca260b..18ed6dd906c1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,8 @@
45 45
46static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); 46static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
47static int load_elf_library(struct file *); 47static int load_elf_library(struct file *);
48static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int); 48static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
49 int, int, unsigned long);
49 50
50/* 51/*
51 * If we don't support core dumping, then supply a NULL so we 52 * If we don't support core dumping, then supply a NULL so we
@@ -298,33 +299,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
298#ifndef elf_map 299#ifndef elf_map
299 300
300static unsigned long elf_map(struct file *filep, unsigned long addr, 301static unsigned long elf_map(struct file *filep, unsigned long addr,
301 struct elf_phdr *eppnt, int prot, int type) 302 struct elf_phdr *eppnt, int prot, int type,
303 unsigned long total_size)
302{ 304{
303 unsigned long map_addr; 305 unsigned long map_addr;
304 unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); 306 unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
307 unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
308 addr = ELF_PAGESTART(addr);
309 size = ELF_PAGEALIGN(size);
305 310
306 down_write(&current->mm->mmap_sem);
307 /* mmap() will return -EINVAL if given a zero size, but a 311 /* mmap() will return -EINVAL if given a zero size, but a
308 * segment with zero filesize is perfectly valid */ 312 * segment with zero filesize is perfectly valid */
309 if (eppnt->p_filesz + pageoffset) 313 if (!size)
310 map_addr = do_mmap(filep, ELF_PAGESTART(addr), 314 return addr;
311 eppnt->p_filesz + pageoffset, prot, type, 315
312 eppnt->p_offset - pageoffset); 316 down_write(&current->mm->mmap_sem);
313 else 317 /*
314 map_addr = ELF_PAGESTART(addr); 318 * total_size is the size of the ELF (interpreter) image.
319 * The _first_ mmap needs to know the full size, otherwise
320 * randomization might put this image into an overlapping
321 * position with the ELF binary image. (since size < total_size)
322 * So we first map the 'big' image - and unmap the remainder at
323 * the end. (which unmap is needed for ELF images with holes.)
324 */
325 if (total_size) {
326 total_size = ELF_PAGEALIGN(total_size);
327 map_addr = do_mmap(filep, addr, total_size, prot, type, off);
328 if (!BAD_ADDR(map_addr))
329 do_munmap(current->mm, map_addr+size, total_size-size);
330 } else
331 map_addr = do_mmap(filep, addr, size, prot, type, off);
332
315 up_write(&current->mm->mmap_sem); 333 up_write(&current->mm->mmap_sem);
316 return(map_addr); 334 return(map_addr);
317} 335}
318 336
319#endif /* !elf_map */ 337#endif /* !elf_map */
320 338
339static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
340{
341 int i, first_idx = -1, last_idx = -1;
342
343 for (i = 0; i < nr; i++) {
344 if (cmds[i].p_type == PT_LOAD) {
345 last_idx = i;
346 if (first_idx == -1)
347 first_idx = i;
348 }
349 }
350 if (first_idx == -1)
351 return 0;
352
353 return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
354 ELF_PAGESTART(cmds[first_idx].p_vaddr);
355}
356
357
321/* This is much more generalized than the library routine read function, 358/* This is much more generalized than the library routine read function,
322 so we keep this separate. Technically the library read function 359 so we keep this separate. Technically the library read function
323 is only provided so that we can read a.out libraries that have 360 is only provided so that we can read a.out libraries that have
324 an ELF header */ 361 an ELF header */
325 362
326static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, 363static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
327 struct file *interpreter, unsigned long *interp_load_addr) 364 struct file *interpreter, unsigned long *interp_map_addr,
365 unsigned long no_base)
328{ 366{
329 struct elf_phdr *elf_phdata; 367 struct elf_phdr *elf_phdata;
330 struct elf_phdr *eppnt; 368 struct elf_phdr *eppnt;
@@ -332,6 +370,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
332 int load_addr_set = 0; 370 int load_addr_set = 0;
333 unsigned long last_bss = 0, elf_bss = 0; 371 unsigned long last_bss = 0, elf_bss = 0;
334 unsigned long error = ~0UL; 372 unsigned long error = ~0UL;
373 unsigned long total_size;
335 int retval, i, size; 374 int retval, i, size;
336 375
337 /* First of all, some simple consistency checks */ 376 /* First of all, some simple consistency checks */
@@ -370,6 +409,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
370 goto out_close; 409 goto out_close;
371 } 410 }
372 411
412 total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
413 if (!total_size) {
414 error = -EINVAL;
415 goto out_close;
416 }
417
373 eppnt = elf_phdata; 418 eppnt = elf_phdata;
374 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { 419 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
375 if (eppnt->p_type == PT_LOAD) { 420 if (eppnt->p_type == PT_LOAD) {
@@ -387,9 +432,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
387 vaddr = eppnt->p_vaddr; 432 vaddr = eppnt->p_vaddr;
388 if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) 433 if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
389 elf_type |= MAP_FIXED; 434 elf_type |= MAP_FIXED;
435 else if (no_base && interp_elf_ex->e_type == ET_DYN)
436 load_addr = -vaddr;
390 437
391 map_addr = elf_map(interpreter, load_addr + vaddr, 438 map_addr = elf_map(interpreter, load_addr + vaddr,
392 eppnt, elf_prot, elf_type); 439 eppnt, elf_prot, elf_type, total_size);
440 total_size = 0;
441 if (!*interp_map_addr)
442 *interp_map_addr = map_addr;
393 error = map_addr; 443 error = map_addr;
394 if (BAD_ADDR(map_addr)) 444 if (BAD_ADDR(map_addr))
395 goto out_close; 445 goto out_close;
@@ -455,8 +505,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
455 goto out_close; 505 goto out_close;
456 } 506 }
457 507
458 *interp_load_addr = load_addr; 508 error = load_addr;
459 error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
460 509
461out_close: 510out_close:
462 kfree(elf_phdata); 511 kfree(elf_phdata);
@@ -546,14 +595,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
546 int load_addr_set = 0; 595 int load_addr_set = 0;
547 char * elf_interpreter = NULL; 596 char * elf_interpreter = NULL;
548 unsigned int interpreter_type = INTERPRETER_NONE; 597 unsigned int interpreter_type = INTERPRETER_NONE;
549 unsigned char ibcs2_interpreter = 0;
550 unsigned long error; 598 unsigned long error;
551 struct elf_phdr *elf_ppnt, *elf_phdata; 599 struct elf_phdr *elf_ppnt, *elf_phdata;
552 unsigned long elf_bss, elf_brk; 600 unsigned long elf_bss, elf_brk;
553 int elf_exec_fileno; 601 int elf_exec_fileno;
554 int retval, i; 602 int retval, i;
555 unsigned int size; 603 unsigned int size;
556 unsigned long elf_entry, interp_load_addr = 0; 604 unsigned long elf_entry;
605 unsigned long interp_load_addr = 0;
557 unsigned long start_code, end_code, start_data, end_data; 606 unsigned long start_code, end_code, start_data, end_data;
558 unsigned long reloc_func_desc = 0; 607 unsigned long reloc_func_desc = 0;
559 char passed_fileno[6]; 608 char passed_fileno[6];
@@ -663,14 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
663 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') 712 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
664 goto out_free_interp; 713 goto out_free_interp;
665 714
666 /* If the program interpreter is one of these two,
667 * then assume an iBCS2 image. Otherwise assume
668 * a native linux image.
669 */
670 if (strcmp(elf_interpreter,"/usr/lib/libc.so.1") == 0 ||
671 strcmp(elf_interpreter,"/usr/lib/ld.so.1") == 0)
672 ibcs2_interpreter = 1;
673
674 /* 715 /*
675 * The early SET_PERSONALITY here is so that the lookup 716 * The early SET_PERSONALITY here is so that the lookup
676 * for the interpreter happens in the namespace of the 717 * for the interpreter happens in the namespace of the
@@ -690,7 +731,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
690 * switch really is going to happen - do this in 731 * switch really is going to happen - do this in
691 * flush_thread(). - akpm 732 * flush_thread(). - akpm
692 */ 733 */
693 SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); 734 SET_PERSONALITY(loc->elf_ex, 0);
694 735
695 interpreter = open_exec(elf_interpreter); 736 interpreter = open_exec(elf_interpreter);
696 retval = PTR_ERR(interpreter); 737 retval = PTR_ERR(interpreter);
@@ -769,7 +810,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
769 goto out_free_dentry; 810 goto out_free_dentry;
770 } else { 811 } else {
771 /* Executables without an interpreter also need a personality */ 812 /* Executables without an interpreter also need a personality */
772 SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); 813 SET_PERSONALITY(loc->elf_ex, 0);
773 } 814 }
774 815
775 /* OK, we are done with that, now set up the arg stuff, 816 /* OK, we are done with that, now set up the arg stuff,
@@ -803,7 +844,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
803 844
804 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 845 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
805 may depend on the personality. */ 846 may depend on the personality. */
806 SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); 847 SET_PERSONALITY(loc->elf_ex, 0);
807 if (elf_read_implies_exec(loc->elf_ex, executable_stack)) 848 if (elf_read_implies_exec(loc->elf_ex, executable_stack))
808 current->personality |= READ_IMPLIES_EXEC; 849 current->personality |= READ_IMPLIES_EXEC;
809 850
@@ -825,9 +866,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
825 current->mm->start_stack = bprm->p; 866 current->mm->start_stack = bprm->p;
826 867
827 /* Now we do a little grungy work by mmaping the ELF image into 868 /* Now we do a little grungy work by mmaping the ELF image into
828 the correct location in memory. At this point, we assume that 869 the correct location in memory. */
829 the image should be loaded at fixed address, not at a variable
830 address. */
831 for(i = 0, elf_ppnt = elf_phdata; 870 for(i = 0, elf_ppnt = elf_phdata;
832 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 871 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
833 int elf_prot = 0, elf_flags; 872 int elf_prot = 0, elf_flags;
@@ -881,11 +920,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
881 * default mmap base, as well as whatever program they 920 * default mmap base, as well as whatever program they
882 * might try to exec. This is because the brk will 921 * might try to exec. This is because the brk will
883 * follow the loader, and is not movable. */ 922 * follow the loader, and is not movable. */
923#ifdef CONFIG_X86
924 load_bias = 0;
925#else
884 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 926 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
927#endif
885 } 928 }
886 929
887 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 930 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
888 elf_prot, elf_flags); 931 elf_prot, elf_flags, 0);
889 if (BAD_ADDR(error)) { 932 if (BAD_ADDR(error)) {
890 send_sig(SIGKILL, current, 0); 933 send_sig(SIGKILL, current, 0);
891 retval = IS_ERR((void *)error) ? 934 retval = IS_ERR((void *)error) ?
@@ -961,13 +1004,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
961 } 1004 }
962 1005
963 if (elf_interpreter) { 1006 if (elf_interpreter) {
964 if (interpreter_type == INTERPRETER_AOUT) 1007 if (interpreter_type == INTERPRETER_AOUT) {
965 elf_entry = load_aout_interp(&loc->interp_ex, 1008 elf_entry = load_aout_interp(&loc->interp_ex,
966 interpreter); 1009 interpreter);
967 else 1010 } else {
1011 unsigned long uninitialized_var(interp_map_addr);
1012
968 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1013 elf_entry = load_elf_interp(&loc->interp_elf_ex,
969 interpreter, 1014 interpreter,
970 &interp_load_addr); 1015 &interp_map_addr,
1016 load_bias);
1017 if (!IS_ERR((void *)elf_entry)) {
1018 /*
1019 * load_elf_interp() returns relocation
1020 * adjustment
1021 */
1022 interp_load_addr = elf_entry;
1023 elf_entry += loc->interp_elf_ex.e_entry;
1024 }
1025 }
971 if (BAD_ADDR(elf_entry)) { 1026 if (BAD_ADDR(elf_entry)) {
972 force_sig(SIGSEGV, current); 1027 force_sig(SIGSEGV, current);
973 retval = IS_ERR((void *)elf_entry) ? 1028 retval = IS_ERR((void *)elf_entry) ?
@@ -1021,6 +1076,12 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
1021 current->mm->end_data = end_data; 1076 current->mm->end_data = end_data;
1022 current->mm->start_stack = bprm->p; 1077 current->mm->start_stack = bprm->p;
1023 1078
1079#ifdef arch_randomize_brk
1080 if (current->flags & PF_RANDOMIZE)
1081 current->mm->brk = current->mm->start_brk =
1082 arch_randomize_brk(current->mm);
1083#endif
1084
1024 if (current->personality & MMAP_PAGE_ZERO) { 1085 if (current->personality & MMAP_PAGE_ZERO) {
1025 /* Why this, you ask??? Well SVr4 maps page 0 as read-only, 1086 /* Why this, you ask??? Well SVr4 maps page 0 as read-only,
1026 and some applications "depend" upon this behavior. 1087 and some applications "depend" upon this behavior.
@@ -1325,7 +1386,8 @@ static int writenote(struct memelfnote *men, struct file *file,
1325 if (!dump_seek(file, (off))) \ 1386 if (!dump_seek(file, (off))) \
1326 goto end_coredump; 1387 goto end_coredump;
1327 1388
1328static void fill_elf_header(struct elfhdr *elf, int segs) 1389static void fill_elf_header(struct elfhdr *elf, int segs,
1390 u16 machine, u32 flags, u8 osabi)
1329{ 1391{
1330 memcpy(elf->e_ident, ELFMAG, SELFMAG); 1392 memcpy(elf->e_ident, ELFMAG, SELFMAG);
1331 elf->e_ident[EI_CLASS] = ELF_CLASS; 1393 elf->e_ident[EI_CLASS] = ELF_CLASS;
@@ -1335,12 +1397,12 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
1335 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); 1397 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
1336 1398
1337 elf->e_type = ET_CORE; 1399 elf->e_type = ET_CORE;
1338 elf->e_machine = ELF_ARCH; 1400 elf->e_machine = machine;
1339 elf->e_version = EV_CURRENT; 1401 elf->e_version = EV_CURRENT;
1340 elf->e_entry = 0; 1402 elf->e_entry = 0;
1341 elf->e_phoff = sizeof(struct elfhdr); 1403 elf->e_phoff = sizeof(struct elfhdr);
1342 elf->e_shoff = 0; 1404 elf->e_shoff = 0;
1343 elf->e_flags = ELF_CORE_EFLAGS; 1405 elf->e_flags = flags;
1344 elf->e_ehsize = sizeof(struct elfhdr); 1406 elf->e_ehsize = sizeof(struct elfhdr);
1345 elf->e_phentsize = sizeof(struct elf_phdr); 1407 elf->e_phentsize = sizeof(struct elf_phdr);
1346 elf->e_phnum = segs; 1408 elf->e_phnum = segs;
@@ -1384,7 +1446,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1384 prstatus->pr_sigpend = p->pending.signal.sig[0]; 1446 prstatus->pr_sigpend = p->pending.signal.sig[0];
1385 prstatus->pr_sighold = p->blocked.sig[0]; 1447 prstatus->pr_sighold = p->blocked.sig[0];
1386 prstatus->pr_pid = task_pid_vnr(p); 1448 prstatus->pr_pid = task_pid_vnr(p);
1387 prstatus->pr_ppid = task_pid_vnr(p->parent); 1449 prstatus->pr_ppid = task_pid_vnr(p->real_parent);
1388 prstatus->pr_pgrp = task_pgrp_vnr(p); 1450 prstatus->pr_pgrp = task_pgrp_vnr(p);
1389 prstatus->pr_sid = task_session_vnr(p); 1451 prstatus->pr_sid = task_session_vnr(p);
1390 if (thread_group_leader(p)) { 1452 if (thread_group_leader(p)) {
@@ -1430,7 +1492,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1430 psinfo->pr_psargs[len] = 0; 1492 psinfo->pr_psargs[len] = 0;
1431 1493
1432 psinfo->pr_pid = task_pid_vnr(p); 1494 psinfo->pr_pid = task_pid_vnr(p);
1433 psinfo->pr_ppid = task_pid_vnr(p->parent); 1495 psinfo->pr_ppid = task_pid_vnr(p->real_parent);
1434 psinfo->pr_pgrp = task_pgrp_vnr(p); 1496 psinfo->pr_pgrp = task_pgrp_vnr(p);
1435 psinfo->pr_sid = task_session_vnr(p); 1497 psinfo->pr_sid = task_session_vnr(p);
1436 1498
@@ -1447,6 +1509,238 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1447 return 0; 1509 return 0;
1448} 1510}
1449 1511
1512static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
1513{
1514 elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
1515 int i = 0;
1516 do
1517 i += 2;
1518 while (auxv[i - 2] != AT_NULL);
1519 fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
1520}
1521
1522#ifdef CORE_DUMP_USE_REGSET
1523#include <linux/regset.h>
1524
1525struct elf_thread_core_info {
1526 struct elf_thread_core_info *next;
1527 struct task_struct *task;
1528 struct elf_prstatus prstatus;
1529 struct memelfnote notes[0];
1530};
1531
1532struct elf_note_info {
1533 struct elf_thread_core_info *thread;
1534 struct memelfnote psinfo;
1535 struct memelfnote auxv;
1536 size_t size;
1537 int thread_notes;
1538};
1539
1540static int fill_thread_core_info(struct elf_thread_core_info *t,
1541 const struct user_regset_view *view,
1542 long signr, size_t *total)
1543{
1544 unsigned int i;
1545
1546 /*
1547 * NT_PRSTATUS is the one special case, because the regset data
1548 * goes into the pr_reg field inside the note contents, rather
1549 * than being the whole note contents. We fill the reset in here.
1550 * We assume that regset 0 is NT_PRSTATUS.
1551 */
1552 fill_prstatus(&t->prstatus, t->task, signr);
1553 (void) view->regsets[0].get(t->task, &view->regsets[0],
1554 0, sizeof(t->prstatus.pr_reg),
1555 &t->prstatus.pr_reg, NULL);
1556
1557 fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
1558 sizeof(t->prstatus), &t->prstatus);
1559 *total += notesize(&t->notes[0]);
1560
1561 /*
1562 * Each other regset might generate a note too. For each regset
1563 * that has no core_note_type or is inactive, we leave t->notes[i]
1564 * all zero and we'll know to skip writing it later.
1565 */
1566 for (i = 1; i < view->n; ++i) {
1567 const struct user_regset *regset = &view->regsets[i];
1568 if (regset->core_note_type &&
1569 (!regset->active || regset->active(t->task, regset))) {
1570 int ret;
1571 size_t size = regset->n * regset->size;
1572 void *data = kmalloc(size, GFP_KERNEL);
1573 if (unlikely(!data))
1574 return 0;
1575 ret = regset->get(t->task, regset,
1576 0, size, data, NULL);
1577 if (unlikely(ret))
1578 kfree(data);
1579 else {
1580 if (regset->core_note_type != NT_PRFPREG)
1581 fill_note(&t->notes[i], "LINUX",
1582 regset->core_note_type,
1583 size, data);
1584 else {
1585 t->prstatus.pr_fpvalid = 1;
1586 fill_note(&t->notes[i], "CORE",
1587 NT_PRFPREG, size, data);
1588 }
1589 *total += notesize(&t->notes[i]);
1590 }
1591 }
1592 }
1593
1594 return 1;
1595}
1596
1597static int fill_note_info(struct elfhdr *elf, int phdrs,
1598 struct elf_note_info *info,
1599 long signr, struct pt_regs *regs)
1600{
1601 struct task_struct *dump_task = current;
1602 const struct user_regset_view *view = task_user_regset_view(dump_task);
1603 struct elf_thread_core_info *t;
1604 struct elf_prpsinfo *psinfo;
1605 struct task_struct *g, *p;
1606 unsigned int i;
1607
1608 info->size = 0;
1609 info->thread = NULL;
1610
1611 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1612 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1613
1614 if (psinfo == NULL)
1615 return 0;
1616
1617 /*
1618 * Figure out how many notes we're going to need for each thread.
1619 */
1620 info->thread_notes = 0;
1621 for (i = 0; i < view->n; ++i)
1622 if (view->regsets[i].core_note_type != 0)
1623 ++info->thread_notes;
1624
1625 /*
1626 * Sanity check. We rely on regset 0 being in NT_PRSTATUS,
1627 * since it is our one special case.
1628 */
1629 if (unlikely(info->thread_notes == 0) ||
1630 unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
1631 WARN_ON(1);
1632 return 0;
1633 }
1634
1635 /*
1636 * Initialize the ELF file header.
1637 */
1638 fill_elf_header(elf, phdrs,
1639 view->e_machine, view->e_flags, view->ei_osabi);
1640
1641 /*
1642 * Allocate a structure for each thread.
1643 */
1644 rcu_read_lock();
1645 do_each_thread(g, p)
1646 if (p->mm == dump_task->mm) {
1647 t = kzalloc(offsetof(struct elf_thread_core_info,
1648 notes[info->thread_notes]),
1649 GFP_ATOMIC);
1650 if (unlikely(!t)) {
1651 rcu_read_unlock();
1652 return 0;
1653 }
1654 t->task = p;
1655 if (p == dump_task || !info->thread) {
1656 t->next = info->thread;
1657 info->thread = t;
1658 } else {
1659 /*
1660 * Make sure to keep the original task at
1661 * the head of the list.
1662 */
1663 t->next = info->thread->next;
1664 info->thread->next = t;
1665 }
1666 }
1667 while_each_thread(g, p);
1668 rcu_read_unlock();
1669
1670 /*
1671 * Now fill in each thread's information.
1672 */
1673 for (t = info->thread; t != NULL; t = t->next)
1674 if (!fill_thread_core_info(t, view, signr, &info->size))
1675 return 0;
1676
1677 /*
1678 * Fill in the two process-wide notes.
1679 */
1680 fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
1681 info->size += notesize(&info->psinfo);
1682
1683 fill_auxv_note(&info->auxv, current->mm);
1684 info->size += notesize(&info->auxv);
1685
1686 return 1;
1687}
1688
1689static size_t get_note_info_size(struct elf_note_info *info)
1690{
1691 return info->size;
1692}
1693
1694/*
1695 * Write all the notes for each thread. When writing the first thread, the
1696 * process-wide notes are interleaved after the first thread-specific note.
1697 */
1698static int write_note_info(struct elf_note_info *info,
1699 struct file *file, loff_t *foffset)
1700{
1701 bool first = 1;
1702 struct elf_thread_core_info *t = info->thread;
1703
1704 do {
1705 int i;
1706
1707 if (!writenote(&t->notes[0], file, foffset))
1708 return 0;
1709
1710 if (first && !writenote(&info->psinfo, file, foffset))
1711 return 0;
1712 if (first && !writenote(&info->auxv, file, foffset))
1713 return 0;
1714
1715 for (i = 1; i < info->thread_notes; ++i)
1716 if (t->notes[i].data &&
1717 !writenote(&t->notes[i], file, foffset))
1718 return 0;
1719
1720 first = 0;
1721 t = t->next;
1722 } while (t);
1723
1724 return 1;
1725}
1726
1727static void free_note_info(struct elf_note_info *info)
1728{
1729 struct elf_thread_core_info *threads = info->thread;
1730 while (threads) {
1731 unsigned int i;
1732 struct elf_thread_core_info *t = threads;
1733 threads = t->next;
1734 WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
1735 for (i = 1; i < info->thread_notes; ++i)
1736 kfree(t->notes[i].data);
1737 kfree(t);
1738 }
1739 kfree(info->psinfo.data);
1740}
1741
1742#else
1743
1450/* Here is the structure in which status of each thread is captured. */ 1744/* Here is the structure in which status of each thread is captured. */
1451struct elf_thread_status 1745struct elf_thread_status
1452{ 1746{
@@ -1499,6 +1793,176 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1499 return sz; 1793 return sz;
1500} 1794}
1501 1795
1796struct elf_note_info {
1797 struct memelfnote *notes;
1798 struct elf_prstatus *prstatus; /* NT_PRSTATUS */
1799 struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
1800 struct list_head thread_list;
1801 elf_fpregset_t *fpu;
1802#ifdef ELF_CORE_COPY_XFPREGS
1803 elf_fpxregset_t *xfpu;
1804#endif
1805 int thread_status_size;
1806 int numnote;
1807};
1808
1809static int fill_note_info(struct elfhdr *elf, int phdrs,
1810 struct elf_note_info *info,
1811 long signr, struct pt_regs *regs)
1812{
1813#define NUM_NOTES 6
1814 struct list_head *t;
1815 struct task_struct *g, *p;
1816
1817 info->notes = NULL;
1818 info->prstatus = NULL;
1819 info->psinfo = NULL;
1820 info->fpu = NULL;
1821#ifdef ELF_CORE_COPY_XFPREGS
1822 info->xfpu = NULL;
1823#endif
1824 INIT_LIST_HEAD(&info->thread_list);
1825
1826 info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
1827 GFP_KERNEL);
1828 if (!info->notes)
1829 return 0;
1830 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
1831 if (!info->psinfo)
1832 return 0;
1833 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
1834 if (!info->prstatus)
1835 return 0;
1836 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
1837 if (!info->fpu)
1838 return 0;
1839#ifdef ELF_CORE_COPY_XFPREGS
1840 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
1841 if (!info->xfpu)
1842 return 0;
1843#endif
1844
1845 info->thread_status_size = 0;
1846 if (signr) {
1847 struct elf_thread_status *tmp;
1848 rcu_read_lock();
1849 do_each_thread(g, p)
1850 if (current->mm == p->mm && current != p) {
1851 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1852 if (!tmp) {
1853 rcu_read_unlock();
1854 return 0;
1855 }
1856 tmp->thread = p;
1857 list_add(&tmp->list, &info->thread_list);
1858 }
1859 while_each_thread(g, p);
1860 rcu_read_unlock();
1861 list_for_each(t, &info->thread_list) {
1862 struct elf_thread_status *tmp;
1863 int sz;
1864
1865 tmp = list_entry(t, struct elf_thread_status, list);
1866 sz = elf_dump_thread_status(signr, tmp);
1867 info->thread_status_size += sz;
1868 }
1869 }
1870 /* now collect the dump for the current */
1871 memset(info->prstatus, 0, sizeof(*info->prstatus));
1872 fill_prstatus(info->prstatus, current, signr);
1873 elf_core_copy_regs(&info->prstatus->pr_reg, regs);
1874
1875 /* Set up header */
1876 fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
1877
1878 /*
1879 * Set up the notes in similar form to SVR4 core dumps made
1880 * with info from their /proc.
1881 */
1882
1883 fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
1884 sizeof(*info->prstatus), info->prstatus);
1885 fill_psinfo(info->psinfo, current->group_leader, current->mm);
1886 fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
1887 sizeof(*info->psinfo), info->psinfo);
1888
1889 info->numnote = 2;
1890
1891 fill_auxv_note(&info->notes[info->numnote++], current->mm);
1892
1893 /* Try to dump the FPU. */
1894 info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
1895 info->fpu);
1896 if (info->prstatus->pr_fpvalid)
1897 fill_note(info->notes + info->numnote++,
1898 "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
1899#ifdef ELF_CORE_COPY_XFPREGS
1900 if (elf_core_copy_task_xfpregs(current, info->xfpu))
1901 fill_note(info->notes + info->numnote++,
1902 "LINUX", ELF_CORE_XFPREG_TYPE,
1903 sizeof(*info->xfpu), info->xfpu);
1904#endif
1905
1906 return 1;
1907
1908#undef NUM_NOTES
1909}
1910
1911static size_t get_note_info_size(struct elf_note_info *info)
1912{
1913 int sz = 0;
1914 int i;
1915
1916 for (i = 0; i < info->numnote; i++)
1917 sz += notesize(info->notes + i);
1918
1919 sz += info->thread_status_size;
1920
1921 return sz;
1922}
1923
1924static int write_note_info(struct elf_note_info *info,
1925 struct file *file, loff_t *foffset)
1926{
1927 int i;
1928 struct list_head *t;
1929
1930 for (i = 0; i < info->numnote; i++)
1931 if (!writenote(info->notes + i, file, foffset))
1932 return 0;
1933
1934 /* write out the thread status notes section */
1935 list_for_each(t, &info->thread_list) {
1936 struct elf_thread_status *tmp =
1937 list_entry(t, struct elf_thread_status, list);
1938
1939 for (i = 0; i < tmp->num_notes; i++)
1940 if (!writenote(&tmp->notes[i], file, foffset))
1941 return 0;
1942 }
1943
1944 return 1;
1945}
1946
1947static void free_note_info(struct elf_note_info *info)
1948{
1949 while (!list_empty(&info->thread_list)) {
1950 struct list_head *tmp = info->thread_list.next;
1951 list_del(tmp);
1952 kfree(list_entry(tmp, struct elf_thread_status, list));
1953 }
1954
1955 kfree(info->prstatus);
1956 kfree(info->psinfo);
1957 kfree(info->notes);
1958 kfree(info->fpu);
1959#ifdef ELF_CORE_COPY_XFPREGS
1960 kfree(info->xfpu);
1961#endif
1962}
1963
1964#endif
1965
1502static struct vm_area_struct *first_vma(struct task_struct *tsk, 1966static struct vm_area_struct *first_vma(struct task_struct *tsk,
1503 struct vm_area_struct *gate_vma) 1967 struct vm_area_struct *gate_vma)
1504{ 1968{
@@ -1534,29 +1998,15 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1534 */ 1998 */
1535static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 1999static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
1536{ 2000{
1537#define NUM_NOTES 6
1538 int has_dumped = 0; 2001 int has_dumped = 0;
1539 mm_segment_t fs; 2002 mm_segment_t fs;
1540 int segs; 2003 int segs;
1541 size_t size = 0; 2004 size_t size = 0;
1542 int i;
1543 struct vm_area_struct *vma, *gate_vma; 2005 struct vm_area_struct *vma, *gate_vma;
1544 struct elfhdr *elf = NULL; 2006 struct elfhdr *elf = NULL;
1545 loff_t offset = 0, dataoff, foffset; 2007 loff_t offset = 0, dataoff, foffset;
1546 int numnote;
1547 struct memelfnote *notes = NULL;
1548 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
1549 struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */
1550 struct task_struct *g, *p;
1551 LIST_HEAD(thread_list);
1552 struct list_head *t;
1553 elf_fpregset_t *fpu = NULL;
1554#ifdef ELF_CORE_COPY_XFPREGS
1555 elf_fpxregset_t *xfpu = NULL;
1556#endif
1557 int thread_status_size = 0;
1558 elf_addr_t *auxv;
1559 unsigned long mm_flags; 2008 unsigned long mm_flags;
2009 struct elf_note_info info;
1560 2010
1561 /* 2011 /*
1562 * We no longer stop all VM operations. 2012 * We no longer stop all VM operations.
@@ -1574,52 +2024,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1574 elf = kmalloc(sizeof(*elf), GFP_KERNEL); 2024 elf = kmalloc(sizeof(*elf), GFP_KERNEL);
1575 if (!elf) 2025 if (!elf)
1576 goto cleanup; 2026 goto cleanup;
1577 prstatus = kmalloc(sizeof(*prstatus), GFP_KERNEL);
1578 if (!prstatus)
1579 goto cleanup;
1580 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1581 if (!psinfo)
1582 goto cleanup;
1583 notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
1584 if (!notes)
1585 goto cleanup;
1586 fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
1587 if (!fpu)
1588 goto cleanup;
1589#ifdef ELF_CORE_COPY_XFPREGS
1590 xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
1591 if (!xfpu)
1592 goto cleanup;
1593#endif
1594
1595 if (signr) {
1596 struct elf_thread_status *tmp;
1597 rcu_read_lock();
1598 do_each_thread(g,p)
1599 if (current->mm == p->mm && current != p) {
1600 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1601 if (!tmp) {
1602 rcu_read_unlock();
1603 goto cleanup;
1604 }
1605 tmp->thread = p;
1606 list_add(&tmp->list, &thread_list);
1607 }
1608 while_each_thread(g,p);
1609 rcu_read_unlock();
1610 list_for_each(t, &thread_list) {
1611 struct elf_thread_status *tmp;
1612 int sz;
1613
1614 tmp = list_entry(t, struct elf_thread_status, list);
1615 sz = elf_dump_thread_status(signr, tmp);
1616 thread_status_size += sz;
1617 }
1618 }
1619 /* now collect the dump for the current */
1620 memset(prstatus, 0, sizeof(*prstatus));
1621 fill_prstatus(prstatus, current, signr);
1622 elf_core_copy_regs(&prstatus->pr_reg, regs);
1623 2027
1624 segs = current->mm->map_count; 2028 segs = current->mm->map_count;
1625#ifdef ELF_CORE_EXTRA_PHDRS 2029#ifdef ELF_CORE_EXTRA_PHDRS
@@ -1630,42 +2034,16 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1630 if (gate_vma != NULL) 2034 if (gate_vma != NULL)
1631 segs++; 2035 segs++;
1632 2036
1633 /* Set up header */
1634 fill_elf_header(elf, segs + 1); /* including notes section */
1635
1636 has_dumped = 1;
1637 current->flags |= PF_DUMPCORE;
1638
1639 /* 2037 /*
1640 * Set up the notes in similar form to SVR4 core dumps made 2038 * Collect all the non-memory information about the process for the
1641 * with info from their /proc. 2039 * notes. This also sets up the file header.
1642 */ 2040 */
2041 if (!fill_note_info(elf, segs + 1, /* including notes section */
2042 &info, signr, regs))
2043 goto cleanup;
1643 2044
1644 fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); 2045 has_dumped = 1;
1645 fill_psinfo(psinfo, current->group_leader, current->mm); 2046 current->flags |= PF_DUMPCORE;
1646 fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1647
1648 numnote = 2;
1649
1650 auxv = (elf_addr_t *)current->mm->saved_auxv;
1651
1652 i = 0;
1653 do
1654 i += 2;
1655 while (auxv[i - 2] != AT_NULL);
1656 fill_note(&notes[numnote++], "CORE", NT_AUXV,
1657 i * sizeof(elf_addr_t), auxv);
1658
1659 /* Try to dump the FPU. */
1660 if ((prstatus->pr_fpvalid =
1661 elf_core_copy_task_fpregs(current, regs, fpu)))
1662 fill_note(notes + numnote++,
1663 "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
1664#ifdef ELF_CORE_COPY_XFPREGS
1665 if (elf_core_copy_task_xfpregs(current, xfpu))
1666 fill_note(notes + numnote++,
1667 "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
1668#endif
1669 2047
1670 fs = get_fs(); 2048 fs = get_fs();
1671 set_fs(KERNEL_DS); 2049 set_fs(KERNEL_DS);
@@ -1678,12 +2056,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1678 /* Write notes phdr entry */ 2056 /* Write notes phdr entry */
1679 { 2057 {
1680 struct elf_phdr phdr; 2058 struct elf_phdr phdr;
1681 int sz = 0; 2059 size_t sz = get_note_info_size(&info);
1682
1683 for (i = 0; i < numnote; i++)
1684 sz += notesize(notes + i);
1685
1686 sz += thread_status_size;
1687 2060
1688 sz += elf_coredump_extra_notes_size(); 2061 sz += elf_coredump_extra_notes_size();
1689 2062
@@ -1728,23 +2101,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1728#endif 2101#endif
1729 2102
1730 /* write out the notes section */ 2103 /* write out the notes section */
1731 for (i = 0; i < numnote; i++) 2104 if (!write_note_info(&info, file, &foffset))
1732 if (!writenote(notes + i, file, &foffset)) 2105 goto end_coredump;
1733 goto end_coredump;
1734 2106
1735 if (elf_coredump_extra_notes_write(file, &foffset)) 2107 if (elf_coredump_extra_notes_write(file, &foffset))
1736 goto end_coredump; 2108 goto end_coredump;
1737 2109
1738 /* write out the thread status notes section */
1739 list_for_each(t, &thread_list) {
1740 struct elf_thread_status *tmp =
1741 list_entry(t, struct elf_thread_status, list);
1742
1743 for (i = 0; i < tmp->num_notes; i++)
1744 if (!writenote(&tmp->notes[i], file, &foffset))
1745 goto end_coredump;
1746 }
1747
1748 /* Align to page */ 2110 /* Align to page */
1749 DUMP_SEEK(dataoff - foffset); 2111 DUMP_SEEK(dataoff - foffset);
1750 2112
@@ -1795,22 +2157,9 @@ end_coredump:
1795 set_fs(fs); 2157 set_fs(fs);
1796 2158
1797cleanup: 2159cleanup:
1798 while (!list_empty(&thread_list)) {
1799 struct list_head *tmp = thread_list.next;
1800 list_del(tmp);
1801 kfree(list_entry(tmp, struct elf_thread_status, list));
1802 }
1803
1804 kfree(elf); 2160 kfree(elf);
1805 kfree(prstatus); 2161 free_note_info(&info);
1806 kfree(psinfo);
1807 kfree(notes);
1808 kfree(fpu);
1809#ifdef ELF_CORE_COPY_XFPREGS
1810 kfree(xfpu);
1811#endif
1812 return has_dumped; 2162 return has_dumped;
1813#undef NUM_NOTES
1814} 2163}
1815 2164
1816#endif /* USE_ELF_CORE_DUMP */ 2165#endif /* USE_ELF_CORE_DUMP */
diff --git a/fs/bio.c b/fs/bio.c
index d59ddbf79626..242e409dab4b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,11 +248,13 @@ inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
248 */ 248 */
249void __bio_clone(struct bio *bio, struct bio *bio_src) 249void __bio_clone(struct bio *bio, struct bio *bio_src)
250{ 250{
251 struct request_queue *q = bdev_get_queue(bio_src->bi_bdev);
252
253 memcpy(bio->bi_io_vec, bio_src->bi_io_vec, 251 memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
254 bio_src->bi_max_vecs * sizeof(struct bio_vec)); 252 bio_src->bi_max_vecs * sizeof(struct bio_vec));
255 253
254 /*
255 * most users will be overriding ->bi_bdev with a new target,
256 * so we don't set nor calculate new physical/hw segment counts here
257 */
256 bio->bi_sector = bio_src->bi_sector; 258 bio->bi_sector = bio_src->bi_sector;
257 bio->bi_bdev = bio_src->bi_bdev; 259 bio->bi_bdev = bio_src->bi_bdev;
258 bio->bi_flags |= 1 << BIO_CLONED; 260 bio->bi_flags |= 1 << BIO_CLONED;
@@ -260,8 +262,6 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
260 bio->bi_vcnt = bio_src->bi_vcnt; 262 bio->bi_vcnt = bio_src->bi_vcnt;
261 bio->bi_size = bio_src->bi_size; 263 bio->bi_size = bio_src->bi_size;
262 bio->bi_idx = bio_src->bi_idx; 264 bio->bi_idx = bio_src->bi_idx;
263 bio_phys_segments(q, bio);
264 bio_hw_segments(q, bio);
265} 265}
266 266
267/** 267/**
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c55221..e48a630ae266 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -738,9 +738,9 @@ EXPORT_SYMBOL(bd_release);
738static struct kobject *bdev_get_kobj(struct block_device *bdev) 738static struct kobject *bdev_get_kobj(struct block_device *bdev)
739{ 739{
740 if (bdev->bd_contains != bdev) 740 if (bdev->bd_contains != bdev)
741 return kobject_get(&bdev->bd_part->kobj); 741 return kobject_get(&bdev->bd_part->dev.kobj);
742 else 742 else
743 return kobject_get(&bdev->bd_disk->kobj); 743 return kobject_get(&bdev->bd_disk->dev.kobj);
744} 744}
745 745
746static struct kobject *bdev_get_holder(struct block_device *bdev) 746static struct kobject *bdev_get_holder(struct block_device *bdev)
@@ -1176,7 +1176,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
1176 ret = -ENXIO; 1176 ret = -ENXIO;
1177 goto out_first; 1177 goto out_first;
1178 } 1178 }
1179 kobject_get(&p->kobj); 1179 kobject_get(&p->dev.kobj);
1180 bdev->bd_part = p; 1180 bdev->bd_part = p;
1181 bd_set_size(bdev, (loff_t) p->nr_sects << 9); 1181 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1182 } 1182 }
@@ -1299,7 +1299,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1299 module_put(owner); 1299 module_put(owner);
1300 1300
1301 if (bdev->bd_contains != bdev) { 1301 if (bdev->bd_contains != bdev) {
1302 kobject_put(&bdev->bd_part->kobj); 1302 kobject_put(&bdev->bd_part->dev.kobj);
1303 bdev->bd_part = NULL; 1303 bdev->bd_part = NULL;
1304 } 1304 }
1305 bdev->bd_disk = NULL; 1305 bdev->bd_disk = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e014819e..456c9ab7705b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3213,6 +3213,50 @@ static int buffer_cpu_notify(struct notifier_block *self,
3213 return NOTIFY_OK; 3213 return NOTIFY_OK;
3214} 3214}
3215 3215
3216/**
3217 * bh_uptodate_or_lock: Test whether the buffer is uptodate
3218 * @bh: struct buffer_head
3219 *
3220 * Return true if the buffer is up-to-date and false,
3221 * with the buffer locked, if not.
3222 */
3223int bh_uptodate_or_lock(struct buffer_head *bh)
3224{
3225 if (!buffer_uptodate(bh)) {
3226 lock_buffer(bh);
3227 if (!buffer_uptodate(bh))
3228 return 0;
3229 unlock_buffer(bh);
3230 }
3231 return 1;
3232}
3233EXPORT_SYMBOL(bh_uptodate_or_lock);
3234
3235/**
3236 * bh_submit_read: Submit a locked buffer for reading
3237 * @bh: struct buffer_head
3238 *
3239 * Returns zero on success and -EIO on error.
3240 */
3241int bh_submit_read(struct buffer_head *bh)
3242{
3243 BUG_ON(!buffer_locked(bh));
3244
3245 if (buffer_uptodate(bh)) {
3246 unlock_buffer(bh);
3247 return 0;
3248 }
3249
3250 get_bh(bh);
3251 bh->b_end_io = end_buffer_read_sync;
3252 submit_bh(READ, bh);
3253 wait_on_buffer(bh);
3254 if (buffer_uptodate(bh))
3255 return 0;
3256 return -EIO;
3257}
3258EXPORT_SYMBOL(bh_submit_read);
3259
3216void __init buffer_init(void) 3260void __init buffer_init(void)
3217{ 3261{
3218 int nrpages; 3262 int nrpages;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c3bfa76765c4..2c7a8b5b4598 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -510,9 +510,8 @@ struct cdev *cdev_alloc(void)
510{ 510{
511 struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); 511 struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
512 if (p) { 512 if (p) {
513 p->kobj.ktype = &ktype_cdev_dynamic;
514 INIT_LIST_HEAD(&p->list); 513 INIT_LIST_HEAD(&p->list);
515 kobject_init(&p->kobj); 514 kobject_init(&p->kobj, &ktype_cdev_dynamic);
516 } 515 }
517 return p; 516 return p;
518} 517}
@@ -529,8 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops)
529{ 528{
530 memset(cdev, 0, sizeof *cdev); 529 memset(cdev, 0, sizeof *cdev);
531 INIT_LIST_HEAD(&cdev->list); 530 INIT_LIST_HEAD(&cdev->list);
532 cdev->kobj.ktype = &ktype_cdev_default; 531 kobject_init(&cdev->kobj, &ktype_cdev_default);
533 kobject_init(&cdev->kobj);
534 cdev->ops = fops; 532 cdev->ops = fops;
535} 533}
536 534
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index a609599287aa..edd248367b36 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.52
3Fix oops on second mount to server when null auth is used. 3Fix oops on second mount to server when null auth is used.
4Enable experimental Kerberos support. Return writebehind errors on flush 4Enable experimental Kerberos support. Return writebehind errors on flush
5and sync so that events like out of disk space get reported properly on 5and sync so that events like out of disk space get reported properly on
6cached files. 6cached files. Fix setxattr failure to certain Samba versions. Fix mount
7of second share to disconnected server session (autoreconnect on this).
8Add ability to modify cifs acls for handling chmod (when mounted with
9cifsacl flag).
7 10
8Version 1.51 11Version 1.51
9------------ 12------------
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 45e42fb97c19..6ba43fb346fb 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -9,3 +9,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o cifsacl.o
10 10
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12
13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
diff --git a/fs/cifs/README b/fs/cifs/README
index bf11329ac784..c623e2f9c5db 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -56,7 +56,8 @@ the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
56similar files reside (usually /sbin). Although the helper software is not 56similar files reside (usually /sbin). Although the helper software is not
57required, mount.cifs is recommended. Eventually the Samba 3.0 utility program 57required, mount.cifs is recommended. Eventually the Samba 3.0 utility program
58"net" may also be helpful since it may someday provide easier mount syntax for 58"net" may also be helpful since it may someday provide easier mount syntax for
59users who are used to Windows e.g. net use <mount point> <UNC name or cifs URL> 59users who are used to Windows e.g.
60 net use <mount point> <UNC name or cifs URL>
60Note that running the Winbind pam/nss module (logon service) on all of your 61Note that running the Winbind pam/nss module (logon service) on all of your
61Linux clients is useful in mapping Uids and Gids consistently across the 62Linux clients is useful in mapping Uids and Gids consistently across the
62domain to the proper network user. The mount.cifs mount helper can be 63domain to the proper network user. The mount.cifs mount helper can be
@@ -248,7 +249,7 @@ A partial list of the supported mount options follows:
248 the CIFS session. 249 the CIFS session.
249 password The user password. If the mount helper is 250 password The user password. If the mount helper is
250 installed, the user will be prompted for password 251 installed, the user will be prompted for password
251 if it is not supplied. 252 if not supplied.
252 ip The ip address of the target server 253 ip The ip address of the target server
253 unc The target server Universal Network Name (export) to 254 unc The target server Universal Network Name (export) to
254 mount. 255 mount.
@@ -283,7 +284,7 @@ A partial list of the supported mount options follows:
283 can be enabled by specifying file_mode and dir_mode on 284 can be enabled by specifying file_mode and dir_mode on
284 the client. Note that the mount.cifs helper must be 285 the client. Note that the mount.cifs helper must be
285 at version 1.10 or higher to support specifying the uid 286 at version 1.10 or higher to support specifying the uid
286 (or gid) in non-numberic form. 287 (or gid) in non-numeric form.
287 gid Set the default gid for inodes (similar to above). 288 gid Set the default gid for inodes (similar to above).
288 file_mode If CIFS Unix extensions are not supported by the server 289 file_mode If CIFS Unix extensions are not supported by the server
289 this overrides the default mode for file inodes. 290 this overrides the default mode for file inodes.
@@ -417,9 +418,10 @@ A partial list of the supported mount options follows:
417 acl Allow setfacl and getfacl to manage posix ACLs if server 418 acl Allow setfacl and getfacl to manage posix ACLs if server
418 supports them. (default) 419 supports them. (default)
419 noacl Do not allow setfacl and getfacl calls on this mount 420 noacl Do not allow setfacl and getfacl calls on this mount
420 user_xattr Allow getting and setting user xattrs as OS/2 EAs (extended 421 user_xattr Allow getting and setting user xattrs (those attributes whose
421 attributes) to the server (default) e.g. via setfattr 422 name begins with "user." or "os2.") as OS/2 EAs (extended
422 and getfattr utilities. 423 attributes) to the server. This allows support of the
424 setfattr and getfattr utilities. (default)
423 nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs 425 nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs
424 mapchars Translate six of the seven reserved characters (not backslash) 426 mapchars Translate six of the seven reserved characters (not backslash)
425 *?<>|: 427 *?<>|:
@@ -434,6 +436,7 @@ A partial list of the supported mount options follows:
434 nomapchars Do not translate any of these seven characters (default). 436 nomapchars Do not translate any of these seven characters (default).
435 nocase Request case insensitive path name matching (case 437 nocase Request case insensitive path name matching (case
436 sensitive is the default if the server suports it). 438 sensitive is the default if the server suports it).
439 (mount option "ignorecase" is identical to "nocase")
437 posixpaths If CIFS Unix extensions are supported, attempt to 440 posixpaths If CIFS Unix extensions are supported, attempt to
438 negotiate posix path name support which allows certain 441 negotiate posix path name support which allows certain
439 characters forbidden in typical CIFS filenames, without 442 characters forbidden in typical CIFS filenames, without
@@ -485,6 +488,9 @@ A partial list of the supported mount options follows:
485 ntlmv2i Use NTLMv2 password hashing with packet signing 488 ntlmv2i Use NTLMv2 password hashing with packet signing
486 lanman (if configured in kernel config) use older 489 lanman (if configured in kernel config) use older
487 lanman hash 490 lanman hash
491hard Retry file operations if server is not responding
492soft Limit retries to unresponsive servers (usually only
493 one retry) before returning an error. (default)
488 494
489The mount.cifs mount helper also accepts a few mount options before -o 495The mount.cifs mount helper also accepts a few mount options before -o
490including: 496including:
@@ -535,8 +541,8 @@ SecurityFlags Flags which control security negotiation and
535 must use NTLM 0x02002 541 must use NTLM 0x02002
536 may use NTLMv2 0x00004 542 may use NTLMv2 0x00004
537 must use NTLMv2 0x04004 543 must use NTLMv2 0x04004
538 may use Kerberos security (not implemented yet) 0x00008 544 may use Kerberos security 0x00008
539 must use Kerberos (not implemented yet) 0x08008 545 must use Kerberos 0x08008
540 may use lanman (weak) password hash 0x00010 546 may use lanman (weak) password hash 0x00010
541 must use lanman password hash 0x10010 547 must use lanman password hash 0x10010
542 may use plaintext passwords 0x00020 548 may use plaintext passwords 0x00020
@@ -626,6 +632,6 @@ returned success.
626 632
627Also note that "cat /proc/fs/cifs/DebugData" will display information about 633Also note that "cat /proc/fs/cifs/DebugData" will display information about
628the active sessions and the shares that are mounted. 634the active sessions and the shares that are mounted.
629Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is enabled 635Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is
630but requires a user space helper (from the Samba project). NTLM and NTLMv2 and 636on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and
631LANMAN support do not require this helpr. 637LANMAN support do not require this helper.
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index a8852c200728..92c9feac440f 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
1Version 1.49 April 26, 2007 1Version 1.52 January 3, 2008
2 2
3A Partial List of Missing Features 3A Partial List of Missing Features
4================================== 4==================================
@@ -16,16 +16,14 @@ SecurityDescriptors
16c) Better pam/winbind integration (e.g. to handle uid mapping 16c) Better pam/winbind integration (e.g. to handle uid mapping
17better) 17better)
18 18
19d) Verify that Kerberos signing works 19d) Cleanup now unneeded SessSetup code in
20
21e) Cleanup now unneeded SessSetup code in
22fs/cifs/connect.c and add back in NTLMSSP code if any servers 20fs/cifs/connect.c and add back in NTLMSSP code if any servers
23need it 21need it
24 22
25f) MD5-HMAC signing SMB PDUs when SPNEGO style SessionSetup 23e) ms-dfs and ms-dfs host name resolution cleanup
26used (Kerberos or NTLMSSP). Signing alreadyimplemented for NTLM 24
27and raw NTLMSSP already. This is important when enabling 25f) fix NTLMv2 signing when two mounts with different users to same
28extended security and mounting to Windows 2003 Servers 26server.
29 27
30g) Directory entry caching relies on a 1 second timer, rather than 28g) Directory entry caching relies on a 1 second timer, rather than
31using FindNotify or equivalent. - (started) 29using FindNotify or equivalent. - (started)
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
new file mode 100644
index 000000000000..413ee2349d1a
--- /dev/null
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -0,0 +1,377 @@
1/*
2 * Contains the CIFS DFS referral mounting routines used for handling
3 * traversal via DFS junction point
4 *
5 * Copyright (c) 2007 Igor Mammedov
6 * Copyright (C) International Business Machines Corp., 2008
7 * Author(s): Igor Mammedov (niallain@gmail.com)
8 * Steve French (sfrench@us.ibm.com)
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/dcache.h>
16#include <linux/mount.h>
17#include <linux/namei.h>
18#include <linux/vfs.h>
19#include <linux/fs.h>
20#include "cifsglob.h"
21#include "cifsproto.h"
22#include "cifsfs.h"
23#include "dns_resolve.h"
24#include "cifs_debug.h"
25
26LIST_HEAD(cifs_dfs_automount_list);
27
28/*
29 * DFS functions
30*/
31
32void dfs_shrink_umount_helper(struct vfsmount *vfsmnt)
33{
34 mark_mounts_for_expiry(&cifs_dfs_automount_list);
35 mark_mounts_for_expiry(&cifs_dfs_automount_list);
36 shrink_submounts(vfsmnt, &cifs_dfs_automount_list);
37}
38
39/**
40 * cifs_get_share_name - extracts share name from UNC
41 * @node_name: pointer to UNC string
42 *
43 * Extracts sharename form full UNC.
44 * i.e. strips from UNC trailing path that is not part of share
45 * name and fixup missing '\' in the begining of DFS node refferal
46 * if neccessary.
47 * Returns pointer to share name on success or NULL on error.
48 * Caller is responsible for freeing returned string.
49 */
50static char *cifs_get_share_name(const char *node_name)
51{
52 int len;
53 char *UNC;
54 char *pSep;
55
56 len = strlen(node_name);
57 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
58 GFP_KERNEL);
59 if (!UNC)
60 return NULL;
61
62 /* get share name and server name */
63 if (node_name[1] != '\\') {
64 UNC[0] = '\\';
65 strncpy(UNC+1, node_name, len);
66 len++;
67 UNC[len] = 0;
68 } else {
69 strncpy(UNC, node_name, len);
70 UNC[len] = 0;
71 }
72
73 /* find server name end */
74 pSep = memchr(UNC+2, '\\', len-2);
75 if (!pSep) {
76 cERROR(1, ("%s: no server name end in node name: %s",
77 __FUNCTION__, node_name));
78 kfree(UNC);
79 return NULL;
80 }
81
82 /* find sharename end */
83 pSep++;
84 pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
85 if (!pSep) {
86 cERROR(1, ("%s:2 cant find share name in node name: %s",
87 __FUNCTION__, node_name));
88 kfree(UNC);
89 return NULL;
90 }
91 /* trim path up to sharename end
92 * * now we have share name in UNC */
93 *pSep = 0;
94
95 return UNC;
96}
97
98
99/**
100 * compose_mount_options - creates mount options for refferral
101 * @sb_mountdata: parent/root DFS mount options (template)
102 * @ref_unc: refferral server UNC
103 * @devname: pointer for saving device name
104 *
105 * creates mount options for submount based on template options sb_mountdata
106 * and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
107 *
108 * Returns: pointer to new mount options or ERR_PTR.
109 * Caller is responcible for freeing retunrned value if it is not error.
110 */
111static char *compose_mount_options(const char *sb_mountdata,
112 const char *ref_unc,
113 char **devname)
114{
115 int rc;
116 char *mountdata;
117 int md_len;
118 char *tkn_e;
119 char *srvIP = NULL;
120 char sep = ',';
121 int off, noff;
122
123 if (sb_mountdata == NULL)
124 return ERR_PTR(-EINVAL);
125
126 *devname = cifs_get_share_name(ref_unc);
127 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
128 if (rc != 0) {
129 cERROR(1, ("%s: Failed to resolve server part of %s to IP",
130 __FUNCTION__, *devname));
131 mountdata = ERR_PTR(rc);
132 goto compose_mount_options_out;
133 }
134 md_len = strlen(sb_mountdata) + strlen(srvIP) + strlen(ref_unc) + 3;
135 mountdata = kzalloc(md_len+1, GFP_KERNEL);
136 if (mountdata == NULL) {
137 mountdata = ERR_PTR(-ENOMEM);
138 goto compose_mount_options_out;
139 }
140
141 /* copy all options except of unc,ip,prefixpath */
142 off = 0;
143 if (strncmp(sb_mountdata, "sep=", 4) == 0) {
144 sep = sb_mountdata[4];
145 strncpy(mountdata, sb_mountdata, 5);
146 off += 5;
147 }
148 while ((tkn_e = strchr(sb_mountdata+off, sep))) {
149 noff = (tkn_e - (sb_mountdata+off)) + 1;
150 if (strnicmp(sb_mountdata+off, "unc=", 4) == 0) {
151 off += noff;
152 continue;
153 }
154 if (strnicmp(sb_mountdata+off, "ip=", 3) == 0) {
155 off += noff;
156 continue;
157 }
158 if (strnicmp(sb_mountdata+off, "prefixpath=", 3) == 0) {
159 off += noff;
160 continue;
161 }
162 strncat(mountdata, sb_mountdata+off, noff);
163 off += noff;
164 }
165 strcat(mountdata, sb_mountdata+off);
166 mountdata[md_len] = '\0';
167
168 /* copy new IP and ref share name */
169 strcat(mountdata, ",ip=");
170 strcat(mountdata, srvIP);
171 strcat(mountdata, ",unc=");
172 strcat(mountdata, *devname);
173
174 /* find & copy prefixpath */
175 tkn_e = strchr(ref_unc+2, '\\');
176 if (tkn_e) {
177 tkn_e = strchr(tkn_e+1, '\\');
178 if (tkn_e) {
179 strcat(mountdata, ",prefixpath=");
180 strcat(mountdata, tkn_e);
181 }
182 }
183
184 /*cFYI(1,("%s: parent mountdata: %s", __FUNCTION__,sb_mountdata));*/
185 /*cFYI(1, ("%s: submount mountdata: %s", __FUNCTION__, mountdata ));*/
186
187compose_mount_options_out:
188 kfree(srvIP);
189 return mountdata;
190}
191
192
193static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
194 struct dentry *dentry, char *ref_unc)
195{
196 struct cifs_sb_info *cifs_sb;
197 struct vfsmount *mnt;
198 char *mountdata;
199 char *devname = NULL;
200
201 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
202 mountdata = compose_mount_options(cifs_sb->mountdata,
203 ref_unc, &devname);
204
205 if (IS_ERR(mountdata))
206 return (struct vfsmount *)mountdata;
207
208 mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
209 kfree(mountdata);
210 kfree(devname);
211 return mnt;
212
213}
214
215static char *build_full_dfs_path_from_dentry(struct dentry *dentry)
216{
217 char *full_path = NULL;
218 char *search_path;
219 char *tmp_path;
220 size_t l_max_len;
221 struct cifs_sb_info *cifs_sb;
222
223 if (dentry->d_inode == NULL)
224 return NULL;
225
226 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
227
228 if (cifs_sb->tcon == NULL)
229 return NULL;
230
231 search_path = build_path_from_dentry(dentry);
232 if (search_path == NULL)
233 return NULL;
234
235 if (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS) {
236 /* we should use full path name to correct working with DFS */
237 l_max_len = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE+1) +
238 strnlen(search_path, MAX_PATHCONF) + 1;
239 tmp_path = kmalloc(l_max_len, GFP_KERNEL);
240 if (tmp_path == NULL) {
241 kfree(search_path);
242 return NULL;
243 }
244 strncpy(tmp_path, cifs_sb->tcon->treeName, l_max_len);
245 strcat(tmp_path, search_path);
246 tmp_path[l_max_len-1] = 0;
247 full_path = tmp_path;
248 kfree(search_path);
249 } else {
250 full_path = search_path;
251 }
252 return full_path;
253}
254
255static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
256 struct list_head *mntlist)
257{
258 /* stolen from afs code */
259 int err;
260
261 mntget(newmnt);
262 err = do_add_mount(newmnt, nd, nd->mnt->mnt_flags, mntlist);
263 switch (err) {
264 case 0:
265 dput(nd->dentry);
266 mntput(nd->mnt);
267 nd->mnt = newmnt;
268 nd->dentry = dget(newmnt->mnt_root);
269 break;
270 case -EBUSY:
271 /* someone else made a mount here whilst we were busy */
272 while (d_mountpoint(nd->dentry) &&
273 follow_down(&nd->mnt, &nd->dentry))
274 ;
275 err = 0;
276 default:
277 mntput(newmnt);
278 break;
279 }
280 return err;
281}
282
283static void dump_referral(const struct dfs_info3_param *ref)
284{
285 cFYI(1, ("DFS: ref path: %s", ref->path_name));
286 cFYI(1, ("DFS: node path: %s", ref->node_name));
287 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type));
288 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
289 ref->PathConsumed));
290}
291
292
293static void*
294cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
295{
296 struct dfs_info3_param *referrals = NULL;
297 unsigned int num_referrals = 0;
298 struct cifs_sb_info *cifs_sb;
299 struct cifsSesInfo *ses;
300 char *full_path = NULL;
301 int xid, i;
302 int rc = 0;
303 struct vfsmount *mnt = ERR_PTR(-ENOENT);
304
305 cFYI(1, ("in %s", __FUNCTION__));
306 BUG_ON(IS_ROOT(dentry));
307
308 xid = GetXid();
309
310 dput(nd->dentry);
311 nd->dentry = dget(dentry);
312
313 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
314 ses = cifs_sb->tcon->ses;
315
316 if (!ses) {
317 rc = -EINVAL;
318 goto out_err;
319 }
320
321 full_path = build_full_dfs_path_from_dentry(dentry);
322 if (full_path == NULL) {
323 rc = -ENOMEM;
324 goto out_err;
325 }
326
327 rc = get_dfs_path(xid, ses , full_path, cifs_sb->local_nls,
328 &num_referrals, &referrals,
329 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
330
331 for (i = 0; i < num_referrals; i++) {
332 dump_referral(referrals+i);
333 /* connect to a storage node */
334 if (referrals[i].flags & DFSREF_STORAGE_SERVER) {
335 int len;
336 len = strlen(referrals[i].node_name);
337 if (len < 2) {
338 cERROR(1, ("%s: Net Address path too short: %s",
339 __FUNCTION__, referrals[i].node_name));
340 rc = -EINVAL;
341 goto out_err;
342 }
343 mnt = cifs_dfs_do_refmount(nd->mnt, nd->dentry,
344 referrals[i].node_name);
345 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
346 __FUNCTION__,
347 referrals[i].node_name, mnt));
348
349 /* complete mount procedure if we accured submount */
350 if (!IS_ERR(mnt))
351 break;
352 }
353 }
354
355 /* we need it cause for() above could exit without valid submount */
356 rc = PTR_ERR(mnt);
357 if (IS_ERR(mnt))
358 goto out_err;
359
360 nd->mnt->mnt_flags |= MNT_SHRINKABLE;
361 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
362
363out:
364 FreeXid(xid);
365 free_dfs_info_array(referrals, num_referrals);
366 kfree(full_path);
367 cFYI(1, ("leaving %s" , __FUNCTION__));
368 return ERR_PTR(rc);
369out_err:
370 path_release(nd);
371 goto out;
372}
373
374struct inode_operations cifs_dfs_referral_inode_operations = {
375 .follow_link = cifs_dfs_follow_mountpoint,
376};
377
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 34af556cdd8d..8ad2330ba061 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,9 @@ struct cifs_sb_info {
43 mode_t mnt_dir_mode; 43 mode_t mnt_dir_mode;
44 int mnt_cifs_flags; 44 int mnt_cifs_flags;
45 int prepathlen; 45 int prepathlen;
46 char *prepath; 46 char *prepath; /* relative path under the share to mount to */
47#ifdef CONFIG_CIFS_DFS_UPCALL
48 char *mountdata; /* mount options received at mount time */
49#endif
47}; 50};
48#endif /* _CIFS_FS_SB_H */ 51#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 1529d2b12e9c..d543accc10dd 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -122,11 +122,13 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
122 cFYI(1, ("key description = %s", description)); 122 cFYI(1, ("key description = %s", description));
123 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 123 spnego_key = request_key(&cifs_spnego_key_type, description, "");
124 124
125#ifdef CONFIG_CIFS_DEBUG2
125 if (cifsFYI && !IS_ERR(spnego_key)) { 126 if (cifsFYI && !IS_ERR(spnego_key)) {
126 struct cifs_spnego_msg *msg = spnego_key->payload.data; 127 struct cifs_spnego_msg *msg = spnego_key->payload.data;
127 cifs_dump_mem("SPNEGO reply blob:", msg->data, 128 cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024,
128 msg->secblob_len + msg->sesskey_len); 129 msg->secblob_len + msg->sesskey_len));
129 } 130 }
131#endif /* CONFIG_CIFS_DEBUG2 */
130 132
131out: 133out:
132 kfree(description); 134 kfree(description);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c312adcba4fc..a7035bd18e4e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -129,6 +129,54 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
129 return (1); /* sids compare/match */ 129 return (1); /* sids compare/match */
130} 130}
131 131
132
133/* copy ntsd, owner sid, and group sid from a security descriptor to another */
134static void copy_sec_desc(const struct cifs_ntsd *pntsd,
135 struct cifs_ntsd *pnntsd, __u32 sidsoffset)
136{
137 int i;
138
139 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
140 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
141
142 /* copy security descriptor control portion */
143 pnntsd->revision = pntsd->revision;
144 pnntsd->type = pntsd->type;
145 pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd));
146 pnntsd->sacloffset = 0;
147 pnntsd->osidoffset = cpu_to_le32(sidsoffset);
148 pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid));
149
150 /* copy owner sid */
151 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
152 le32_to_cpu(pntsd->osidoffset));
153 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
154
155 nowner_sid_ptr->revision = owner_sid_ptr->revision;
156 nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
157 for (i = 0; i < 6; i++)
158 nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
159 for (i = 0; i < 5; i++)
160 nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
161
162 /* copy group sid */
163 group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
164 le32_to_cpu(pntsd->gsidoffset));
165 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
166 sizeof(struct cifs_sid));
167
168 ngroup_sid_ptr->revision = group_sid_ptr->revision;
169 ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
170 for (i = 0; i < 6; i++)
171 ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
172 for (i = 0; i < 5; i++)
173 ngroup_sid_ptr->sub_auth[i] =
174 cpu_to_le32(group_sid_ptr->sub_auth[i]);
175
176 return;
177}
178
179
132/* 180/*
133 change posix mode to reflect permissions 181 change posix mode to reflect permissions
134 pmode is the existing mode (we only want to overwrite part of this 182 pmode is the existing mode (we only want to overwrite part of this
@@ -220,6 +268,33 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
220 return; 268 return;
221} 269}
222 270
271static __le16 fill_ace_for_sid(struct cifs_ace *pntace,
272 const struct cifs_sid *psid, __u64 nmode, umode_t bits)
273{
274 int i;
275 __u16 size = 0;
276 __u32 access_req = 0;
277
278 pntace->type = ACCESS_ALLOWED;
279 pntace->flags = 0x0;
280 mode_to_access_flags(nmode, bits, &access_req);
281 if (!access_req)
282 access_req = SET_MINIMUM_RIGHTS;
283 pntace->access_req = cpu_to_le32(access_req);
284
285 pntace->sid.revision = psid->revision;
286 pntace->sid.num_subauth = psid->num_subauth;
287 for (i = 0; i < 6; i++)
288 pntace->sid.authority[i] = psid->authority[i];
289 for (i = 0; i < psid->num_subauth; i++)
290 pntace->sid.sub_auth[i] = psid->sub_auth[i];
291
292 size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
293 pntace->size = cpu_to_le16(size);
294
295 return (size);
296}
297
223 298
224#ifdef CONFIG_CIFS_DEBUG2 299#ifdef CONFIG_CIFS_DEBUG2
225static void dump_ace(struct cifs_ace *pace, char *end_of_acl) 300static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
@@ -243,7 +318,7 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
243 int i; 318 int i;
244 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 319 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
245 pace->sid.revision, pace->sid.num_subauth, pace->type, 320 pace->sid.revision, pace->sid.num_subauth, pace->type,
246 pace->flags, pace->size)); 321 pace->flags, le16_to_cpu(pace->size)));
247 for (i = 0; i < num_subauth; ++i) { 322 for (i = 0; i < num_subauth; ++i) {
248 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 323 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i,
249 le32_to_cpu(pace->sid.sub_auth[i]))); 324 le32_to_cpu(pace->sid.sub_auth[i])));
@@ -346,6 +421,28 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
346} 421}
347 422
348 423
424static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
425 struct cifs_sid *pgrpsid, __u64 nmode)
426{
427 __le16 size = 0;
428 struct cifs_acl *pnndacl;
429
430 pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
431
432 size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size),
433 pownersid, nmode, S_IRWXU);
434 size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
435 pgrpsid, nmode, S_IRWXG);
436 size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
437 &sid_everyone, nmode, S_IRWXO);
438
439 pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
440 pndacl->num_aces = 3;
441
442 return (0);
443}
444
445
349static int parse_sid(struct cifs_sid *psid, char *end_of_acl) 446static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
350{ 447{
351 /* BB need to add parm so we can store the SID BB */ 448 /* BB need to add parm so we can store the SID BB */
@@ -432,6 +529,46 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
432} 529}
433 530
434 531
532/* Convert permission bits from mode to equivalent CIFS ACL */
533static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
534 int acl_len, struct inode *inode, __u64 nmode)
535{
536 int rc = 0;
537 __u32 dacloffset;
538 __u32 ndacloffset;
539 __u32 sidsoffset;
540 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
541 struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
542 struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
543
544 if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
545 return (-EIO);
546
547 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
548 le32_to_cpu(pntsd->osidoffset));
549 group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
550 le32_to_cpu(pntsd->gsidoffset));
551
552 dacloffset = le32_to_cpu(pntsd->dacloffset);
553 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
554
555 ndacloffset = sizeof(struct cifs_ntsd);
556 ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
557 ndacl_ptr->revision = dacl_ptr->revision;
558 ndacl_ptr->size = 0;
559 ndacl_ptr->num_aces = 0;
560
561 rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode);
562
563 sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
564
565 /* copy security descriptor control portion and owner and group sid */
566 copy_sec_desc(pntsd, pnntsd, sidsoffset);
567
568 return (rc);
569}
570
571
435/* Retrieve an ACL from the server */ 572/* Retrieve an ACL from the server */
436static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, 573static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
437 const char *path) 574 const char *path)
@@ -487,6 +624,64 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
487 return pntsd; 624 return pntsd;
488} 625}
489 626
627/* Set an ACL on the server */
628static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
629 struct inode *inode, const char *path)
630{
631 struct cifsFileInfo *open_file;
632 int unlock_file = FALSE;
633 int xid;
634 int rc = -EIO;
635 __u16 fid;
636 struct super_block *sb;
637 struct cifs_sb_info *cifs_sb;
638
639#ifdef CONFIG_CIFS_DEBUG2
640 cFYI(1, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
641#endif
642
643 if (!inode)
644 return (rc);
645
646 sb = inode->i_sb;
647 if (sb == NULL)
648 return (rc);
649
650 cifs_sb = CIFS_SB(sb);
651 xid = GetXid();
652
653 open_file = find_readable_file(CIFS_I(inode));
654 if (open_file) {
655 unlock_file = TRUE;
656 fid = open_file->netfid;
657 } else {
658 int oplock = FALSE;
659 /* open file */
660 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
661 WRITE_DAC, 0, &fid, &oplock, NULL,
662 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
663 CIFS_MOUNT_MAP_SPECIAL_CHR);
664 if (rc != 0) {
665 cERROR(1, ("Unable to open file to set ACL"));
666 FreeXid(xid);
667 return (rc);
668 }
669 }
670
671 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
672#ifdef CONFIG_CIFS_DEBUG2
673 cFYI(1, ("SetCIFSACL rc = %d", rc));
674#endif
675 if (unlock_file == TRUE)
676 atomic_dec(&open_file->wrtPending);
677 else
678 CIFSSMBClose(xid, cifs_sb->tcon, fid);
679
680 FreeXid(xid);
681
682 return (rc);
683}
684
490/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ 685/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
491void acl_to_uid_mode(struct inode *inode, const char *path) 686void acl_to_uid_mode(struct inode *inode, const char *path)
492{ 687{
@@ -510,24 +705,53 @@ void acl_to_uid_mode(struct inode *inode, const char *path)
510} 705}
511 706
512/* Convert mode bits to an ACL so we can update the ACL on the server */ 707/* Convert mode bits to an ACL so we can update the ACL on the server */
513int mode_to_acl(struct inode *inode, const char *path) 708int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
514{ 709{
515 int rc = 0; 710 int rc = 0;
516 __u32 acllen = 0; 711 __u32 acllen = 0;
517 struct cifs_ntsd *pntsd = NULL; 712 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
713 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
518 714
715#ifdef CONFIG_CIFS_DEBUG2
519 cFYI(1, ("set ACL from mode for %s", path)); 716 cFYI(1, ("set ACL from mode for %s", path));
717#endif
520 718
521 /* Get the security descriptor */ 719 /* Get the security descriptor */
522 pntsd = get_cifs_acl(&acllen, inode, path); 720 pntsd = get_cifs_acl(&acllen, inode, path);
523 721
524 /* Add/Modify the three ACEs for owner, group, everyone 722 /* Add three ACEs for owner, group, everyone getting rid of
525 while retaining the other ACEs */ 723 other ACEs as chmod disables ACEs and set the security descriptor */
526 724
527 /* Set the security descriptor */ 725 if (pntsd) {
726 /* allocate memory for the smb header,
727 set security descriptor request security descriptor
728 parameters, and secuirty descriptor itself */
528 729
730 pnntsd = kmalloc(acllen, GFP_KERNEL);
731 if (!pnntsd) {
732 cERROR(1, ("Unable to allocate security descriptor"));
733 kfree(pntsd);
734 return (-ENOMEM);
735 }
529 736
530 kfree(pntsd); 737 rc = build_sec_desc(pntsd, pnntsd, acllen, inode, nmode);
531 return rc; 738
739#ifdef CONFIG_CIFS_DEBUG2
740 cFYI(1, ("build_sec_desc rc: %d", rc));
741#endif
742
743 if (!rc) {
744 /* Set the security descriptor */
745 rc = set_cifs_acl(pnntsd, acllen, inode, path);
746#ifdef CONFIG_CIFS_DEBUG2
747 cFYI(1, ("set_cifs_acl rc: %d", rc));
748#endif
749 }
750
751 kfree(pnntsd);
752 kfree(pntsd);
753 }
754
755 return (rc);
532} 756}
533#endif /* CONFIG_CIFS_EXPERIMENTAL */ 757#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 093beaa3900d..e9f4ec701092 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -44,6 +44,7 @@
44#include "cifs_fs_sb.h" 44#include "cifs_fs_sb.h"
45#include <linux/mm.h> 45#include <linux/mm.h>
46#include <linux/key-type.h> 46#include <linux/key-type.h>
47#include "dns_resolve.h"
47#include "cifs_spnego.h" 48#include "cifs_spnego.h"
48#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 49#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
49 50
@@ -96,6 +97,9 @@ cifs_read_super(struct super_block *sb, void *data,
96{ 97{
97 struct inode *inode; 98 struct inode *inode;
98 struct cifs_sb_info *cifs_sb; 99 struct cifs_sb_info *cifs_sb;
100#ifdef CONFIG_CIFS_DFS_UPCALL
101 int len;
102#endif
99 int rc = 0; 103 int rc = 0;
100 104
101 /* BB should we make this contingent on mount parm? */ 105 /* BB should we make this contingent on mount parm? */
@@ -105,6 +109,25 @@ cifs_read_super(struct super_block *sb, void *data,
105 if (cifs_sb == NULL) 109 if (cifs_sb == NULL)
106 return -ENOMEM; 110 return -ENOMEM;
107 111
112#ifdef CONFIG_CIFS_DFS_UPCALL
113 /* copy mount params to sb for use in submounts */
114 /* BB: should we move this after the mount so we
115 * do not have to do the copy on failed mounts?
116 * BB: May be it is better to do simple copy before
117 * complex operation (mount), and in case of fail
118 * just exit instead of doing mount and attempting
119 * undo it if this copy fails?*/
120 len = strlen(data);
121 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
122 if (cifs_sb->mountdata == NULL) {
123 kfree(sb->s_fs_info);
124 sb->s_fs_info = NULL;
125 return -ENOMEM;
126 }
127 strncpy(cifs_sb->mountdata, data, len + 1);
128 cifs_sb->mountdata[len] = '\0';
129#endif
130
108 rc = cifs_mount(sb, cifs_sb, data, devname); 131 rc = cifs_mount(sb, cifs_sb, data, devname);
109 132
110 if (rc) { 133 if (rc) {
@@ -154,6 +177,12 @@ out_no_root:
154 177
155out_mount_failed: 178out_mount_failed:
156 if (cifs_sb) { 179 if (cifs_sb) {
180#ifdef CONFIG_CIFS_DFS_UPCALL
181 if (cifs_sb->mountdata) {
182 kfree(cifs_sb->mountdata);
183 cifs_sb->mountdata = NULL;
184 }
185#endif
157 if (cifs_sb->local_nls) 186 if (cifs_sb->local_nls)
158 unload_nls(cifs_sb->local_nls); 187 unload_nls(cifs_sb->local_nls);
159 kfree(cifs_sb); 188 kfree(cifs_sb);
@@ -177,6 +206,13 @@ cifs_put_super(struct super_block *sb)
177 if (rc) { 206 if (rc) {
178 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, ("cifs_umount failed with return code %d", rc));
179 } 208 }
209#ifdef CONFIG_CIFS_DFS_UPCALL
210 if (cifs_sb->mountdata) {
211 kfree(cifs_sb->mountdata);
212 cifs_sb->mountdata = NULL;
213 }
214#endif
215
180 unload_nls(cifs_sb->local_nls); 216 unload_nls(cifs_sb->local_nls);
181 kfree(cifs_sb); 217 kfree(cifs_sb);
182 return; 218 return;
@@ -435,6 +471,10 @@ static void cifs_umount_begin(struct vfsmount *vfsmnt, int flags)
435 struct cifs_sb_info *cifs_sb; 471 struct cifs_sb_info *cifs_sb;
436 struct cifsTconInfo *tcon; 472 struct cifsTconInfo *tcon;
437 473
474#ifdef CONFIG_CIFS_DFS_UPCALL
475 dfs_shrink_umount_helper(vfsmnt);
476#endif /* CONFIG CIFS_DFS_UPCALL */
477
438 if (!(flags & MNT_FORCE)) 478 if (!(flags & MNT_FORCE))
439 return; 479 return;
440 cifs_sb = CIFS_SB(vfsmnt->mnt_sb); 480 cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
@@ -552,7 +592,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
552 return remote_llseek(file, offset, origin); 592 return remote_llseek(file, offset, origin);
553} 593}
554 594
555static struct file_system_type cifs_fs_type = { 595struct file_system_type cifs_fs_type = {
556 .owner = THIS_MODULE, 596 .owner = THIS_MODULE,
557 .name = "cifs", 597 .name = "cifs",
558 .get_sb = cifs_get_sb, 598 .get_sb = cifs_get_sb,
@@ -1015,11 +1055,16 @@ init_cifs(void)
1015 if (rc) 1055 if (rc)
1016 goto out_unregister_filesystem; 1056 goto out_unregister_filesystem;
1017#endif 1057#endif
1058#ifdef CONFIG_CIFS_DFS_UPCALL
1059 rc = register_key_type(&key_type_dns_resolver);
1060 if (rc)
1061 goto out_unregister_key_type;
1062#endif
1018 oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); 1063 oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
1019 if (IS_ERR(oplockThread)) { 1064 if (IS_ERR(oplockThread)) {
1020 rc = PTR_ERR(oplockThread); 1065 rc = PTR_ERR(oplockThread);
1021 cERROR(1, ("error %d create oplock thread", rc)); 1066 cERROR(1, ("error %d create oplock thread", rc));
1022 goto out_unregister_key_type; 1067 goto out_unregister_dfs_key_type;
1023 } 1068 }
1024 1069
1025 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); 1070 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
@@ -1033,7 +1078,11 @@ init_cifs(void)
1033 1078
1034 out_stop_oplock_thread: 1079 out_stop_oplock_thread:
1035 kthread_stop(oplockThread); 1080 kthread_stop(oplockThread);
1081 out_unregister_dfs_key_type:
1082#ifdef CONFIG_CIFS_DFS_UPCALL
1083 unregister_key_type(&key_type_dns_resolver);
1036 out_unregister_key_type: 1084 out_unregister_key_type:
1085#endif
1037#ifdef CONFIG_CIFS_UPCALL 1086#ifdef CONFIG_CIFS_UPCALL
1038 unregister_key_type(&cifs_spnego_key_type); 1087 unregister_key_type(&cifs_spnego_key_type);
1039 out_unregister_filesystem: 1088 out_unregister_filesystem:
@@ -1059,6 +1108,9 @@ exit_cifs(void)
1059#ifdef CONFIG_PROC_FS 1108#ifdef CONFIG_PROC_FS
1060 cifs_proc_clean(); 1109 cifs_proc_clean();
1061#endif 1110#endif
1111#ifdef CONFIG_CIFS_DFS_UPCALL
1112 unregister_key_type(&key_type_dns_resolver);
1113#endif
1062#ifdef CONFIG_CIFS_UPCALL 1114#ifdef CONFIG_CIFS_UPCALL
1063 unregister_key_type(&cifs_spnego_key_type); 1115 unregister_key_type(&cifs_spnego_key_type);
1064#endif 1116#endif
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2a21dc66f0de..195b14de5567 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,6 +32,7 @@
32#define TRUE 1 32#define TRUE 1
33#endif 33#endif
34 34
35extern struct file_system_type cifs_fs_type;
35extern const struct address_space_operations cifs_addr_ops; 36extern const struct address_space_operations cifs_addr_ops;
36extern const struct address_space_operations cifs_addr_ops_smallbuf; 37extern const struct address_space_operations cifs_addr_ops_smallbuf;
37 38
@@ -60,6 +61,10 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
60 61
61extern const struct inode_operations cifs_file_inode_ops; 62extern const struct inode_operations cifs_file_inode_ops;
62extern const struct inode_operations cifs_symlink_inode_ops; 63extern const struct inode_operations cifs_symlink_inode_ops;
64extern struct list_head cifs_dfs_automount_list;
65extern struct inode_operations cifs_dfs_referral_inode_operations;
66
67
63 68
64/* Functions related to files and directories */ 69/* Functions related to files and directories */
65extern const struct file_operations cifs_file_ops; 70extern const struct file_operations cifs_file_ops;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1fde2197ad76..5d32d8ddc82e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsglob.h 2 * fs/cifs/cifsglob.h
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2007 4 * Copyright (C) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * Jeremy Allison (jra@samba.org) 6 * Jeremy Allison (jra@samba.org)
7 * 7 *
@@ -70,14 +70,6 @@
70#endif 70#endif
71 71
72/* 72/*
73 * This information is kept on every Server we know about.
74 *
75 * Some things to note:
76 *
77 */
78#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
79
80/*
81 * CIFS vfs client Status information (based on what we know.) 73 * CIFS vfs client Status information (based on what we know.)
82 */ 74 */
83 75
@@ -460,6 +452,37 @@ struct dir_notify_req {
460 struct file *pfile; 452 struct file *pfile;
461}; 453};
462 454
455struct dfs_info3_param {
456 int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/
457 int PathConsumed;
458 int server_type;
459 int ref_flag;
460 char *path_name;
461 char *node_name;
462};
463
464static inline void free_dfs_info_param(struct dfs_info3_param *param)
465{
466 if (param) {
467 kfree(param->path_name);
468 kfree(param->node_name);
469 kfree(param);
470 }
471}
472
473static inline void free_dfs_info_array(struct dfs_info3_param *param,
474 int number_of_items)
475{
476 int i;
477 if ((number_of_items == 0) || (param == NULL))
478 return;
479 for (i = 0; i < number_of_items; i++) {
480 kfree(param[i].path_name);
481 kfree(param[i].node_name);
482 }
483 kfree(param);
484}
485
463#define MID_FREE 0 486#define MID_FREE 0
464#define MID_REQUEST_ALLOCATED 1 487#define MID_REQUEST_ALLOCATED 1
465#define MID_REQUEST_SUBMITTED 2 488#define MID_REQUEST_SUBMITTED 2
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index dbe6b846f37f..47f79504f57b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -237,6 +237,9 @@
237 | DELETE | READ_CONTROL | WRITE_DAC \ 237 | DELETE | READ_CONTROL | WRITE_DAC \
238 | WRITE_OWNER | SYNCHRONIZE) 238 | WRITE_OWNER | SYNCHRONIZE)
239 239
240#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \
241 | READ_CONTROL | SYNCHRONIZE)
242
240 243
241/* 244/*
242 * Invalid readdir handle 245 * Invalid readdir handle
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8350eec49663..2f09f565a3d9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsproto.h 2 * fs/cifs/cifsproto.h
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2007 4 * Copyright (c) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -97,11 +97,14 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
97 const unsigned char *search_path, 97 const unsigned char *search_path,
98 struct super_block *sb, int xid); 98 struct super_block *sb, int xid);
99extern void acl_to_uid_mode(struct inode *inode, const char *search_path); 99extern void acl_to_uid_mode(struct inode *inode, const char *search_path);
100extern int mode_to_acl(struct inode *inode, const char *path); 100extern int mode_to_acl(struct inode *inode, const char *path, __u64);
101 101
102extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 102extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
103 const char *); 103 const char *);
104extern int cifs_umount(struct super_block *, struct cifs_sb_info *); 104extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
105#ifdef CONFIG_CIFS_DFS_UPCALL
106extern void dfs_shrink_umount_helper(struct vfsmount *vfsmnt);
107#endif
105void cifs_proc_init(void); 108void cifs_proc_init(void);
106void cifs_proc_clean(void); 109void cifs_proc_clean(void);
107 110
@@ -153,7 +156,7 @@ extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
153 const char *old_path, 156 const char *old_path,
154 const struct nls_table *nls_codepage, 157 const struct nls_table *nls_codepage,
155 unsigned int *pnum_referrals, 158 unsigned int *pnum_referrals,
156 unsigned char **preferrals, 159 struct dfs_info3_param **preferrals,
157 int remap); 160 int remap);
158extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 161extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
159 struct super_block *sb, struct smb_vol *vol); 162 struct super_block *sb, struct smb_vol *vol);
@@ -342,6 +345,8 @@ extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
342 const struct nls_table *nls_codepage, int remap_special_chars); 345 const struct nls_table *nls_codepage, int remap_special_chars);
343extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, 346extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
344 __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); 347 __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
348extern int CIFSSMBSetCIFSACL(const int, struct cifsTconInfo *, __u16,
349 struct cifs_ntsd *, __u32);
345extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon, 350extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
346 const unsigned char *searchName, 351 const unsigned char *searchName,
347 char *acl_inf, const int buflen, const int acl_type, 352 char *acl_inf, const int buflen, const int acl_type,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9e8a6bef029a..9409524e4bf8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3156,6 +3156,71 @@ qsec_out:
3156/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 3156/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
3157 return rc; 3157 return rc;
3158} 3158}
3159
3160int
3161CIFSSMBSetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3162 struct cifs_ntsd *pntsd, __u32 acllen)
3163{
3164 __u16 byte_count, param_count, data_count, param_offset, data_offset;
3165 int rc = 0;
3166 int bytes_returned = 0;
3167 SET_SEC_DESC_REQ *pSMB = NULL;
3168 NTRANSACT_RSP *pSMBr = NULL;
3169
3170setCifsAclRetry:
3171 rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB,
3172 (void **) &pSMBr);
3173 if (rc)
3174 return (rc);
3175
3176 pSMB->MaxSetupCount = 0;
3177 pSMB->Reserved = 0;
3178
3179 param_count = 8;
3180 param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4;
3181 data_count = acllen;
3182 data_offset = param_offset + param_count;
3183 byte_count = 3 /* pad */ + param_count;
3184
3185 pSMB->DataCount = cpu_to_le32(data_count);
3186 pSMB->TotalDataCount = pSMB->DataCount;
3187 pSMB->MaxParameterCount = cpu_to_le32(4);
3188 pSMB->MaxDataCount = cpu_to_le32(16384);
3189 pSMB->ParameterCount = cpu_to_le32(param_count);
3190 pSMB->ParameterOffset = cpu_to_le32(param_offset);
3191 pSMB->TotalParameterCount = pSMB->ParameterCount;
3192 pSMB->DataOffset = cpu_to_le32(data_offset);
3193 pSMB->SetupCount = 0;
3194 pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC);
3195 pSMB->ByteCount = cpu_to_le16(byte_count+data_count);
3196
3197 pSMB->Fid = fid; /* file handle always le */
3198 pSMB->Reserved2 = 0;
3199 pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL);
3200
3201 if (pntsd && acllen) {
3202 memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
3203 (char *) pntsd,
3204 acllen);
3205 pSMB->hdr.smb_buf_length += (byte_count + data_count);
3206
3207 } else
3208 pSMB->hdr.smb_buf_length += byte_count;
3209
3210 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3211 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3212
3213 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc));
3214 if (rc)
3215 cFYI(1, ("Set CIFS ACL returned %d", rc));
3216 cifs_buf_release(pSMB);
3217
3218 if (rc == -EAGAIN)
3219 goto setCifsAclRetry;
3220
3221 return (rc);
3222}
3223
3159#endif /* CONFIG_CIFS_EXPERIMENTAL */ 3224#endif /* CONFIG_CIFS_EXPERIMENTAL */
3160 3225
3161/* Legacy Query Path Information call for lookup to old servers such 3226/* Legacy Query Path Information call for lookup to old servers such
@@ -5499,7 +5564,7 @@ SetEARetry:
5499 else 5564 else
5500 name_len = strnlen(ea_name, 255); 5565 name_len = strnlen(ea_name, 255);
5501 5566
5502 count = sizeof(*parm_data) + ea_value_len + name_len + 1; 5567 count = sizeof(*parm_data) + ea_value_len + name_len;
5503 pSMB->MaxParameterCount = cpu_to_le16(2); 5568 pSMB->MaxParameterCount = cpu_to_le16(2);
5504 pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */ 5569 pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */
5505 pSMB->MaxSetupCount = 0; 5570 pSMB->MaxSetupCount = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index fd9147cdb5a9..65d0ba72e78f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/connect.c 2 * fs/cifs/connect.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2007 4 * Copyright (C) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -1410,7 +1410,7 @@ connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
1410 const char *old_path, const struct nls_table *nls_codepage, 1410 const char *old_path, const struct nls_table *nls_codepage,
1411 int remap) 1411 int remap)
1412{ 1412{
1413 unsigned char *referrals = NULL; 1413 struct dfs_info3_param *referrals = NULL;
1414 unsigned int num_referrals; 1414 unsigned int num_referrals;
1415 int rc = 0; 1415 int rc = 0;
1416 1416
@@ -1429,12 +1429,14 @@ connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
1429int 1429int
1430get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 1430get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1431 const struct nls_table *nls_codepage, unsigned int *pnum_referrals, 1431 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
1432 unsigned char **preferrals, int remap) 1432 struct dfs_info3_param **preferrals, int remap)
1433{ 1433{
1434 char *temp_unc; 1434 char *temp_unc;
1435 int rc = 0; 1435 int rc = 0;
1436 unsigned char *targetUNCs;
1436 1437
1437 *pnum_referrals = 0; 1438 *pnum_referrals = 0;
1439 *preferrals = NULL;
1438 1440
1439 if (pSesInfo->ipc_tid == 0) { 1441 if (pSesInfo->ipc_tid == 0) {
1440 temp_unc = kmalloc(2 /* for slashes */ + 1442 temp_unc = kmalloc(2 /* for slashes */ +
@@ -1454,8 +1456,10 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1454 kfree(temp_unc); 1456 kfree(temp_unc);
1455 } 1457 }
1456 if (rc == 0) 1458 if (rc == 0)
1457 rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals, 1459 rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, &targetUNCs,
1458 pnum_referrals, nls_codepage, remap); 1460 pnum_referrals, nls_codepage, remap);
1461 /* BB map targetUNCs to dfs_info3 structures, here or
1462 in CIFSGetDFSRefer BB */
1459 1463
1460 return rc; 1464 return rc;
1461} 1465}
@@ -1964,7 +1968,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1964 1968
1965 if (existingCifsSes) { 1969 if (existingCifsSes) {
1966 pSesInfo = existingCifsSes; 1970 pSesInfo = existingCifsSes;
1967 cFYI(1, ("Existing smb sess found")); 1971 cFYI(1, ("Existing smb sess found (status=%d)",
1972 pSesInfo->status));
1973 down(&pSesInfo->sesSem);
1974 if (pSesInfo->status == CifsNeedReconnect) {
1975 cFYI(1, ("Session needs reconnect"));
1976 rc = cifs_setup_session(xid, pSesInfo,
1977 cifs_sb->local_nls);
1978 }
1979 up(&pSesInfo->sesSem);
1968 } else if (!rc) { 1980 } else if (!rc) {
1969 cFYI(1, ("Existing smb sess not found")); 1981 cFYI(1, ("Existing smb sess not found"));
1970 pSesInfo = sesInfoAlloc(); 1982 pSesInfo = sesInfoAlloc();
@@ -3514,7 +3526,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3514 sesInfoFree(ses); 3526 sesInfoFree(ses);
3515 3527
3516 FreeXid(xid); 3528 FreeXid(xid);
3517 return rc; /* BB check if we should always return zero here */ 3529 return rc;
3518} 3530}
3519 3531
3520int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 3532int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 37dc97af1487..699ec1198409 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -517,12 +517,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
517 d_add(direntry, NULL); 517 d_add(direntry, NULL);
518 /* if it was once a directory (but how can we tell?) we could do 518 /* if it was once a directory (but how can we tell?) we could do
519 shrink_dcache_parent(direntry); */ 519 shrink_dcache_parent(direntry); */
520 } else { 520 } else if (rc != -EACCES) {
521 cERROR(1, ("Error 0x%x on cifs_get_inode_info in lookup of %s", 521 cERROR(1, ("Unexpected lookup error %d", rc));
522 rc, full_path)); 522 /* We special case check for Access Denied - since that
523 /* BB special case check for Access Denied - watch security 523 is a common return code */
524 exposure of returning dir info implicitly via different rc
525 if file exists or not but no access BB */
526 } 524 }
527 525
528 kfree(full_path); 526 kfree(full_path);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
new file mode 100644
index 000000000000..ef7f43824347
--- /dev/null
+++ b/fs/cifs/dns_resolve.c
@@ -0,0 +1,124 @@
1/*
2 * fs/cifs/dns_resolve.c
3 *
4 * Copyright (c) 2007 Igor Mammedov
5 * Author(s): Igor Mammedov (niallain@gmail.com)
6 * Steve French (sfrench@us.ibm.com)
7 *
8 * Contains the CIFS DFS upcall routines used for hostname to
9 * IP address translation.
10 *
11 * This library is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License as published
13 * by the Free Software Foundation; either version 2.1 of the License, or
14 * (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
19 * the GNU Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public License
22 * along with this library; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#include <keys/user-type.h>
27#include "dns_resolve.h"
28#include "cifsglob.h"
29#include "cifsproto.h"
30#include "cifs_debug.h"
31
32static int dns_resolver_instantiate(struct key *key, const void *data,
33 size_t datalen)
34{
35 int rc = 0;
36 char *ip;
37
38 ip = kmalloc(datalen+1, GFP_KERNEL);
39 if (!ip)
40 return -ENOMEM;
41
42 memcpy(ip, data, datalen);
43 ip[datalen] = '\0';
44
45 rcu_assign_pointer(key->payload.data, ip);
46
47 return rc;
48}
49
50struct key_type key_type_dns_resolver = {
51 .name = "dns_resolver",
52 .def_datalen = sizeof(struct in_addr),
53 .describe = user_describe,
54 .instantiate = dns_resolver_instantiate,
55 .match = user_match,
56};
57
58
59/* Resolves server name to ip address.
60 * input:
61 * unc - server UNC
62 * output:
63 * *ip_addr - pointer to server ip, caller responcible for freeing it.
64 * return 0 on success
65 */
66int
67dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
68{
69 int rc = -EAGAIN;
70 struct key *rkey;
71 char *name;
72 int len;
73
74 if (!ip_addr || !unc)
75 return -EINVAL;
76
77 /* search for server name delimiter */
78 len = strlen(unc);
79 if (len < 3) {
80 cFYI(1, ("%s: unc is too short: %s", __FUNCTION__, unc));
81 return -EINVAL;
82 }
83 len -= 2;
84 name = memchr(unc+2, '\\', len);
85 if (!name) {
86 cFYI(1, ("%s: probably server name is whole unc: %s",
87 __FUNCTION__, unc));
88 } else {
89 len = (name - unc) - 2/* leading // */;
90 }
91
92 name = kmalloc(len+1, GFP_KERNEL);
93 if (!name) {
94 rc = -ENOMEM;
95 return rc;
96 }
97 memcpy(name, unc+2, len);
98 name[len] = 0;
99
100 rkey = request_key(&key_type_dns_resolver, name, "");
101 if (!IS_ERR(rkey)) {
102 len = strlen(rkey->payload.data);
103 *ip_addr = kmalloc(len+1, GFP_KERNEL);
104 if (*ip_addr) {
105 memcpy(*ip_addr, rkey->payload.data, len);
106 (*ip_addr)[len] = '\0';
107 cFYI(1, ("%s: resolved: %s to %s", __FUNCTION__,
108 rkey->description,
109 *ip_addr
110 ));
111 rc = 0;
112 } else {
113 rc = -ENOMEM;
114 }
115 key_put(rkey);
116 } else {
117 cERROR(1, ("%s: unable to resolve: %s", __FUNCTION__, name));
118 }
119
120 kfree(name);
121 return rc;
122}
123
124
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
new file mode 100644
index 000000000000..073fdc3db419
--- /dev/null
+++ b/fs/cifs/dns_resolve.h
@@ -0,0 +1,32 @@
1/*
2 * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
3 * Handles host name to IP address resolution
4 *
5 * Copyright (c) International Business Machines Corp., 2008
6 * Author(s): Steve French (sfrench@us.ibm.com)
7 *
8 * This library is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as published
10 * by the Free Software Foundation; either version 2.1 of the License, or
11 * (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
16 * the GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this library; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _DNS_RESOLVE_H
24#define _DNS_RESOLVE_H
25
26#ifdef __KERNEL__
27#include <linux/key-type.h>
28extern struct key_type key_type_dns_resolver;
29extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
30#endif /* KERNEL */
31
32#endif /* _DNS_RESOLVE_H */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index dd26e2759b17..5f7c374ae89c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1179,12 +1179,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1179 atomic_dec(&open_file->wrtPending); 1179 atomic_dec(&open_file->wrtPending);
1180 /* Does mm or vfs already set times? */ 1180 /* Does mm or vfs already set times? */
1181 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1181 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
1182 if ((bytes_written > 0) && (offset)) { 1182 if ((bytes_written > 0) && (offset))
1183 rc = 0; 1183 rc = 0;
1184 } else if (bytes_written < 0) { 1184 else if (bytes_written < 0)
1185 if (rc != -EBADF) 1185 rc = bytes_written;
1186 rc = bytes_written;
1187 }
1188 } else { 1186 } else {
1189 cFYI(1, ("No writeable filehandles for inode")); 1187 cFYI(1, ("No writeable filehandles for inode"));
1190 rc = -EIO; 1188 rc = -EIO;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e915eb1d2e66..d9567ba2960b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -54,9 +54,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
54 MAX_TREE_SIZE + 1) + 54 MAX_TREE_SIZE + 1) +
55 strnlen(search_path, MAX_PATHCONF) + 1, 55 strnlen(search_path, MAX_PATHCONF) + 1,
56 GFP_KERNEL); 56 GFP_KERNEL);
57 if (tmp_path == NULL) { 57 if (tmp_path == NULL)
58 return -ENOMEM; 58 return -ENOMEM;
59 } 59
60 /* have to skip first of the double backslash of 60 /* have to skip first of the double backslash of
61 UNC name */ 61 UNC name */
62 strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE); 62 strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE);
@@ -511,7 +511,8 @@ int cifs_get_inode_info(struct inode **pinode,
511 } 511 }
512 512
513 spin_lock(&inode->i_lock); 513 spin_lock(&inode->i_lock);
514 if (is_size_safe_to_change(cifsInfo, le64_to_cpu(pfindData->EndOfFile))) { 514 if (is_size_safe_to_change(cifsInfo,
515 le64_to_cpu(pfindData->EndOfFile))) {
515 /* can not safely shrink the file size here if the 516 /* can not safely shrink the file size here if the
516 client is writing to it due to potential races */ 517 client is writing to it due to potential races */
517 i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); 518 i_size_write(inode, le64_to_cpu(pfindData->EndOfFile));
@@ -931,7 +932,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
931 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 932 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
932 le64_to_cpu(pTcon->fsUnixInfo.Capability))) { 933 le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
933 u32 oplock = 0; 934 u32 oplock = 0;
934 FILE_UNIX_BASIC_INFO * pInfo = 935 FILE_UNIX_BASIC_INFO *pInfo =
935 kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 936 kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
936 if (pInfo == NULL) { 937 if (pInfo == NULL) {
937 rc = -ENOMEM; 938 rc = -ENOMEM;
@@ -1607,7 +1608,14 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
1607 CIFS_MOUNT_MAP_SPECIAL_CHR); 1608 CIFS_MOUNT_MAP_SPECIAL_CHR);
1608 else if (attrs->ia_valid & ATTR_MODE) { 1609 else if (attrs->ia_valid & ATTR_MODE) {
1609 rc = 0; 1610 rc = 0;
1610 if ((mode & S_IWUGO) == 0) /* not writeable */ { 1611#ifdef CONFIG_CIFS_EXPERIMENTAL
1612 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
1613 rc = mode_to_acl(direntry->d_inode, full_path, mode);
1614 else if ((mode & S_IWUGO) == 0) {
1615#else
1616 if ((mode & S_IWUGO) == 0) {
1617#endif
1618 /* not writeable */
1611 if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) { 1619 if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
1612 set_dosattr = TRUE; 1620 set_dosattr = TRUE;
1613 time_buf.Attributes = 1621 time_buf.Attributes =
@@ -1626,10 +1634,10 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
1626 if (time_buf.Attributes == 0) 1634 if (time_buf.Attributes == 0)
1627 time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); 1635 time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
1628 } 1636 }
1629 /* BB to be implemented - 1637#ifdef CONFIG_CIFS_EXPERIMENTAL
1630 via Windows security descriptors or streams */ 1638 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
1631 /* CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, uid, gid, 1639 mode_to_acl(direntry->d_inode, full_path, mode);
1632 cifs_sb->local_nls); */ 1640#endif
1633 } 1641 }
1634 1642
1635 if (attrs->ia_valid & ATTR_ATIME) { 1643 if (attrs->ia_valid & ATTR_ATIME) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 11f265726db7..1d6fb01b8e6d 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/link.c 2 * fs/cifs/link.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2003 4 * Copyright (C) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -236,8 +236,6 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
236 char *full_path = NULL; 236 char *full_path = NULL;
237 char *tmp_path = NULL; 237 char *tmp_path = NULL;
238 char *tmpbuffer; 238 char *tmpbuffer;
239 unsigned char *referrals = NULL;
240 unsigned int num_referrals = 0;
241 int len; 239 int len;
242 __u16 fid; 240 __u16 fid;
243 241
@@ -297,8 +295,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
297 cFYI(1, ("Error closing junction point " 295 cFYI(1, ("Error closing junction point "
298 "(open for ioctl)")); 296 "(open for ioctl)"));
299 } 297 }
298 /* BB unwind this long, nested function, or remove BB */
300 if (rc == -EIO) { 299 if (rc == -EIO) {
301 /* Query if DFS Junction */ 300 /* Query if DFS Junction */
301 unsigned int num_referrals = 0;
302 struct dfs_info3_param *refs = NULL;
302 tmp_path = 303 tmp_path =
303 kmalloc(MAX_TREE_SIZE + MAX_PATHCONF + 1, 304 kmalloc(MAX_TREE_SIZE + MAX_PATHCONF + 1,
304 GFP_KERNEL); 305 GFP_KERNEL);
@@ -310,7 +311,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
310 rc = get_dfs_path(xid, pTcon->ses, 311 rc = get_dfs_path(xid, pTcon->ses,
311 tmp_path, 312 tmp_path,
312 cifs_sb->local_nls, 313 cifs_sb->local_nls,
313 &num_referrals, &referrals, 314 &num_referrals, &refs,
314 cifs_sb->mnt_cifs_flags & 315 cifs_sb->mnt_cifs_flags &
315 CIFS_MOUNT_MAP_SPECIAL_CHR); 316 CIFS_MOUNT_MAP_SPECIAL_CHR);
316 cFYI(1, ("Get DFS for %s rc = %d ", 317 cFYI(1, ("Get DFS for %s rc = %d ",
@@ -320,14 +321,13 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
320 else { 321 else {
321 cFYI(1, ("num referral: %d", 322 cFYI(1, ("num referral: %d",
322 num_referrals)); 323 num_referrals));
323 if (referrals) { 324 if (refs && refs->path_name) {
324 cFYI(1,("referral string: %s", referrals));
325 strncpy(tmpbuffer, 325 strncpy(tmpbuffer,
326 referrals, 326 refs->path_name,
327 len-1); 327 len-1);
328 } 328 }
329 } 329 }
330 kfree(referrals); 330 kfree(refs);
331 kfree(tmp_path); 331 kfree(tmp_path);
332} 332}
333 /* BB add code like else decode referrals 333 /* BB add code like else decode referrals
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d0cb469daab7..d2153abcba6d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -528,9 +528,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
528 rc = -EOVERFLOW; 528 rc = -EOVERFLOW;
529 goto ssetup_exit; 529 goto ssetup_exit;
530 } 530 }
531 ses->server->mac_signing_key.len = msg->sesskey_len; 531 if (first_time) {
532 memcpy(ses->server->mac_signing_key.data.krb5, msg->data, 532 ses->server->mac_signing_key.len = msg->sesskey_len;
533 msg->sesskey_len); 533 memcpy(ses->server->mac_signing_key.data.krb5,
534 msg->data, msg->sesskey_len);
535 }
534 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 536 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
535 capabilities |= CAP_EXTENDED_SECURITY; 537 capabilities |= CAP_EXTENDED_SECURITY;
536 pSMB->req.Capabilities = cpu_to_le32(capabilities); 538 pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -540,7 +542,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
540 542
541 if (ses->capabilities & CAP_UNICODE) { 543 if (ses->capabilities & CAP_UNICODE) {
542 /* unicode strings must be word aligned */ 544 /* unicode strings must be word aligned */
543 if (iov[0].iov_len % 2) { 545 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
544 *bcc_ptr = 0; 546 *bcc_ptr = 0;
545 bcc_ptr++; 547 bcc_ptr++;
546 } 548 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index dcc6aead70f5..e3eb3556622b 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,8 @@ static int init_coda_psdev(void)
362 goto out_chrdev; 362 goto out_chrdev;
363 } 363 }
364 for (i = 0; i < MAX_CODADEVS; i++) 364 for (i = 0; i < MAX_CODADEVS; i++)
365 class_device_create(coda_psdev_class, NULL, 365 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i); 366 MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
367 coda_sysctl_init(); 367 coda_sysctl_init();
368 goto out; 368 goto out;
369 369
@@ -405,7 +405,7 @@ static int __init init_coda(void)
405 return 0; 405 return 0;
406out: 406out:
407 for (i = 0; i < MAX_CODADEVS; i++) 407 for (i = 0; i < MAX_CODADEVS; i++)
408 class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); 408 device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
409 class_destroy(coda_psdev_class); 409 class_destroy(coda_psdev_class);
410 unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); 410 unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
411 coda_sysctl_clean(); 411 coda_sysctl_clean();
@@ -424,7 +424,7 @@ static void __exit exit_coda(void)
424 printk("coda: failed to unregister filesystem\n"); 424 printk("coda: failed to unregister filesystem\n");
425 } 425 }
426 for (i = 0; i < MAX_CODADEVS; i++) 426 for (i = 0; i < MAX_CODADEVS; i++)
427 class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); 427 device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
428 class_destroy(coda_psdev_class); 428 class_destroy(coda_psdev_class);
429 unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); 429 unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
430 coda_sysctl_clean(); 430 coda_sysctl_clean();
diff --git a/fs/compat.c b/fs/compat.c
index 15078ce4c04a..5216c3fd7517 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1104,10 +1104,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1104 if (ret < 0) 1104 if (ret < 0)
1105 goto out; 1105 goto out;
1106 1106
1107 ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
1108 if (ret)
1109 goto out;
1110
1111 fnv = NULL; 1107 fnv = NULL;
1112 if (type == READ) { 1108 if (type == READ) {
1113 fn = file->f_op->read; 1109 fn = file->f_op->read;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
new file mode 100644
index 000000000000..0adced2f296f
--- /dev/null
+++ b/fs/compat_binfmt_elf.c
@@ -0,0 +1,131 @@
1/*
2 * 32-bit compatibility support for ELF format executables and core dumps.
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * Red Hat Author: Roland McGrath.
11 *
12 * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
13 * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros
14 * used below, with definitions appropriate for 32-bit ABI compatibility.
15 *
16 * We use macros to rename the ABI types and machine-dependent
17 * functions used in binfmt_elf.c to compat versions.
18 */
19
20#include <linux/elfcore-compat.h>
21#include <linux/time.h>
22
23/*
24 * Rename the basic ELF layout types to refer to the 32-bit class of files.
25 */
26#undef ELF_CLASS
27#define ELF_CLASS ELFCLASS32
28
29#undef elfhdr
30#undef elf_phdr
31#undef elf_note
32#undef elf_addr_t
33#define elfhdr elf32_hdr
34#define elf_phdr elf32_phdr
35#define elf_note elf32_note
36#define elf_addr_t Elf32_Addr
37
38/*
39 * The machine-dependent core note format types are defined in elfcore-compat.h,
40 * which requires asm/elf.h to define compat_elf_gregset_t et al.
41 */
42#define elf_prstatus compat_elf_prstatus
43#define elf_prpsinfo compat_elf_prpsinfo
44
45/*
46 * Compat version of cputime_to_compat_timeval, perhaps this
47 * should be an inline in <linux/compat.h>.
48 */
49static void cputime_to_compat_timeval(const cputime_t cputime,
50 struct compat_timeval *value)
51{
52 struct timeval tv;
53 cputime_to_timeval(cputime, &tv);
54 value->tv_sec = tv.tv_sec;
55 value->tv_usec = tv.tv_usec;
56}
57
58#undef cputime_to_timeval
59#define cputime_to_timeval cputime_to_compat_timeval
60
61
62/*
63 * To use this file, asm/elf.h must define compat_elf_check_arch.
64 * The other following macros can be defined if the compat versions
65 * differ from the native ones, or omitted when they match.
66 */
67
68#undef ELF_ARCH
69#undef elf_check_arch
70#define elf_check_arch compat_elf_check_arch
71
72#ifdef COMPAT_ELF_PLATFORM
73#undef ELF_PLATFORM
74#define ELF_PLATFORM COMPAT_ELF_PLATFORM
75#endif
76
77#ifdef COMPAT_ELF_HWCAP
78#undef ELF_HWCAP
79#define ELF_HWCAP COMPAT_ELF_HWCAP
80#endif
81
82#ifdef COMPAT_ARCH_DLINFO
83#undef ARCH_DLINFO
84#define ARCH_DLINFO COMPAT_ARCH_DLINFO
85#endif
86
87#ifdef COMPAT_ELF_ET_DYN_BASE
88#undef ELF_ET_DYN_BASE
89#define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE
90#endif
91
92#ifdef COMPAT_ELF_EXEC_PAGESIZE
93#undef ELF_EXEC_PAGESIZE
94#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE
95#endif
96
97#ifdef COMPAT_ELF_PLAT_INIT
98#undef ELF_PLAT_INIT
99#define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT
100#endif
101
102#ifdef COMPAT_SET_PERSONALITY
103#undef SET_PERSONALITY
104#define SET_PERSONALITY COMPAT_SET_PERSONALITY
105#endif
106
107#ifdef compat_start_thread
108#undef start_thread
109#define start_thread compat_start_thread
110#endif
111
112#ifdef compat_arch_setup_additional_pages
113#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES
114#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
115#undef arch_setup_additional_pages
116#define arch_setup_additional_pages compat_arch_setup_additional_pages
117#endif
118
119/*
120 * Rename a few of the symbols that binfmt_elf.c will define.
121 * These are all local so the names don't really matter, but it
122 * might make some debugging less confusing not to duplicate them.
123 */
124#define elf_format compat_elf_format
125#define init_elf_binfmt init_compat_elf_binfmt
126#define exit_elf_binfmt exit_compat_elf_binfmt
127
128/*
129 * We share all the actual code with the native (64-bit) version.
130 */
131#include "binfmt_elf.c"
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e8b7c3a98a54..ffdc022cae64 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,6 +10,8 @@
10 * ioctls. 10 * ioctls.
11 */ 11 */
12 12
13#include <linux/joystick.h>
14
13#include <linux/types.h> 15#include <linux/types.h>
14#include <linux/compat.h> 16#include <linux/compat.h>
15#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -1374,7 +1376,7 @@ static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg)
1374 return -EINVAL; 1376 return -EINVAL;
1375} 1377}
1376 1378
1377static __attribute_used__ int 1379static __used int
1378ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg) 1380ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
1379{ 1381{
1380 return -EINVAL; 1382 return -EINVAL;
@@ -2642,6 +2644,12 @@ COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
2642COMPATIBLE_IOCTL(VIDEO_GET_SIZE) 2644COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
2643COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) 2645COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
2644 2646
2647/* joystick */
2648COMPATIBLE_IOCTL(JSIOCGVERSION)
2649COMPATIBLE_IOCTL(JSIOCGAXES)
2650COMPATIBLE_IOCTL(JSIOCGBUTTONS)
2651COMPATIBLE_IOCTL(JSIOCGNAME(0))
2652
2645/* now things that need handlers */ 2653/* now things that need handlers */
2646HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob) 2654HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
2647HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob) 2655HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 50ed691098bc..a48dc7dd8765 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
546 * That said, taking our i_mutex is closer to mkdir 546 * That said, taking our i_mutex is closer to mkdir
547 * emulation, and shouldn't hurt. 547 * emulation, and shouldn't hurt.
548 */ 548 */
549 mutex_lock(&dentry->d_inode->i_mutex); 549 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
550 550
551 for (i = 0; group->default_groups[i]; i++) { 551 for (i = 0; group->default_groups[i]; i++) {
552 new_group = group->default_groups[i]; 552 new_group = group->default_groups[i];
@@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1405 sd = configfs_sb->s_root->d_fsdata; 1405 sd = configfs_sb->s_root->d_fsdata;
1406 link_group(to_config_group(sd->s_element), group); 1406 link_group(to_config_group(sd->s_element), group);
1407 1407
1408 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); 1408 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
1409 I_MUTEX_PARENT);
1409 1410
1410 name.name = group->cg_item.ci_name; 1411 name.name = group->cg_item.ci_name;
1411 name.len = strlen(name.name); 1412 name.len = strlen(name.name);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index a3658f9a082c..397cb503a180 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
320 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; 320 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
321 int error = 0; 321 int error = 0;
322 322
323 mutex_lock(&dir->d_inode->i_mutex); 323 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
324 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); 324 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
325 mutex_unlock(&dir->d_inode->i_mutex); 325 mutex_unlock(&dir->d_inode->i_mutex);
326 326
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3bf0278ea843..de3b31d0a37d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -128,7 +128,7 @@ void configfs_release_fs(void)
128} 128}
129 129
130 130
131static decl_subsys(config, NULL, NULL); 131static struct kobject *config_kobj;
132 132
133static int __init configfs_init(void) 133static int __init configfs_init(void)
134{ 134{
@@ -140,9 +140,8 @@ static int __init configfs_init(void)
140 if (!configfs_dir_cachep) 140 if (!configfs_dir_cachep)
141 goto out; 141 goto out;
142 142
143 kobj_set_kset_s(&config_subsys, kernel_subsys); 143 config_kobj = kobject_create_and_add("config", kernel_kobj);
144 err = subsystem_register(&config_subsys); 144 if (!config_kobj) {
145 if (err) {
146 kmem_cache_destroy(configfs_dir_cachep); 145 kmem_cache_destroy(configfs_dir_cachep);
147 configfs_dir_cachep = NULL; 146 configfs_dir_cachep = NULL;
148 goto out; 147 goto out;
@@ -151,7 +150,7 @@ static int __init configfs_init(void)
151 err = register_filesystem(&configfs_fs_type); 150 err = register_filesystem(&configfs_fs_type);
152 if (err) { 151 if (err) {
153 printk(KERN_ERR "configfs: Unable to register filesystem!\n"); 152 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
154 subsystem_unregister(&config_subsys); 153 kobject_put(config_kobj);
155 kmem_cache_destroy(configfs_dir_cachep); 154 kmem_cache_destroy(configfs_dir_cachep);
156 configfs_dir_cachep = NULL; 155 configfs_dir_cachep = NULL;
157 goto out; 156 goto out;
@@ -160,7 +159,7 @@ static int __init configfs_init(void)
160 err = configfs_inode_init(); 159 err = configfs_inode_init();
161 if (err) { 160 if (err) {
162 unregister_filesystem(&configfs_fs_type); 161 unregister_filesystem(&configfs_fs_type);
163 subsystem_unregister(&config_subsys); 162 kobject_put(config_kobj);
164 kmem_cache_destroy(configfs_dir_cachep); 163 kmem_cache_destroy(configfs_dir_cachep);
165 configfs_dir_cachep = NULL; 164 configfs_dir_cachep = NULL;
166 } 165 }
@@ -171,7 +170,7 @@ out:
171static void __exit configfs_exit(void) 170static void __exit configfs_exit(void)
172{ 171{
173 unregister_filesystem(&configfs_fs_type); 172 unregister_filesystem(&configfs_fs_type);
174 subsystem_unregister(&config_subsys); 173 kobject_put(config_kobj);
175 kmem_cache_destroy(configfs_dir_cachep); 174 kmem_cache_destroy(configfs_dir_cachep);
176 configfs_dir_cachep = NULL; 175 configfs_dir_cachep = NULL;
177 configfs_inode_exit(); 176 configfs_inode_exit();
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6a713b33992f..d26e2826ba5b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -426,20 +426,19 @@ exit:
426} 426}
427EXPORT_SYMBOL_GPL(debugfs_rename); 427EXPORT_SYMBOL_GPL(debugfs_rename);
428 428
429static decl_subsys(debug, NULL, NULL); 429static struct kobject *debug_kobj;
430 430
431static int __init debugfs_init(void) 431static int __init debugfs_init(void)
432{ 432{
433 int retval; 433 int retval;
434 434
435 kobj_set_kset_s(&debug_subsys, kernel_subsys); 435 debug_kobj = kobject_create_and_add("debug", kernel_kobj);
436 retval = subsystem_register(&debug_subsys); 436 if (!debug_kobj)
437 if (retval) 437 return -EINVAL;
438 return retval;
439 438
440 retval = register_filesystem(&debug_fs_type); 439 retval = register_filesystem(&debug_fs_type);
441 if (retval) 440 if (retval)
442 subsystem_unregister(&debug_subsys); 441 kobject_put(debug_kobj);
443 return retval; 442 return retval;
444} 443}
445 444
@@ -447,7 +446,7 @@ static void __exit debugfs_exit(void)
447{ 446{
448 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 447 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
449 unregister_filesystem(&debug_fs_type); 448 unregister_filesystem(&debug_fs_type);
450 subsystem_unregister(&debug_subsys); 449 kobject_put(debug_kobj);
451} 450}
452 451
453core_initcall(debugfs_init); 452core_initcall(debugfs_init);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 46754553fdcc..ff97ba924333 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = allocate_direntry(ls, len); 52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
53 return de; 53 return de;
54} 54}
55 55
@@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls)
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, 62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list); 63 list);
64 list_del(&de->list); 64 list_del(&de->list);
65 free_direntry(de); 65 kfree(de);
66 } 66 }
67 spin_unlock(&ls->ls_recover_list_lock); 67 spin_unlock(&ls->ls_recover_list_lock);
68} 68}
@@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
171 } 171 }
172 172
173 list_del(&de->list); 173 list_del(&de->list);
174 free_direntry(de); 174 kfree(de);
175 out: 175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock); 176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177} 177}
@@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
302 302
303 write_unlock(&ls->ls_dirtbl[bucket].lock); 303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304 304
305 de = allocate_direntry(ls, namelen); 305 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
306 if (!de) 306 if (!de)
307 return -ENOMEM; 307 return -ENOMEM;
308 308
@@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
313 write_lock(&ls->ls_dirtbl[bucket].lock); 313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket); 314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) { 315 if (tmp) {
316 free_direntry(de); 316 kfree(de);
317 de = tmp; 317 de = tmp;
318 } else { 318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
@@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
329 return get_entry(ls, nodeid, name, namelen, r_nodeid); 329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330} 330}
331 331
332/* Copy the names of master rsb's into the buffer provided. 332static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
333 Only select names whose dir node is the given nodeid. */ 333{
334 struct dlm_rsb *r;
335
336 down_read(&ls->ls_root_sem);
337 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
338 if (len == r->res_length && !memcmp(name, r->res_name, len)) {
339 up_read(&ls->ls_root_sem);
340 return r;
341 }
342 }
343 up_read(&ls->ls_root_sem);
344 return NULL;
345}
346
347/* Find the rsb where we left off (or start again), then send rsb names
348 for rsb's we're master of and whose directory node matches the requesting
349 node. inbuf is the rsb name last sent, inlen is the name's length */
334 350
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 351void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid) 352 char *outbuf, int outlen, int nodeid)
337{ 353{
338 struct list_head *list; 354 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL; 355 struct dlm_rsb *r;
340 int offset = 0, start_namelen, error, dir_nodeid; 356 int offset = 0, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen; 357 uint16_t be_namelen;
343 358
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem); 359 down_read(&ls->ls_root_sem);
371 if (start_r) 360
372 list = start_r->res_root_list.next; 361 if (inlen > 1) {
373 else 362 r = find_rsb_root(ls, inbuf, inlen);
363 if (!r) {
364 inbuf[inlen - 1] = '\0';
365 log_error(ls, "copy_master_names from %d start %d %s",
366 nodeid, inlen, inbuf);
367 goto out;
368 }
369 list = r->res_root_list.next;
370 } else {
374 list = ls->ls_root_list.next; 371 list = ls->ls_root_list.next;
372 }
375 373
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) { 374 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list); 375 r = list_entry(list, struct dlm_rsb, res_root_list);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d2fc2384c3be..ec61bbaf25df 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
570 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; 570 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
571} 571}
572 572
573int dlm_netlink_init(void);
574void dlm_netlink_exit(void);
575void dlm_timeout_warn(struct dlm_lkb *lkb);
576
577#ifdef CONFIG_DLM_DEBUG
578int dlm_register_debugfs(void);
579void dlm_unregister_debugfs(void);
580int dlm_create_debug_file(struct dlm_ls *ls);
581void dlm_delete_debug_file(struct dlm_ls *ls);
582#else
583static inline int dlm_register_debugfs(void) { return 0; }
584static inline void dlm_unregister_debugfs(void) { }
585static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
586static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
587#endif
588
573#endif /* __DLM_INTERNAL_DOT_H__ */ 589#endif /* __DLM_INTERNAL_DOT_H__ */
574 590
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3915b8e14146..ff4a198fa677 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
88static int receive_extralen(struct dlm_message *ms); 88static int receive_extralen(struct dlm_message *ms);
89static void do_purge(struct dlm_ls *ls, int nodeid, int pid); 89static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
90static void del_timeout(struct dlm_lkb *lkb); 90static void del_timeout(struct dlm_lkb *lkb);
91void dlm_timeout_warn(struct dlm_lkb *lkb);
92 91
93/* 92/*
94 * Lock compatibilty matrix - thanks Steve 93 * Lock compatibilty matrix - thanks Steve
@@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
335{ 334{
336 struct dlm_rsb *r; 335 struct dlm_rsb *r;
337 336
338 r = allocate_rsb(ls, len); 337 r = dlm_allocate_rsb(ls, len);
339 if (!r) 338 if (!r)
340 return NULL; 339 return NULL;
341 340
@@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
478 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); 477 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
479 if (!error) { 478 if (!error) {
480 write_unlock(&ls->ls_rsbtbl[bucket].lock); 479 write_unlock(&ls->ls_rsbtbl[bucket].lock);
481 free_rsb(r); 480 dlm_free_rsb(r);
482 r = tmp; 481 r = tmp;
483 goto out; 482 goto out;
484 } 483 }
@@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
490 return error; 489 return error;
491} 490}
492 491
493int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
494 unsigned int flags, struct dlm_rsb **r_ret)
495{
496 return find_rsb(ls, name, namelen, flags, r_ret);
497}
498
499/* This is only called to add a reference when the code already holds 492/* This is only called to add a reference when the code already holds
500 a valid reference to the rsb, so there's no need for locking. */ 493 a valid reference to the rsb, so there's no need for locking. */
501 494
@@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref)
519 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); 512 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
520 r->res_toss_time = jiffies; 513 r->res_toss_time = jiffies;
521 if (r->res_lvbptr) { 514 if (r->res_lvbptr) {
522 free_lvb(r->res_lvbptr); 515 dlm_free_lvb(r->res_lvbptr);
523 r->res_lvbptr = NULL; 516 r->res_lvbptr = NULL;
524 } 517 }
525} 518}
@@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
589 uint32_t lkid = 0; 582 uint32_t lkid = 0;
590 uint16_t bucket; 583 uint16_t bucket;
591 584
592 lkb = allocate_lkb(ls); 585 lkb = dlm_allocate_lkb(ls);
593 if (!lkb) 586 if (!lkb)
594 return -ENOMEM; 587 return -ENOMEM;
595 588
@@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
683 676
684 /* for local/process lkbs, lvbptr points to caller's lksb */ 677 /* for local/process lkbs, lvbptr points to caller's lksb */
685 if (lkb->lkb_lvbptr && is_master_copy(lkb)) 678 if (lkb->lkb_lvbptr && is_master_copy(lkb))
686 free_lvb(lkb->lkb_lvbptr); 679 dlm_free_lvb(lkb->lkb_lvbptr);
687 free_lkb(lkb); 680 dlm_free_lkb(lkb);
688 return 1; 681 return 1;
689 } else { 682 } else {
690 write_unlock(&ls->ls_lkbtbl[bucket].lock); 683 write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
988 981
989 if (is_master(r)) 982 if (is_master(r))
990 dir_remove(r); 983 dir_remove(r);
991 free_rsb(r); 984 dlm_free_rsb(r);
992 count++; 985 count++;
993 } else { 986 } else {
994 write_unlock(&ls->ls_rsbtbl[b].lock); 987 write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1171 return; 1164 return;
1172 1165
1173 if (!r->res_lvbptr) 1166 if (!r->res_lvbptr)
1174 r->res_lvbptr = allocate_lvb(r->res_ls); 1167 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1175 1168
1176 if (!r->res_lvbptr) 1169 if (!r->res_lvbptr)
1177 return; 1170 return;
@@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1203 return; 1196 return;
1204 1197
1205 if (!r->res_lvbptr) 1198 if (!r->res_lvbptr)
1206 r->res_lvbptr = allocate_lvb(r->res_ls); 1199 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1207 1200
1208 if (!r->res_lvbptr) 1201 if (!r->res_lvbptr)
1209 return; 1202 return;
@@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1852static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) 1845static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1853{ 1846{
1854 struct dlm_ls *ls = r->res_ls; 1847 struct dlm_ls *ls = r->res_ls;
1855 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); 1848 int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1856 1849
1857 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { 1850 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1858 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); 1851 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1886 return 1; 1879 return 1;
1887 } 1880 }
1888 1881
1889 for (;;) { 1882 for (i = 0; i < 2; i++) {
1890 /* It's possible for dlm_scand to remove an old rsb for 1883 /* It's possible for dlm_scand to remove an old rsb for
1891 this same resource from the toss list, us to create 1884 this same resource from the toss list, us to create
1892 a new one, look up the master locally, and find it 1885 a new one, look up the master locally, and find it
@@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1900 log_debug(ls, "dir_lookup error %d %s", error, r->res_name); 1893 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1901 schedule(); 1894 schedule();
1902 } 1895 }
1896 if (error && error != -EEXIST)
1897 return error;
1903 1898
1904 if (ret_nodeid == our_nodeid) { 1899 if (ret_nodeid == our_nodeid) {
1905 r->res_first_lkid = 0; 1900 r->res_first_lkid = 0;
@@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
1941 break; 1936 break;
1942 1937
1943 case -EAGAIN: 1938 case -EAGAIN:
1944 /* the remote master didn't queue our NOQUEUE request; 1939 case -EBADR:
1945 make a waiting lkb the first_lkid */ 1940 case -ENOTBLK:
1941 /* the remote request failed and won't be retried (it was
1942 a NOQUEUE, or has been canceled/unlocked); make a waiting
1943 lkb the first_lkid */
1946 1944
1947 r->res_first_lkid = 0; 1945 r->res_first_lkid = 0;
1948 1946
@@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2108 /* an lkb may be waiting for an rsb lookup to complete where the 2106 /* an lkb may be waiting for an rsb lookup to complete where the
2109 lookup was initiated by another lock */ 2107 lookup was initiated by another lock */
2110 2108
2111 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { 2109 if (!list_empty(&lkb->lkb_rsb_lookup)) {
2112 if (!list_empty(&lkb->lkb_rsb_lookup)) { 2110 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
2113 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); 2111 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
2114 list_del_init(&lkb->lkb_rsb_lookup); 2112 list_del_init(&lkb->lkb_rsb_lookup);
2115 queue_cast(lkb->lkb_resource, lkb, 2113 queue_cast(lkb->lkb_resource, lkb,
2116 args->flags & DLM_LKF_CANCEL ? 2114 args->flags & DLM_LKF_CANCEL ?
2117 -DLM_ECANCEL : -DLM_EUNLOCK); 2115 -DLM_ECANCEL : -DLM_EUNLOCK);
2118 unhold_lkb(lkb); /* undoes create_lkb() */ 2116 unhold_lkb(lkb); /* undoes create_lkb() */
2119 rv = -EBUSY;
2120 goto out;
2121 } 2117 }
2118 /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
2119 rv = -EBUSY;
2120 goto out;
2122 } 2121 }
2123 2122
2124 /* cancel not allowed with another cancel/unlock in progress */ 2123 /* cancel not allowed with another cancel/unlock in progress */
@@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2986 2985
2987 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 2986 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2988 if (!lkb->lkb_lvbptr) 2987 if (!lkb->lkb_lvbptr)
2989 lkb->lkb_lvbptr = allocate_lvb(ls); 2988 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
2990 if (!lkb->lkb_lvbptr) 2989 if (!lkb->lkb_lvbptr)
2991 return -ENOMEM; 2990 return -ENOMEM;
2992 len = receive_extralen(ms); 2991 len = receive_extralen(ms);
@@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3006 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); 3005 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
3007 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); 3006 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
3008 3007
3009 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
3010
3011 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3008 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3012 /* lkb was just created so there won't be an lvb yet */ 3009 /* lkb was just created so there won't be an lvb yet */
3013 lkb->lkb_lvbptr = allocate_lvb(ls); 3010 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3014 if (!lkb->lkb_lvbptr) 3011 if (!lkb->lkb_lvbptr)
3015 return -ENOMEM; 3012 return -ENOMEM;
3016 } 3013 }
@@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3021static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3018static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3022 struct dlm_message *ms) 3019 struct dlm_message *ms)
3023{ 3020{
3024 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
3025 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
3026 lkb->lkb_nodeid, ms->m_header.h_nodeid,
3027 lkb->lkb_id, lkb->lkb_remid);
3028 return -EINVAL;
3029 }
3030
3031 if (!is_master_copy(lkb))
3032 return -EINVAL;
3033
3034 if (lkb->lkb_status != DLM_LKSTS_GRANTED) 3021 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
3035 return -EBUSY; 3022 return -EBUSY;
3036 3023
@@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3046static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3033static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3047 struct dlm_message *ms) 3034 struct dlm_message *ms)
3048{ 3035{
3049 if (!is_master_copy(lkb))
3050 return -EINVAL;
3051 if (receive_lvb(ls, lkb, ms)) 3036 if (receive_lvb(ls, lkb, ms))
3052 return -ENOMEM; 3037 return -ENOMEM;
3053 return 0; 3038 return 0;
@@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
3063 lkb->lkb_remid = ms->m_lkid; 3048 lkb->lkb_remid = ms->m_lkid;
3064} 3049}
3065 3050
3051/* This is called after the rsb is locked so that we can safely inspect
3052 fields in the lkb. */
3053
3054static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
3055{
3056 int from = ms->m_header.h_nodeid;
3057 int error = 0;
3058
3059 switch (ms->m_type) {
3060 case DLM_MSG_CONVERT:
3061 case DLM_MSG_UNLOCK:
3062 case DLM_MSG_CANCEL:
3063 if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
3064 error = -EINVAL;
3065 break;
3066
3067 case DLM_MSG_CONVERT_REPLY:
3068 case DLM_MSG_UNLOCK_REPLY:
3069 case DLM_MSG_CANCEL_REPLY:
3070 case DLM_MSG_GRANT:
3071 case DLM_MSG_BAST:
3072 if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
3073 error = -EINVAL;
3074 break;
3075
3076 case DLM_MSG_REQUEST_REPLY:
3077 if (!is_process_copy(lkb))
3078 error = -EINVAL;
3079 else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
3080 error = -EINVAL;
3081 break;
3082
3083 default:
3084 error = -EINVAL;
3085 }
3086
3087 if (error)
3088 log_error(lkb->lkb_resource->res_ls,
3089 "ignore invalid message %d from %d %x %x %x %d",
3090 ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
3091 lkb->lkb_flags, lkb->lkb_nodeid);
3092 return error;
3093}
3094
3066static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) 3095static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3067{ 3096{
3068 struct dlm_lkb *lkb; 3097 struct dlm_lkb *lkb;
@@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3124 hold_rsb(r); 3153 hold_rsb(r);
3125 lock_rsb(r); 3154 lock_rsb(r);
3126 3155
3156 error = validate_message(lkb, ms);
3157 if (error)
3158 goto out;
3159
3127 receive_flags(lkb, ms); 3160 receive_flags(lkb, ms);
3128 error = receive_convert_args(ls, lkb, ms); 3161 error = receive_convert_args(ls, lkb, ms);
3129 if (error) 3162 if (error)
3130 goto out; 3163 goto out_reply;
3131 reply = !down_conversion(lkb); 3164 reply = !down_conversion(lkb);
3132 3165
3133 error = do_convert(r, lkb); 3166 error = do_convert(r, lkb);
3134 out: 3167 out_reply:
3135 if (reply) 3168 if (reply)
3136 send_convert_reply(r, lkb, error); 3169 send_convert_reply(r, lkb, error);
3137 3170 out:
3138 unlock_rsb(r); 3171 unlock_rsb(r);
3139 put_rsb(r); 3172 put_rsb(r);
3140 dlm_put_lkb(lkb); 3173 dlm_put_lkb(lkb);
@@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3160 hold_rsb(r); 3193 hold_rsb(r);
3161 lock_rsb(r); 3194 lock_rsb(r);
3162 3195
3196 error = validate_message(lkb, ms);
3197 if (error)
3198 goto out;
3199
3163 receive_flags(lkb, ms); 3200 receive_flags(lkb, ms);
3164 error = receive_unlock_args(ls, lkb, ms); 3201 error = receive_unlock_args(ls, lkb, ms);
3165 if (error) 3202 if (error)
3166 goto out; 3203 goto out_reply;
3167 3204
3168 error = do_unlock(r, lkb); 3205 error = do_unlock(r, lkb);
3169 out: 3206 out_reply:
3170 send_unlock_reply(r, lkb, error); 3207 send_unlock_reply(r, lkb, error);
3171 3208 out:
3172 unlock_rsb(r); 3209 unlock_rsb(r);
3173 put_rsb(r); 3210 put_rsb(r);
3174 dlm_put_lkb(lkb); 3211 dlm_put_lkb(lkb);
@@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3196 hold_rsb(r); 3233 hold_rsb(r);
3197 lock_rsb(r); 3234 lock_rsb(r);
3198 3235
3236 error = validate_message(lkb, ms);
3237 if (error)
3238 goto out;
3239
3199 error = do_cancel(r, lkb); 3240 error = do_cancel(r, lkb);
3200 send_cancel_reply(r, lkb, error); 3241 send_cancel_reply(r, lkb, error);
3201 3242 out:
3202 unlock_rsb(r); 3243 unlock_rsb(r);
3203 put_rsb(r); 3244 put_rsb(r);
3204 dlm_put_lkb(lkb); 3245 dlm_put_lkb(lkb);
@@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
3217 3258
3218 error = find_lkb(ls, ms->m_remid, &lkb); 3259 error = find_lkb(ls, ms->m_remid, &lkb);
3219 if (error) { 3260 if (error) {
3220 log_error(ls, "receive_grant no lkb"); 3261 log_debug(ls, "receive_grant from %d no lkb %x",
3262 ms->m_header.h_nodeid, ms->m_remid);
3221 return; 3263 return;
3222 } 3264 }
3223 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3224 3265
3225 r = lkb->lkb_resource; 3266 r = lkb->lkb_resource;
3226 3267
3227 hold_rsb(r); 3268 hold_rsb(r);
3228 lock_rsb(r); 3269 lock_rsb(r);
3229 3270
3271 error = validate_message(lkb, ms);
3272 if (error)
3273 goto out;
3274
3230 receive_flags_reply(lkb, ms); 3275 receive_flags_reply(lkb, ms);
3231 if (is_altmode(lkb)) 3276 if (is_altmode(lkb))
3232 munge_altmode(lkb, ms); 3277 munge_altmode(lkb, ms);
3233 grant_lock_pc(r, lkb, ms); 3278 grant_lock_pc(r, lkb, ms);
3234 queue_cast(r, lkb, 0); 3279 queue_cast(r, lkb, 0);
3235 3280 out:
3236 unlock_rsb(r); 3281 unlock_rsb(r);
3237 put_rsb(r); 3282 put_rsb(r);
3238 dlm_put_lkb(lkb); 3283 dlm_put_lkb(lkb);
@@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
3246 3291
3247 error = find_lkb(ls, ms->m_remid, &lkb); 3292 error = find_lkb(ls, ms->m_remid, &lkb);
3248 if (error) { 3293 if (error) {
3249 log_error(ls, "receive_bast no lkb"); 3294 log_debug(ls, "receive_bast from %d no lkb %x",
3295 ms->m_header.h_nodeid, ms->m_remid);
3250 return; 3296 return;
3251 } 3297 }
3252 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3253 3298
3254 r = lkb->lkb_resource; 3299 r = lkb->lkb_resource;
3255 3300
3256 hold_rsb(r); 3301 hold_rsb(r);
3257 lock_rsb(r); 3302 lock_rsb(r);
3258 3303
3259 queue_bast(r, lkb, ms->m_bastmode); 3304 error = validate_message(lkb, ms);
3305 if (error)
3306 goto out;
3260 3307
3308 queue_bast(r, lkb, ms->m_bastmode);
3309 out:
3261 unlock_rsb(r); 3310 unlock_rsb(r);
3262 put_rsb(r); 3311 put_rsb(r);
3263 dlm_put_lkb(lkb); 3312 dlm_put_lkb(lkb);
@@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3323 3372
3324 error = find_lkb(ls, ms->m_remid, &lkb); 3373 error = find_lkb(ls, ms->m_remid, &lkb);
3325 if (error) { 3374 if (error) {
3326 log_error(ls, "receive_request_reply no lkb"); 3375 log_debug(ls, "receive_request_reply from %d no lkb %x",
3376 ms->m_header.h_nodeid, ms->m_remid);
3327 return; 3377 return;
3328 } 3378 }
3329 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3330 3379
3331 r = lkb->lkb_resource; 3380 r = lkb->lkb_resource;
3332 hold_rsb(r); 3381 hold_rsb(r);
3333 lock_rsb(r); 3382 lock_rsb(r);
3334 3383
3384 error = validate_message(lkb, ms);
3385 if (error)
3386 goto out;
3387
3335 mstype = lkb->lkb_wait_type; 3388 mstype = lkb->lkb_wait_type;
3336 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); 3389 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
3337 if (error) 3390 if (error)
@@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3383 if (is_overlap(lkb)) { 3436 if (is_overlap(lkb)) {
3384 /* we'll ignore error in cancel/unlock reply */ 3437 /* we'll ignore error in cancel/unlock reply */
3385 queue_cast_overlap(r, lkb); 3438 queue_cast_overlap(r, lkb);
3439 confirm_master(r, result);
3386 unhold_lkb(lkb); /* undoes create_lkb() */ 3440 unhold_lkb(lkb); /* undoes create_lkb() */
3387 } else 3441 } else
3388 _request_lock(r, lkb); 3442 _request_lock(r, lkb);
@@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3463 hold_rsb(r); 3517 hold_rsb(r);
3464 lock_rsb(r); 3518 lock_rsb(r);
3465 3519
3520 error = validate_message(lkb, ms);
3521 if (error)
3522 goto out;
3523
3466 /* stub reply can happen with waiters_mutex held */ 3524 /* stub reply can happen with waiters_mutex held */
3467 error = remove_from_waiters_ms(lkb, ms); 3525 error = remove_from_waiters_ms(lkb, ms);
3468 if (error) 3526 if (error)
@@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
3481 3539
3482 error = find_lkb(ls, ms->m_remid, &lkb); 3540 error = find_lkb(ls, ms->m_remid, &lkb);
3483 if (error) { 3541 if (error) {
3484 log_error(ls, "receive_convert_reply no lkb"); 3542 log_debug(ls, "receive_convert_reply from %d no lkb %x",
3543 ms->m_header.h_nodeid, ms->m_remid);
3485 return; 3544 return;
3486 } 3545 }
3487 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3488 3546
3489 _receive_convert_reply(lkb, ms); 3547 _receive_convert_reply(lkb, ms);
3490 dlm_put_lkb(lkb); 3548 dlm_put_lkb(lkb);
@@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3498 hold_rsb(r); 3556 hold_rsb(r);
3499 lock_rsb(r); 3557 lock_rsb(r);
3500 3558
3559 error = validate_message(lkb, ms);
3560 if (error)
3561 goto out;
3562
3501 /* stub reply can happen with waiters_mutex held */ 3563 /* stub reply can happen with waiters_mutex held */
3502 error = remove_from_waiters_ms(lkb, ms); 3564 error = remove_from_waiters_ms(lkb, ms);
3503 if (error) 3565 if (error)
@@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
3529 3591
3530 error = find_lkb(ls, ms->m_remid, &lkb); 3592 error = find_lkb(ls, ms->m_remid, &lkb);
3531 if (error) { 3593 if (error) {
3532 log_error(ls, "receive_unlock_reply no lkb"); 3594 log_debug(ls, "receive_unlock_reply from %d no lkb %x",
3595 ms->m_header.h_nodeid, ms->m_remid);
3533 return; 3596 return;
3534 } 3597 }
3535 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3536 3598
3537 _receive_unlock_reply(lkb, ms); 3599 _receive_unlock_reply(lkb, ms);
3538 dlm_put_lkb(lkb); 3600 dlm_put_lkb(lkb);
@@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3546 hold_rsb(r); 3608 hold_rsb(r);
3547 lock_rsb(r); 3609 lock_rsb(r);
3548 3610
3611 error = validate_message(lkb, ms);
3612 if (error)
3613 goto out;
3614
3549 /* stub reply can happen with waiters_mutex held */ 3615 /* stub reply can happen with waiters_mutex held */
3550 error = remove_from_waiters_ms(lkb, ms); 3616 error = remove_from_waiters_ms(lkb, ms);
3551 if (error) 3617 if (error)
@@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
3577 3643
3578 error = find_lkb(ls, ms->m_remid, &lkb); 3644 error = find_lkb(ls, ms->m_remid, &lkb);
3579 if (error) { 3645 if (error) {
3580 log_error(ls, "receive_cancel_reply no lkb"); 3646 log_debug(ls, "receive_cancel_reply from %d no lkb %x",
3647 ms->m_header.h_nodeid, ms->m_remid);
3581 return; 3648 return;
3582 } 3649 }
3583 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3584 3650
3585 _receive_cancel_reply(lkb, ms); 3651 _receive_cancel_reply(lkb, ms);
3586 dlm_put_lkb(lkb); 3652 dlm_put_lkb(lkb);
@@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3640 3706
3641static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) 3707static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
3642{ 3708{
3709 if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
3710 log_debug(ls, "ignore non-member message %d from %d %x %x %d",
3711 ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
3712 ms->m_remid, ms->m_result);
3713 return;
3714 }
3715
3643 switch (ms->m_type) { 3716 switch (ms->m_type) {
3644 3717
3645 /* messages sent to a master node */ 3718 /* messages sent to a master node */
@@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
3778 3851
3779 ls = dlm_find_lockspace_global(hd->h_lockspace); 3852 ls = dlm_find_lockspace_global(hd->h_lockspace);
3780 if (!ls) { 3853 if (!ls) {
3781 log_print("invalid h_lockspace %x from %d cmd %d type %d", 3854 if (dlm_config.ci_log_debug)
3782 hd->h_lockspace, nodeid, hd->h_cmd, type); 3855 log_print("invalid lockspace %x from %d cmd %d type %d",
3856 hd->h_lockspace, nodeid, hd->h_cmd, type);
3783 3857
3784 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) 3858 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
3785 dlm_send_ls_not_ready(nodeid, rc); 3859 dlm_send_ls_not_ready(nodeid, rc);
@@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3806 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 3880 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
3807 ls->ls_stub_ms.m_result = -EINPROGRESS; 3881 ls->ls_stub_ms.m_result = -EINPROGRESS;
3808 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3882 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3883 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3809 _receive_convert_reply(lkb, &ls->ls_stub_ms); 3884 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3810 3885
3811 /* Same special case as in receive_rcom_lock_args() */ 3886 /* Same special case as in receive_rcom_lock_args() */
@@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3847void dlm_recover_waiters_pre(struct dlm_ls *ls) 3922void dlm_recover_waiters_pre(struct dlm_ls *ls)
3848{ 3923{
3849 struct dlm_lkb *lkb, *safe; 3924 struct dlm_lkb *lkb, *safe;
3925 int wait_type, stub_unlock_result, stub_cancel_result;
3850 3926
3851 mutex_lock(&ls->ls_waiters_mutex); 3927 mutex_lock(&ls->ls_waiters_mutex);
3852 3928
@@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3865 if (!waiter_needs_recovery(ls, lkb)) 3941 if (!waiter_needs_recovery(ls, lkb))
3866 continue; 3942 continue;
3867 3943
3868 switch (lkb->lkb_wait_type) { 3944 wait_type = lkb->lkb_wait_type;
3945 stub_unlock_result = -DLM_EUNLOCK;
3946 stub_cancel_result = -DLM_ECANCEL;
3947
3948 /* Main reply may have been received leaving a zero wait_type,
3949 but a reply for the overlapping op may not have been
3950 received. In that case we need to fake the appropriate
3951 reply for the overlap op. */
3952
3953 if (!wait_type) {
3954 if (is_overlap_cancel(lkb)) {
3955 wait_type = DLM_MSG_CANCEL;
3956 if (lkb->lkb_grmode == DLM_LOCK_IV)
3957 stub_cancel_result = 0;
3958 }
3959 if (is_overlap_unlock(lkb)) {
3960 wait_type = DLM_MSG_UNLOCK;
3961 if (lkb->lkb_grmode == DLM_LOCK_IV)
3962 stub_unlock_result = -ENOENT;
3963 }
3964
3965 log_debug(ls, "rwpre overlap %x %x %d %d %d",
3966 lkb->lkb_id, lkb->lkb_flags, wait_type,
3967 stub_cancel_result, stub_unlock_result);
3968 }
3969
3970 switch (wait_type) {
3869 3971
3870 case DLM_MSG_REQUEST: 3972 case DLM_MSG_REQUEST:
3871 lkb->lkb_flags |= DLM_IFL_RESEND; 3973 lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3878 case DLM_MSG_UNLOCK: 3980 case DLM_MSG_UNLOCK:
3879 hold_lkb(lkb); 3981 hold_lkb(lkb);
3880 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; 3982 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
3881 ls->ls_stub_ms.m_result = -DLM_EUNLOCK; 3983 ls->ls_stub_ms.m_result = stub_unlock_result;
3882 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3984 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3985 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3883 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 3986 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3884 dlm_put_lkb(lkb); 3987 dlm_put_lkb(lkb);
3885 break; 3988 break;
@@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3887 case DLM_MSG_CANCEL: 3990 case DLM_MSG_CANCEL:
3888 hold_lkb(lkb); 3991 hold_lkb(lkb);
3889 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; 3992 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
3890 ls->ls_stub_ms.m_result = -DLM_ECANCEL; 3993 ls->ls_stub_ms.m_result = stub_cancel_result;
3891 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3994 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3995 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3892 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 3996 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3893 dlm_put_lkb(lkb); 3997 dlm_put_lkb(lkb);
3894 break; 3998 break;
3895 3999
3896 default: 4000 default:
3897 log_error(ls, "invalid lkb wait_type %d", 4001 log_error(ls, "invalid lkb wait_type %d %d",
3898 lkb->lkb_wait_type); 4002 lkb->lkb_wait_type, wait_type);
3899 } 4003 }
3900 schedule(); 4004 schedule();
3901 } 4005 }
@@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4184 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); 4288 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
4185 4289
4186 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4290 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4187 lkb->lkb_lvbptr = allocate_lvb(ls); 4291 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
4188 if (!lkb->lkb_lvbptr) 4292 if (!lkb->lkb_lvbptr)
4189 return -ENOMEM; 4293 return -ENOMEM;
4190 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4294 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
4259 put_rsb(r); 4363 put_rsb(r);
4260 out: 4364 out:
4261 if (error) 4365 if (error)
4262 log_print("recover_master_copy %d %x", error, rl->rl_lkid); 4366 log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
4263 rl->rl_result = error; 4367 rl->rl_result = error;
4264 return error; 4368 return error;
4265} 4369}
@@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4342 } 4446 }
4343 } 4447 }
4344 4448
4345 /* After ua is attached to lkb it will be freed by free_lkb(). 4449 /* After ua is attached to lkb it will be freed by dlm_free_lkb().
4346 When DLM_IFL_USER is set, the dlm knows that this is a userspace 4450 When DLM_IFL_USER is set, the dlm knows that this is a userspace
4347 lock and that lkb_astparam is the dlm_user_args structure. */ 4451 lock and that lkb_astparam is the dlm_user_args structure. */
4348 4452
@@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4679 } 4783 }
4680 4784
4681 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4785 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4786 lkb->lkb_ast_type = 0;
4682 list_del(&lkb->lkb_astqueue); 4787 list_del(&lkb->lkb_astqueue);
4683 dlm_put_lkb(lkb); 4788 dlm_put_lkb(lkb);
4684 } 4789 }
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ada04680a1e5..27b6ed302911 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb);
19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); 19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
20void dlm_receive_buffer(struct dlm_header *hd, int nodeid); 20void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
21int dlm_modes_compat(int mode1, int mode2); 21int dlm_modes_compat(int mode1, int mode2);
22int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
23 unsigned int flags, struct dlm_rsb **r_ret);
24void dlm_put_rsb(struct dlm_rsb *r); 22void dlm_put_rsb(struct dlm_rsb *r);
25void dlm_hold_rsb(struct dlm_rsb *r); 23void dlm_hold_rsb(struct dlm_rsb *r);
26int dlm_put_lkb(struct dlm_lkb *lkb); 24int dlm_put_lkb(struct dlm_lkb *lkb);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 6353a8384520..b180fdc51085 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -24,14 +24,6 @@
24#include "recover.h" 24#include "recover.h"
25#include "requestqueue.h" 25#include "requestqueue.h"
26 26
27#ifdef CONFIG_DLM_DEBUG
28int dlm_create_debug_file(struct dlm_ls *ls);
29void dlm_delete_debug_file(struct dlm_ls *ls);
30#else
31static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
32static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
33#endif
34
35static int ls_count; 27static int ls_count;
36static struct mutex ls_lock; 28static struct mutex ls_lock;
37static struct list_head lslist; 29static struct list_head lslist;
@@ -166,26 +158,7 @@ static struct kobj_type dlm_ktype = {
166 .release = lockspace_kobj_release, 158 .release = lockspace_kobj_release,
167}; 159};
168 160
169static struct kset dlm_kset = { 161static struct kset *dlm_kset;
170 .ktype = &dlm_ktype,
171};
172
173static int kobject_setup(struct dlm_ls *ls)
174{
175 char lsname[DLM_LOCKSPACE_LEN];
176 int error;
177
178 memset(lsname, 0, DLM_LOCKSPACE_LEN);
179 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
180
181 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
182 if (error)
183 return error;
184
185 ls->ls_kobj.kset = &dlm_kset;
186 ls->ls_kobj.ktype = &dlm_ktype;
187 return 0;
188}
189 162
190static int do_uevent(struct dlm_ls *ls, int in) 163static int do_uevent(struct dlm_ls *ls, int in)
191{ 164{
@@ -220,24 +193,22 @@ static int do_uevent(struct dlm_ls *ls, int in)
220 193
221int dlm_lockspace_init(void) 194int dlm_lockspace_init(void)
222{ 195{
223 int error;
224
225 ls_count = 0; 196 ls_count = 0;
226 mutex_init(&ls_lock); 197 mutex_init(&ls_lock);
227 INIT_LIST_HEAD(&lslist); 198 INIT_LIST_HEAD(&lslist);
228 spin_lock_init(&lslist_lock); 199 spin_lock_init(&lslist_lock);
229 200
230 kobject_set_name(&dlm_kset.kobj, "dlm"); 201 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
231 kobj_set_kset_s(&dlm_kset, kernel_subsys); 202 if (!dlm_kset) {
232 error = kset_register(&dlm_kset); 203 printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
233 if (error) 204 return -ENOMEM;
234 printk("dlm_lockspace_init: cannot register kset %d\n", error); 205 }
235 return error; 206 return 0;
236} 207}
237 208
238void dlm_lockspace_exit(void) 209void dlm_lockspace_exit(void)
239{ 210{
240 kset_unregister(&dlm_kset); 211 kset_unregister(dlm_kset);
241} 212}
242 213
243static int dlm_scand(void *data) 214static int dlm_scand(void *data)
@@ -549,13 +520,12 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
549 goto out_delist; 520 goto out_delist;
550 } 521 }
551 522
552 error = kobject_setup(ls); 523 ls->ls_kobj.kset = dlm_kset;
553 if (error) 524 error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
554 goto out_stop; 525 "%s", ls->ls_name);
555
556 error = kobject_register(&ls->ls_kobj);
557 if (error) 526 if (error)
558 goto out_stop; 527 goto out_stop;
528 kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
559 529
560 /* let kobject handle freeing of ls if there's an error */ 530 /* let kobject handle freeing of ls if there's an error */
561 do_unreg = 1; 531 do_unreg = 1;
@@ -601,7 +571,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
601 kfree(ls->ls_rsbtbl); 571 kfree(ls->ls_rsbtbl);
602 out_lsfree: 572 out_lsfree:
603 if (do_unreg) 573 if (do_unreg)
604 kobject_unregister(&ls->ls_kobj); 574 kobject_put(&ls->ls_kobj);
605 else 575 else
606 kfree(ls); 576 kfree(ls);
607 out: 577 out:
@@ -706,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
706 dlm_del_ast(lkb); 676 dlm_del_ast(lkb);
707 677
708 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) 678 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
709 free_lvb(lkb->lkb_lvbptr); 679 dlm_free_lvb(lkb->lkb_lvbptr);
710 680
711 free_lkb(lkb); 681 dlm_free_lkb(lkb);
712 } 682 }
713 } 683 }
714 dlm_astd_resume(); 684 dlm_astd_resume();
@@ -726,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
726 res_hashchain); 696 res_hashchain);
727 697
728 list_del(&rsb->res_hashchain); 698 list_del(&rsb->res_hashchain);
729 free_rsb(rsb); 699 dlm_free_rsb(rsb);
730 } 700 }
731 701
732 head = &ls->ls_rsbtbl[i].toss; 702 head = &ls->ls_rsbtbl[i].toss;
@@ -734,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
734 rsb = list_entry(head->next, struct dlm_rsb, 704 rsb = list_entry(head->next, struct dlm_rsb,
735 res_hashchain); 705 res_hashchain);
736 list_del(&rsb->res_hashchain); 706 list_del(&rsb->res_hashchain);
737 free_rsb(rsb); 707 dlm_free_rsb(rsb);
738 } 708 }
739 } 709 }
740 710
@@ -750,7 +720,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
750 dlm_clear_members(ls); 720 dlm_clear_members(ls);
751 dlm_clear_members_gone(ls); 721 dlm_clear_members_gone(ls);
752 kfree(ls->ls_node_array); 722 kfree(ls->ls_node_array);
753 kobject_unregister(&ls->ls_kobj); 723 kobject_put(&ls->ls_kobj);
754 /* The ls structure will be freed when the kobject is done with */ 724 /* The ls structure will be freed when the kobject is done with */
755 725
756 mutex_lock(&ls_lock); 726 mutex_lock(&ls_lock);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e9923ca9c2d9..7c1e5e5cccd8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con)
864static void tcp_connect_to_sock(struct connection *con) 864static void tcp_connect_to_sock(struct connection *con)
865{ 865{
866 int result = -EHOSTUNREACH; 866 int result = -EHOSTUNREACH;
867 struct sockaddr_storage saddr; 867 struct sockaddr_storage saddr, src_addr;
868 int addr_len; 868 int addr_len;
869 struct socket *sock; 869 struct socket *sock;
870 870
@@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con)
898 con->connect_action = tcp_connect_to_sock; 898 con->connect_action = tcp_connect_to_sock;
899 add_sock(sock, con); 899 add_sock(sock, con);
900 900
901 /* Bind to our cluster-known address connecting to avoid
902 routing problems */
903 memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
904 make_sockaddr(&src_addr, 0, &addr_len);
905 result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
906 addr_len);
907 if (result < 0) {
908 log_print("could not bind for connect: %d", result);
909 /* This *may* not indicate a critical error */
910 }
911
901 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 912 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
902 913
903 log_print("connecting to %d", con->nodeid); 914 log_print("connecting to %d", con->nodeid);
@@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void)
1426 con = __nodeid2con(i, 0); 1437 con = __nodeid2con(i, 0);
1427 if (con) { 1438 if (con) {
1428 close_connection(con, true); 1439 close_connection(con, true);
1440 if (con->othercon)
1441 kmem_cache_free(con_cache, con->othercon);
1429 kmem_cache_free(con_cache, con); 1442 kmem_cache_free(con_cache, con);
1430 } 1443 }
1431 } 1444 }
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index eca2907f2386..58487fb95a4c 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -18,16 +18,6 @@
18#include "memory.h" 18#include "memory.h"
19#include "config.h" 19#include "config.h"
20 20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28int dlm_netlink_init(void);
29void dlm_netlink_exit(void);
30
31static int __init init_dlm(void) 21static int __init init_dlm(void)
32{ 22{
33 int error; 23 int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index e9cdcab306e2..fa17f5a27883 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70 ls->ls_num_nodes--; 70 ls->ls_num_nodes--;
71} 71}
72 72
73static int dlm_is_member(struct dlm_ls *ls, int nodeid) 73int dlm_is_member(struct dlm_ls *ls, int nodeid)
74{ 74{
75 struct dlm_member *memb; 75 struct dlm_member *memb;
76 76
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 927c08c19214..7a26fca1e0b5 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls); 19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); 20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid); 21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22int dlm_is_member(struct dlm_ls *ls, int nodeid);
22 23
23#endif /* __MEMBER_DOT_H__ */ 24#endif /* __MEMBER_DOT_H__ */
24 25
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ecf0e5cb2035..f7783867491a 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@ void dlm_memory_exit(void)
35 kmem_cache_destroy(lkb_cache); 35 kmem_cache_destroy(lkb_cache);
36} 36}
37 37
38char *allocate_lvb(struct dlm_ls *ls) 38char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
@@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls)
43 return p; 43 return p;
44} 44}
45 45
46void free_lvb(char *p) 46void dlm_free_lvb(char *p)
47{ 47{
48 kfree(p); 48 kfree(p);
49} 49}
@@ -51,7 +51,7 @@ void free_lvb(char *p)
51/* FIXME: have some minimal space built-in to rsb for the name and 51/* FIXME: have some minimal space built-in to rsb for the name and
52 kmalloc a separate name if needed, like dentries are done */ 52 kmalloc a separate name if needed, like dentries are done */
53 53
54struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) 54struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
55{ 55{
56 struct dlm_rsb *r; 56 struct dlm_rsb *r;
57 57
@@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
61 return r; 61 return r;
62} 62}
63 63
64void free_rsb(struct dlm_rsb *r) 64void dlm_free_rsb(struct dlm_rsb *r)
65{ 65{
66 if (r->res_lvbptr) 66 if (r->res_lvbptr)
67 free_lvb(r->res_lvbptr); 67 dlm_free_lvb(r->res_lvbptr);
68 kfree(r); 68 kfree(r);
69} 69}
70 70
71struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) 71struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
@@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76 return lkb; 76 return lkb;
77} 77}
78 78
79void free_lkb(struct dlm_lkb *lkb) 79void dlm_free_lkb(struct dlm_lkb *lkb)
80{ 80{
81 if (lkb->lkb_flags & DLM_IFL_USER) { 81 if (lkb->lkb_flags & DLM_IFL_USER) {
82 struct dlm_user_args *ua; 82 struct dlm_user_args *ua;
@@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb)
90 kmem_cache_free(lkb_cache, lkb); 90 kmem_cache_free(lkb_cache, lkb);
91} 91}
92 92
93struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
94{
95 struct dlm_direntry *de;
96
97 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
98 printk("namelen = %d\n", namelen););
99
100 de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
101 return de;
102}
103
104void free_direntry(struct dlm_direntry *de)
105{
106 kfree(de);
107}
108
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 6ead158ccc5c..485fb29143bd 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -16,14 +16,12 @@
16 16
17int dlm_memory_init(void); 17int dlm_memory_init(void);
18void dlm_memory_exit(void); 18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); 19struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r); 20void dlm_free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); 21struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l); 22void dlm_free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); 23char *dlm_allocate_lvb(struct dlm_ls *ls);
24void free_direntry(struct dlm_direntry *de); 24void dlm_free_lvb(char *l);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27 25
28#endif /* __MEMORY_DOT_H__ */ 26#endif /* __MEMORY_DOT_H__ */
29 27
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f8c69dda16a0..e69926e984db 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset,
58int dlm_process_incoming_buffer(int nodeid, const void *base, 58int dlm_process_incoming_buffer(int nodeid, const void *base,
59 unsigned offset, unsigned len, unsigned limit) 59 unsigned offset, unsigned len, unsigned limit)
60{ 60{
61 unsigned char __tmp[DLM_INBUF_LEN]; 61 union {
62 struct dlm_header *msg = (struct dlm_header *) __tmp; 62 unsigned char __buf[DLM_INBUF_LEN];
63 /* this is to force proper alignment on some arches */
64 struct dlm_header dlm;
65 } __tmp;
66 struct dlm_header *msg = &__tmp.dlm;
63 int ret = 0; 67 int ret = 0;
64 int err = 0; 68 int err = 0;
65 uint16_t msglen; 69 uint16_t msglen;
@@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
100 in the buffer on the stack (which should work for most 104 in the buffer on the stack (which should work for most
101 ordinary messages). */ 105 ordinary messages). */
102 106
103 if (msglen > sizeof(__tmp) && 107 if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
104 msg == (struct dlm_header *) __tmp) {
105 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 108 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
106 if (msg == NULL) 109 if (msg == NULL)
107 return ret; 110 return ret;
@@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
119 dlm_receive_buffer(msg, nodeid); 122 dlm_receive_buffer(msg, nodeid);
120 } 123 }
121 124
122 if (msg != (struct dlm_header *) __tmp) 125 if (msg != &__tmp.dlm)
123 kfree(msg); 126 kfree(msg);
124 127
125 return err ? err : ret; 128 return err ? err : ret;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ae2fd97fa4ad..026824cd3acb 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
197 spin_unlock(&ls->ls_rcom_spin); 197 spin_unlock(&ls->ls_rcom_spin);
198} 198}
199 199
200static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
201{
202 receive_sync_reply(ls, rc_in);
203}
204
205int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) 200int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
206{ 201{
207 struct dlm_rcom *rc; 202 struct dlm_rcom *rc;
@@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
254 send_rcom(ls, mh, rc); 249 send_rcom(ls, mh, rc);
255} 250}
256 251
257static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
258{
259 receive_sync_reply(ls, rc_in);
260}
261
262int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) 252int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
263{ 253{
264 struct dlm_rcom *rc; 254 struct dlm_rcom *rc;
@@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
381 send_rcom(ls, mh, rc); 371 send_rcom(ls, mh, rc);
382} 372}
383 373
384static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
385{
386 dlm_recover_process_copy(ls, rc_in);
387}
388
389/* If the lockspace doesn't exist then still send a status message 374/* If the lockspace doesn't exist then still send a status message
390 back; it's possible that it just doesn't have its global_id yet. */ 375 back; it's possible that it just doesn't have its global_id yet. */
391 376
@@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
481 break; 466 break;
482 467
483 case DLM_RCOM_STATUS_REPLY: 468 case DLM_RCOM_STATUS_REPLY:
484 receive_rcom_status_reply(ls, rc); 469 receive_sync_reply(ls, rc);
485 break; 470 break;
486 471
487 case DLM_RCOM_NAMES_REPLY: 472 case DLM_RCOM_NAMES_REPLY:
488 receive_rcom_names_reply(ls, rc); 473 receive_sync_reply(ls, rc);
489 break; 474 break;
490 475
491 case DLM_RCOM_LOOKUP_REPLY: 476 case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
493 break; 478 break;
494 479
495 case DLM_RCOM_LOCK_REPLY: 480 case DLM_RCOM_LOCK_REPLY:
496 receive_rcom_lock_reply(ls, rc); 481 dlm_recover_process_copy(ls, rc);
497 break; 482 break;
498 483
499 default: 484 default:
500 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); 485 log_error(ls, "receive_rcom bad type %d", rc->rc_type);
501 } 486 }
502 out: 487 out:
503 return; 488 return;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index c2cc7694cd16..df075dc300fa 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r)
629 goto out; 629 goto out;
630 630
631 if (!r->res_lvbptr) { 631 if (!r->res_lvbptr) {
632 r->res_lvbptr = allocate_lvb(r->res_ls); 632 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
633 if (!r->res_lvbptr) 633 if (!r->res_lvbptr)
634 goto out; 634 goto out;
635 } 635 }
@@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls)
731 list_add(&r->res_root_list, &ls->ls_root_list); 731 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 732 dlm_hold_rsb(r);
733 } 733 }
734
735 /* If we're using a directory, add tossed rsbs to the root
736 list; they'll have entries created in the new directory,
737 but no other recovery steps should do anything with them. */
738
739 if (dlm_no_directory(ls)) {
740 read_unlock(&ls->ls_rsbtbl[i].lock);
741 continue;
742 }
743
744 list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
745 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r);
747 }
734 read_unlock(&ls->ls_rsbtbl[i].lock); 748 read_unlock(&ls->ls_rsbtbl[i].lock);
735 } 749 }
736 out: 750 out:
@@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls)
750 up_write(&ls->ls_root_sem); 764 up_write(&ls->ls_root_sem);
751} 765}
752 766
767/* If not using a directory, clear the entire toss list, there's no benefit to
768 caching the master value since it's fixed. If we are using a dir, keep the
769 rsb's we're the master of. Recovery will add them to the root list and from
770 there they'll be entered in the rebuilt directory. */
771
753void dlm_clear_toss_list(struct dlm_ls *ls) 772void dlm_clear_toss_list(struct dlm_ls *ls)
754{ 773{
755 struct dlm_rsb *r, *safe; 774 struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
759 write_lock(&ls->ls_rsbtbl[i].lock); 778 write_lock(&ls->ls_rsbtbl[i].lock);
760 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
761 res_hashchain) { 780 res_hashchain) {
762 list_del(&r->res_hashchain); 781 if (dlm_no_directory(ls) || !is_master(r)) {
763 free_rsb(r); 782 list_del(&r->res_hashchain);
783 dlm_free_rsb(r);
784 }
764 } 785 }
765 write_unlock(&ls->ls_rsbtbl[i].lock); 786 write_unlock(&ls->ls_rsbtbl[i].lock);
766 } 787 }
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4b89e20eebe7..997f9531d594 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
67 dlm_astd_resume(); 67 dlm_astd_resume();
68 68
69 /* 69 /*
70 * This list of root rsb's will be the basis of most of the recovery 70 * Free non-master tossed rsb's. Master rsb's are kept on toss
71 * routines. 71 * list and put on root list to be included in resdir recovery.
72 */ 72 */
73 73
74 dlm_create_root_list(ls); 74 dlm_clear_toss_list(ls);
75 75
76 /* 76 /*
77 * Free all the tossed rsb's so we don't have to recover them. 77 * This list of root rsb's will be the basis of most of the recovery
78 * routines.
78 */ 79 */
79 80
80 dlm_clear_toss_list(ls); 81 dlm_create_root_list(ls);
81 82
82 /* 83 /*
83 * Add or remove nodes from the lockspace's ls_nodes list. 84 * Add or remove nodes from the lockspace's ls_nodes list.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 4f741546f4bb..7cbc6826239b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,8 +24,7 @@
24#include "lvb_table.h" 24#include "lvb_table.h"
25#include "user.h" 25#include "user.h"
26 26
27static const char *name_prefix="dlm"; 27static const char name_prefix[] = "dlm";
28static struct miscdevice ctl_device;
29static const struct file_operations device_fops; 28static const struct file_operations device_fops;
30 29
31#ifdef CONFIG_COMPAT 30#ifdef CONFIG_COMPAT
@@ -82,7 +81,8 @@ struct dlm_lock_result32 {
82}; 81};
83 82
84static void compat_input(struct dlm_write_request *kb, 83static void compat_input(struct dlm_write_request *kb,
85 struct dlm_write_request32 *kb32) 84 struct dlm_write_request32 *kb32,
85 int max_namelen)
86{ 86{
87 kb->version[0] = kb32->version[0]; 87 kb->version[0] = kb32->version[0];
88 kb->version[1] = kb32->version[1]; 88 kb->version[1] = kb32->version[1];
@@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb,
112 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; 112 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
113 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; 113 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
114 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); 114 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
115 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); 115 if (kb->i.lock.namelen <= max_namelen)
116 memcpy(kb->i.lock.name, kb32->i.lock.name,
117 kb->i.lock.namelen);
118 else
119 kb->i.lock.namelen = max_namelen;
116 } 120 }
117} 121}
118 122
@@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
236 spin_unlock(&proc->asts_spin); 240 spin_unlock(&proc->asts_spin);
237 241
238 if (eol) { 242 if (eol) {
239 spin_lock(&ua->proc->locks_spin); 243 spin_lock(&proc->locks_spin);
240 if (!list_empty(&lkb->lkb_ownqueue)) { 244 if (!list_empty(&lkb->lkb_ownqueue)) {
241 list_del_init(&lkb->lkb_ownqueue); 245 list_del_init(&lkb->lkb_ownqueue);
242 dlm_put_lkb(lkb); 246 dlm_put_lkb(lkb);
243 } 247 }
244 spin_unlock(&ua->proc->locks_spin); 248 spin_unlock(&proc->locks_spin);
245 } 249 }
246 out: 250 out:
247 mutex_unlock(&ls->ls_clear_proc_locks); 251 mutex_unlock(&ls->ls_clear_proc_locks);
@@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
529 533
530 if (proc) 534 if (proc)
531 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); 535 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
532 compat_input(kbuf, k32buf); 536 compat_input(kbuf, k32buf,
537 count - sizeof(struct dlm_write_request32));
533 kfree(k32buf); 538 kfree(k32buf);
534 } 539 }
535#endif 540#endif
@@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = {
896 .owner = THIS_MODULE, 901 .owner = THIS_MODULE,
897}; 902};
898 903
904static struct miscdevice ctl_device = {
905 .name = "dlm-control",
906 .fops = &ctl_device_fops,
907 .minor = MISC_DYNAMIC_MINOR,
908};
909
899int dlm_user_init(void) 910int dlm_user_init(void)
900{ 911{
901 int error; 912 int error;
902 913
903 ctl_device.name = "dlm-control";
904 ctl_device.fops = &ctl_device_fops;
905 ctl_device.minor = MISC_DYNAMIC_MINOR;
906
907 error = misc_register(&ctl_device); 914 error = misc_register(&ctl_device);
908 if (error) 915 if (error)
909 log_print("misc_register failed for control device"); 916 log_print("misc_register failed for control device");
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 963889cf6740..4d9c1f4e1bd1 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,14 @@
14#include "rcom.h" 14#include "rcom.h"
15#include "util.h" 15#include "util.h"
16 16
17#define DLM_ERRNO_EDEADLK 35
18#define DLM_ERRNO_EBADR 53
19#define DLM_ERRNO_EBADSLT 57
20#define DLM_ERRNO_EPROTO 71
21#define DLM_ERRNO_EOPNOTSUPP 95
22#define DLM_ERRNO_ETIMEDOUT 110
23#define DLM_ERRNO_EINPROGRESS 115
24
17static void header_out(struct dlm_header *hd) 25static void header_out(struct dlm_header *hd)
18{ 26{
19 hd->h_version = cpu_to_le32(hd->h_version); 27 hd->h_version = cpu_to_le32(hd->h_version);
@@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd)
30 hd->h_length = le16_to_cpu(hd->h_length); 38 hd->h_length = le16_to_cpu(hd->h_length);
31} 39}
32 40
33void dlm_message_out(struct dlm_message *ms) 41/* higher errno values are inconsistent across architectures, so select
42 one set of values for on the wire */
43
44static int to_dlm_errno(int err)
45{
46 switch (err) {
47 case -EDEADLK:
48 return -DLM_ERRNO_EDEADLK;
49 case -EBADR:
50 return -DLM_ERRNO_EBADR;
51 case -EBADSLT:
52 return -DLM_ERRNO_EBADSLT;
53 case -EPROTO:
54 return -DLM_ERRNO_EPROTO;
55 case -EOPNOTSUPP:
56 return -DLM_ERRNO_EOPNOTSUPP;
57 case -ETIMEDOUT:
58 return -DLM_ERRNO_ETIMEDOUT;
59 case -EINPROGRESS:
60 return -DLM_ERRNO_EINPROGRESS;
61 }
62 return err;
63}
64
65static int from_dlm_errno(int err)
34{ 66{
35 struct dlm_header *hd = (struct dlm_header *) ms; 67 switch (err) {
68 case -DLM_ERRNO_EDEADLK:
69 return -EDEADLK;
70 case -DLM_ERRNO_EBADR:
71 return -EBADR;
72 case -DLM_ERRNO_EBADSLT:
73 return -EBADSLT;
74 case -DLM_ERRNO_EPROTO:
75 return -EPROTO;
76 case -DLM_ERRNO_EOPNOTSUPP:
77 return -EOPNOTSUPP;
78 case -DLM_ERRNO_ETIMEDOUT:
79 return -ETIMEDOUT;
80 case -DLM_ERRNO_EINPROGRESS:
81 return -EINPROGRESS;
82 }
83 return err;
84}
36 85
37 header_out(hd); 86void dlm_message_out(struct dlm_message *ms)
87{
88 header_out(&ms->m_header);
38 89
39 ms->m_type = cpu_to_le32(ms->m_type); 90 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid); 91 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
@@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms)
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode); 104 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode); 105 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts); 106 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result); 107 ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result));
57} 108}
58 109
59void dlm_message_in(struct dlm_message *ms) 110void dlm_message_in(struct dlm_message *ms)
60{ 111{
61 struct dlm_header *hd = (struct dlm_header *) ms; 112 header_in(&ms->m_header);
62
63 header_in(hd);
64 113
65 ms->m_type = le32_to_cpu(ms->m_type); 114 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid); 115 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
@@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms)
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode); 128 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode); 129 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts); 130 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result); 131 ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result));
83} 132}
84 133
85static void rcom_lock_out(struct rcom_lock *rl) 134static void rcom_lock_out(struct rcom_lock *rl)
@@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf)
126 175
127void dlm_rcom_out(struct dlm_rcom *rc) 176void dlm_rcom_out(struct dlm_rcom *rc)
128{ 177{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type; 178 int type = rc->rc_type;
131 179
132 header_out(hd); 180 header_out(&rc->rc_header);
133 181
134 rc->rc_type = cpu_to_le32(rc->rc_type); 182 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result); 183 rc->rc_result = cpu_to_le32(rc->rc_result);
@@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
137 rc->rc_seq = cpu_to_le64(rc->rc_seq); 185 rc->rc_seq = cpu_to_le64(rc->rc_seq);
138 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); 186 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
139 187
140 if (type == DLM_RCOM_LOCK) 188 if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
141 rcom_lock_out((struct rcom_lock *) rc->rc_buf); 189 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
142 190
143 else if (type == DLM_RCOM_STATUS_REPLY) 191 else if (type == DLM_RCOM_STATUS_REPLY)
@@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc)
146 194
147void dlm_rcom_in(struct dlm_rcom *rc) 195void dlm_rcom_in(struct dlm_rcom *rc)
148{ 196{
149 struct dlm_header *hd = (struct dlm_header *) rc; 197 int type;
150 198
151 header_in(hd); 199 header_in(&rc->rc_header);
152 200
153 rc->rc_type = le32_to_cpu(rc->rc_type); 201 rc->rc_type = le32_to_cpu(rc->rc_type);
154 rc->rc_result = le32_to_cpu(rc->rc_result); 202 rc->rc_result = le32_to_cpu(rc->rc_result);
@@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc)
156 rc->rc_seq = le64_to_cpu(rc->rc_seq); 204 rc->rc_seq = le64_to_cpu(rc->rc_seq);
157 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); 205 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
158 206
159 if (rc->rc_type == DLM_RCOM_LOCK) 207 type = rc->rc_type;
208
209 if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
160 rcom_lock_in((struct rcom_lock *) rc->rc_buf); 210 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
161 211
162 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) 212 else if (type == DLM_RCOM_STATUS_REPLY)
163 rcom_config_in((struct rcom_config *) rc->rc_buf); 213 rcom_config_in((struct rcom_config *) rc->rc_buf);
164} 214}
165 215
diff --git a/fs/dquot.c b/fs/dquot.c
index 2809768d9c41..cee7c6f428f0 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -827,6 +827,18 @@ static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
827 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 827 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
828} 828}
829 829
830static int warning_issued(struct dquot *dquot, const int warntype)
831{
832 int flag = (warntype == QUOTA_NL_BHARDWARN ||
833 warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
834 ((warntype == QUOTA_NL_IHARDWARN ||
835 warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
836
837 if (!flag)
838 return 0;
839 return test_and_set_bit(flag, &dquot->dq_flags);
840}
841
830#ifdef CONFIG_PRINT_QUOTA_WARNING 842#ifdef CONFIG_PRINT_QUOTA_WARNING
831static int flag_print_warnings = 1; 843static int flag_print_warnings = 1;
832 844
@@ -845,16 +857,12 @@ static inline int need_print_warning(struct dquot *dquot)
845} 857}
846 858
847/* Print warning to user which exceeded quota */ 859/* Print warning to user which exceeded quota */
848static void print_warning(struct dquot *dquot, const char warntype) 860static void print_warning(struct dquot *dquot, const int warntype)
849{ 861{
850 char *msg = NULL; 862 char *msg = NULL;
851 struct tty_struct *tty; 863 struct tty_struct *tty;
852 int flag = (warntype == QUOTA_NL_BHARDWARN ||
853 warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
854 ((warntype == QUOTA_NL_IHARDWARN ||
855 warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
856 864
857 if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags))) 865 if (!need_print_warning(dquot))
858 return; 866 return;
859 867
860 mutex_lock(&tty_mutex); 868 mutex_lock(&tty_mutex);
@@ -895,9 +903,6 @@ out_lock:
895 903
896#ifdef CONFIG_QUOTA_NETLINK_INTERFACE 904#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
897 905
898/* Size of quota netlink message - actually an upperbound for buffer size */
899#define QUOTA_NL_MSG_SIZE 32
900
901/* Netlink family structure for quota */ 906/* Netlink family structure for quota */
902static struct genl_family quota_genl_family = { 907static struct genl_family quota_genl_family = {
903 .id = GENL_ID_GENERATE, 908 .id = GENL_ID_GENERATE,
@@ -914,11 +919,13 @@ static void send_warning(const struct dquot *dquot, const char warntype)
914 struct sk_buff *skb; 919 struct sk_buff *skb;
915 void *msg_head; 920 void *msg_head;
916 int ret; 921 int ret;
922 int msg_size = 4 * nla_total_size(sizeof(u32)) +
923 2 * nla_total_size(sizeof(u64));
917 924
918 /* We have to allocate using GFP_NOFS as we are called from a 925 /* We have to allocate using GFP_NOFS as we are called from a
919 * filesystem performing write and thus further recursion into 926 * filesystem performing write and thus further recursion into
920 * the fs to free some data could cause deadlocks. */ 927 * the fs to free some data could cause deadlocks. */
921 skb = genlmsg_new(QUOTA_NL_MSG_SIZE, GFP_NOFS); 928 skb = genlmsg_new(msg_size, GFP_NOFS);
922 if (!skb) { 929 if (!skb) {
923 printk(KERN_ERR 930 printk(KERN_ERR
924 "VFS: Not enough memory to send quota warning.\n"); 931 "VFS: Not enough memory to send quota warning.\n");
@@ -959,18 +966,19 @@ static void send_warning(const struct dquot *dquot, const char warntype)
959 "VFS: Failed to send notification message: %d\n", ret); 966 "VFS: Failed to send notification message: %d\n", ret);
960 return; 967 return;
961attr_err_out: 968attr_err_out:
962 printk(KERN_ERR "VFS: Failed to compose quota message: %d\n", ret); 969 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
963err_out: 970err_out:
964 kfree_skb(skb); 971 kfree_skb(skb);
965} 972}
966#endif 973#endif
967 974
968static inline void flush_warnings(struct dquot **dquots, char *warntype) 975static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
969{ 976{
970 int i; 977 int i;
971 978
972 for (i = 0; i < MAXQUOTAS; i++) 979 for (i = 0; i < MAXQUOTAS; i++)
973 if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN) { 980 if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN &&
981 !warning_issued(dquots[i], warntype[i])) {
974#ifdef CONFIG_PRINT_QUOTA_WARNING 982#ifdef CONFIG_PRINT_QUOTA_WARNING
975 print_warning(dquots[i], warntype[i]); 983 print_warning(dquots[i], warntype[i]);
976#endif 984#endif
@@ -1216,7 +1224,7 @@ warn_put_all:
1216 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1224 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1217 if (inode->i_dquot[cnt]) 1225 if (inode->i_dquot[cnt])
1218 mark_dquot_dirty(inode->i_dquot[cnt]); 1226 mark_dquot_dirty(inode->i_dquot[cnt]);
1219 flush_warnings((struct dquot **)inode->i_dquot, warntype); 1227 flush_warnings(inode->i_dquot, warntype);
1220 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1228 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1221 return ret; 1229 return ret;
1222} 1230}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index bbed2fd40fdc..f8ef0af919e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -799,7 +799,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
799 rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, 799 rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name,
800 crypt_stat->cipher, "cbc"); 800 crypt_stat->cipher, "cbc");
801 if (rc) 801 if (rc)
802 goto out; 802 goto out_unlock;
803 crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0, 803 crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0,
804 CRYPTO_ALG_ASYNC); 804 CRYPTO_ALG_ASYNC);
805 kfree(full_alg_name); 805 kfree(full_alg_name);
@@ -808,12 +808,12 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
808 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " 808 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
809 "Error initializing cipher [%s]\n", 809 "Error initializing cipher [%s]\n",
810 crypt_stat->cipher); 810 crypt_stat->cipher);
811 mutex_unlock(&crypt_stat->cs_tfm_mutex); 811 goto out_unlock;
812 goto out;
813 } 812 }
814 crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); 813 crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
815 mutex_unlock(&crypt_stat->cs_tfm_mutex);
816 rc = 0; 814 rc = 0;
815out_unlock:
816 mutex_unlock(&crypt_stat->cs_tfm_mutex);
817out: 817out:
818 return rc; 818 return rc;
819} 819}
@@ -1847,6 +1847,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
1847 mutex_init(&tmp_tfm->key_tfm_mutex); 1847 mutex_init(&tmp_tfm->key_tfm_mutex);
1848 strncpy(tmp_tfm->cipher_name, cipher_name, 1848 strncpy(tmp_tfm->cipher_name, cipher_name,
1849 ECRYPTFS_MAX_CIPHER_NAME_SIZE); 1849 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
1850 tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
1850 tmp_tfm->key_size = key_size; 1851 tmp_tfm->key_size = key_size;
1851 rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm, 1852 rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm,
1852 tmp_tfm->cipher_name, 1853 tmp_tfm->cipher_name,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0b1ab016fa2e..5a719180983c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -120,22 +120,9 @@ ecryptfs_do_create(struct inode *directory_inode,
120 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, 120 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
121 ecryptfs_dentry, mode, nd); 121 ecryptfs_dentry, mode, nd);
122 if (rc) { 122 if (rc) {
123 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 123 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
124 struct ecryptfs_inode_info *inode_info = 124 "rc = [%d]\n", __FUNCTION__, rc);
125 ecryptfs_inode_to_private(ecryptfs_inode); 125 goto out_lock;
126
127 printk(KERN_WARNING "%s: Error creating underlying file; "
128 "rc = [%d]; checking for existing\n", __FUNCTION__, rc);
129 if (inode_info) {
130 mutex_lock(&inode_info->lower_file_mutex);
131 if (!inode_info->lower_file) {
132 mutex_unlock(&inode_info->lower_file_mutex);
133 printk(KERN_ERR "%s: Failure to set underlying "
134 "file; rc = [%d]\n", __FUNCTION__, rc);
135 goto out_lock;
136 }
137 mutex_unlock(&inode_info->lower_file_mutex);
138 }
139 } 126 }
140 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 127 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
141 directory_inode->i_sb, 0); 128 directory_inode->i_sb, 0);
@@ -451,6 +438,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
451 dentry->d_inode->i_nlink = 438 dentry->d_inode->i_nlink =
452 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; 439 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
453 dentry->d_inode->i_ctime = dir->i_ctime; 440 dentry->d_inode->i_ctime = dir->i_ctime;
441 d_drop(dentry);
454out_unlock: 442out_unlock:
455 unlock_parent(lower_dentry); 443 unlock_parent(lower_dentry);
456 return rc; 444 return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 263fed88c0ca..f458c1f35565 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1860,7 +1860,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
1860 struct ecryptfs_global_auth_tok *new_auth_tok; 1860 struct ecryptfs_global_auth_tok *new_auth_tok;
1861 int rc = 0; 1861 int rc = 0;
1862 1862
1863 new_auth_tok = kmem_cache_alloc(ecryptfs_global_auth_tok_cache, 1863 new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache,
1864 GFP_KERNEL); 1864 GFP_KERNEL);
1865 if (!new_auth_tok) { 1865 if (!new_auth_tok) {
1866 rc = -ENOMEM; 1866 rc = -ENOMEM;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b83a512b7e08..0249aa4ae181 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -138,11 +138,14 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
138 inode_info->lower_file = dentry_open(lower_dentry, 138 inode_info->lower_file = dentry_open(lower_dentry,
139 lower_mnt, 139 lower_mnt,
140 (O_RDWR | O_LARGEFILE)); 140 (O_RDWR | O_LARGEFILE));
141 if (IS_ERR(inode_info->lower_file)) 141 if (IS_ERR(inode_info->lower_file)) {
142 dget(lower_dentry);
143 mntget(lower_mnt);
142 inode_info->lower_file = dentry_open(lower_dentry, 144 inode_info->lower_file = dentry_open(lower_dentry,
143 lower_mnt, 145 lower_mnt,
144 (O_RDONLY 146 (O_RDONLY
145 | O_LARGEFILE)); 147 | O_LARGEFILE));
148 }
146 if (IS_ERR(inode_info->lower_file)) { 149 if (IS_ERR(inode_info->lower_file)) {
147 printk(KERN_ERR "Error opening lower persistent file " 150 printk(KERN_ERR "Error opening lower persistent file "
148 "for lower_dentry [0x%p] and lower_mnt [0x%p]\n", 151 "for lower_dentry [0x%p] and lower_mnt [0x%p]\n",
@@ -523,6 +526,7 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
523 lower_mnt = nd.mnt; 526 lower_mnt = nd.mnt;
524 ecryptfs_set_superblock_lower(sb, lower_root->d_sb); 527 ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
525 sb->s_maxbytes = lower_root->d_sb->s_maxbytes; 528 sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
529 sb->s_blocksize = lower_root->d_sb->s_blocksize;
526 ecryptfs_set_dentry_lower(sb->s_root, lower_root); 530 ecryptfs_set_dentry_lower(sb->s_root, lower_root);
527 ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt); 531 ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
528 rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0); 532 rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0);
@@ -730,127 +734,40 @@ static int ecryptfs_init_kmem_caches(void)
730 return 0; 734 return 0;
731} 735}
732 736
733struct ecryptfs_obj { 737static struct kobject *ecryptfs_kobj;
734 char *name;
735 struct list_head slot_list;
736 struct kobject kobj;
737};
738
739struct ecryptfs_attribute {
740 struct attribute attr;
741 ssize_t(*show) (struct ecryptfs_obj *, char *);
742 ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
743};
744
745static ssize_t
746ecryptfs_attr_store(struct kobject *kobj,
747 struct attribute *attr, const char *buf, size_t len)
748{
749 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
750 kobj);
751 struct ecryptfs_attribute *attribute =
752 container_of(attr, struct ecryptfs_attribute, attr);
753
754 return (attribute->store ? attribute->store(obj, buf, len) : 0);
755}
756 738
757static ssize_t 739static ssize_t version_show(struct kobject *kobj,
758ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) 740 struct kobj_attribute *attr, char *buff)
759{ 741{
760 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj, 742 return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
761 kobj);
762 struct ecryptfs_attribute *attribute =
763 container_of(attr, struct ecryptfs_attribute, attr);
764
765 return (attribute->show ? attribute->show(obj, buf) : 0);
766} 743}
767 744
768static struct sysfs_ops ecryptfs_sysfs_ops = { 745static struct kobj_attribute version_attr = __ATTR_RO(version);
769 .show = ecryptfs_attr_show,
770 .store = ecryptfs_attr_store
771};
772 746
773static struct kobj_type ecryptfs_ktype = { 747static struct attribute *attributes[] = {
774 .sysfs_ops = &ecryptfs_sysfs_ops 748 &version_attr.attr,
749 NULL,
775}; 750};
776 751
777static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL); 752static struct attribute_group attr_group = {
778 753 .attrs = attributes,
779static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
780{
781 return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
782}
783
784static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
785
786static struct ecryptfs_version_str_map_elem {
787 u32 flag;
788 char *str;
789} ecryptfs_version_str_map[] = {
790 {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
791 {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
792 {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
793 {ECRYPTFS_VERSIONING_POLICY, "policy"},
794 {ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"},
795 {ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
796}; 754};
797 755
798static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
799{
800 int i;
801 int remaining = PAGE_SIZE;
802 int total_written = 0;
803
804 buff[0] = '\0';
805 for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
806 int entry_size;
807
808 if (!(ECRYPTFS_VERSIONING_MASK
809 & ecryptfs_version_str_map[i].flag))
810 continue;
811 entry_size = strlen(ecryptfs_version_str_map[i].str);
812 if ((entry_size + 2) > remaining)
813 goto out;
814 memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
815 buff[entry_size++] = '\n';
816 buff[entry_size] = '\0';
817 buff += entry_size;
818 total_written += entry_size;
819 remaining -= entry_size;
820 }
821out:
822 return total_written;
823}
824
825static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
826
827static int do_sysfs_registration(void) 756static int do_sysfs_registration(void)
828{ 757{
829 int rc; 758 int rc;
830 759
831 rc = subsystem_register(&ecryptfs_subsys); 760 ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj);
832 if (rc) { 761 if (!ecryptfs_kobj) {
833 printk(KERN_ERR 762 printk(KERN_ERR "Unable to create ecryptfs kset\n");
834 "Unable to register ecryptfs sysfs subsystem\n"); 763 rc = -ENOMEM;
835 goto out;
836 }
837 rc = sysfs_create_file(&ecryptfs_subsys.kobj,
838 &sysfs_attr_version.attr);
839 if (rc) {
840 printk(KERN_ERR
841 "Unable to create ecryptfs version attribute\n");
842 subsystem_unregister(&ecryptfs_subsys);
843 goto out; 764 goto out;
844 } 765 }
845 rc = sysfs_create_file(&ecryptfs_subsys.kobj, 766 rc = sysfs_create_group(ecryptfs_kobj, &attr_group);
846 &sysfs_attr_version_str.attr);
847 if (rc) { 767 if (rc) {
848 printk(KERN_ERR 768 printk(KERN_ERR
849 "Unable to create ecryptfs version_str attribute\n"); 769 "Unable to create ecryptfs version attributes\n");
850 sysfs_remove_file(&ecryptfs_subsys.kobj, 770 kobject_put(ecryptfs_kobj);
851 &sysfs_attr_version.attr);
852 subsystem_unregister(&ecryptfs_subsys);
853 goto out;
854 } 771 }
855out: 772out:
856 return rc; 773 return rc;
@@ -858,11 +775,8 @@ out:
858 775
859static void do_sysfs_unregistration(void) 776static void do_sysfs_unregistration(void)
860{ 777{
861 sysfs_remove_file(&ecryptfs_subsys.kobj, 778 sysfs_remove_group(ecryptfs_kobj, &attr_group);
862 &sysfs_attr_version.attr); 779 kobject_put(ecryptfs_kobj);
863 sysfs_remove_file(&ecryptfs_subsys.kobj,
864 &sysfs_attr_version_str.attr);
865 subsystem_unregister(&ecryptfs_subsys);
866} 780}
867 781
868static int __init ecryptfs_init(void) 782static int __init ecryptfs_init(void)
@@ -890,7 +804,6 @@ static int __init ecryptfs_init(void)
890 printk(KERN_ERR "Failed to register filesystem\n"); 804 printk(KERN_ERR "Failed to register filesystem\n");
891 goto out_free_kmem_caches; 805 goto out_free_kmem_caches;
892 } 806 }
893 kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
894 rc = do_sysfs_registration(); 807 rc = do_sysfs_registration();
895 if (rc) { 808 if (rc) {
896 printk(KERN_ERR "sysfs registration failed\n"); 809 printk(KERN_ERR "sysfs registration failed\n");
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index a96d341d154d..9cc2aec27b0d 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -427,6 +427,7 @@ int ecryptfs_init_messaging(unsigned int transport)
427 if (!ecryptfs_daemon_id_hash) { 427 if (!ecryptfs_daemon_id_hash) {
428 rc = -ENOMEM; 428 rc = -ENOMEM;
429 ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); 429 ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n");
430 mutex_unlock(&ecryptfs_daemon_id_hash_mux);
430 goto out; 431 goto out;
431 } 432 }
432 for (i = 0; i < ecryptfs_hash_buckets; i++) 433 for (i = 0; i < ecryptfs_hash_buckets; i++)
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 16a7a555f392..32c5711d79a3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -263,14 +263,13 @@ out:
263 return 0; 263 return 0;
264} 264}
265 265
266/* This function must zero any hole we create */
266static int ecryptfs_prepare_write(struct file *file, struct page *page, 267static int ecryptfs_prepare_write(struct file *file, struct page *page,
267 unsigned from, unsigned to) 268 unsigned from, unsigned to)
268{ 269{
269 int rc = 0; 270 int rc = 0;
271 loff_t prev_page_end_size;
270 272
271 if (from == 0 && to == PAGE_CACHE_SIZE)
272 goto out; /* If we are writing a full page, it will be
273 up to date. */
274 if (!PageUptodate(page)) { 273 if (!PageUptodate(page)) {
275 rc = ecryptfs_read_lower_page_segment(page, page->index, 0, 274 rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
276 PAGE_CACHE_SIZE, 275 PAGE_CACHE_SIZE,
@@ -283,22 +282,32 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
283 } else 282 } else
284 SetPageUptodate(page); 283 SetPageUptodate(page);
285 } 284 }
286 if (page->index != 0) {
287 loff_t end_of_prev_pg_pos =
288 (((loff_t)page->index << PAGE_CACHE_SHIFT) - 1);
289 285
290 if (end_of_prev_pg_pos > i_size_read(page->mapping->host)) { 286 prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT);
287
288 /*
289 * If creating a page or more of holes, zero them out via truncate.
290 * Note, this will increase i_size.
291 */
292 if (page->index != 0) {
293 if (prev_page_end_size > i_size_read(page->mapping->host)) {
291 rc = ecryptfs_truncate(file->f_path.dentry, 294 rc = ecryptfs_truncate(file->f_path.dentry,
292 end_of_prev_pg_pos); 295 prev_page_end_size);
293 if (rc) { 296 if (rc) {
294 printk(KERN_ERR "Error on attempt to " 297 printk(KERN_ERR "Error on attempt to "
295 "truncate to (higher) offset [%lld];" 298 "truncate to (higher) offset [%lld];"
296 " rc = [%d]\n", end_of_prev_pg_pos, rc); 299 " rc = [%d]\n", prev_page_end_size, rc);
297 goto out; 300 goto out;
298 } 301 }
299 } 302 }
300 if (end_of_prev_pg_pos + 1 > i_size_read(page->mapping->host)) 303 }
301 zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); 304 /*
305 * Writing to a new page, and creating a small hole from start of page?
306 * Zero it out.
307 */
308 if ((i_size_read(page->mapping->host) == prev_page_end_size) &&
309 (from != 0)) {
310 zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
302 } 311 }
303out: 312out:
304 return rc; 313 return rc;
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index 9aa345121e09..f638a698dc52 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -237,7 +237,6 @@ out:
237 */ 237 */
238void ecryptfs_release_netlink(void) 238void ecryptfs_release_netlink(void)
239{ 239{
240 if (ecryptfs_nl_sock && ecryptfs_nl_sock->sk_socket) 240 netlink_kernel_release(ecryptfs_nl_sock);
241 sock_release(ecryptfs_nl_sock->sk_socket);
242 ecryptfs_nl_sock = NULL; 241 ecryptfs_nl_sock = NULL;
243} 242}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 6b7474a4336a..948f57624c05 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -124,6 +124,10 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
124 loff_t pos; 124 loff_t pos;
125 int rc = 0; 125 int rc = 0;
126 126
127 /*
128 * if we are writing beyond current size, then start pos
129 * at the current size - we'll fill in zeros from there.
130 */
127 if (offset > ecryptfs_file_size) 131 if (offset > ecryptfs_file_size)
128 pos = ecryptfs_file_size; 132 pos = ecryptfs_file_size;
129 else 133 else
@@ -137,6 +141,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
137 if (num_bytes > total_remaining_bytes) 141 if (num_bytes > total_remaining_bytes)
138 num_bytes = total_remaining_bytes; 142 num_bytes = total_remaining_bytes;
139 if (pos < offset) { 143 if (pos < offset) {
144 /* remaining zeros to write, up to destination offset */
140 size_t total_remaining_zeros = (offset - pos); 145 size_t total_remaining_zeros = (offset - pos);
141 146
142 if (num_bytes > total_remaining_zeros) 147 if (num_bytes > total_remaining_zeros)
@@ -167,17 +172,27 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
167 } 172 }
168 } 173 }
169 ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); 174 ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
175
176 /*
177 * pos: where we're now writing, offset: where the request was
178 * If current pos is before request, we are filling zeros
179 * If we are at or beyond request, we are writing the *data*
180 * If we're in a fresh page beyond eof, zero it in either case
181 */
182 if (pos < offset || !start_offset_in_page) {
183 /* We are extending past the previous end of the file.
184 * Fill in zero values to the end of the page */
185 memset(((char *)ecryptfs_page_virt
186 + start_offset_in_page), 0,
187 PAGE_CACHE_SIZE - start_offset_in_page);
188 }
189
190 /* pos >= offset, we are now writing the data request */
170 if (pos >= offset) { 191 if (pos >= offset) {
171 memcpy(((char *)ecryptfs_page_virt 192 memcpy(((char *)ecryptfs_page_virt
172 + start_offset_in_page), 193 + start_offset_in_page),
173 (data + data_offset), num_bytes); 194 (data + data_offset), num_bytes);
174 data_offset += num_bytes; 195 data_offset += num_bytes;
175 } else {
176 /* We are extending past the previous end of the file.
177 * Fill in zero values up to the start of where we
178 * will be writing data. */
179 memset(((char *)ecryptfs_page_virt
180 + start_offset_in_page), 0, num_bytes);
181 } 196 }
182 kunmap_atomic(ecryptfs_page_virt, KM_USER0); 197 kunmap_atomic(ecryptfs_page_virt, KM_USER0);
183 flush_dcache_page(ecryptfs_page); 198 flush_dcache_page(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f8cdab2bee3d..4859c4eecd65 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -86,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
86 fput(inode_info->lower_file); 86 fput(inode_info->lower_file);
87 inode_info->lower_file = NULL; 87 inode_info->lower_file = NULL;
88 d_drop(lower_dentry); 88 d_drop(lower_dentry);
89 d_delete(lower_dentry);
90 } 89 }
91 } 90 }
92 mutex_unlock(&inode_info->lower_file_mutex); 91 mutex_unlock(&inode_info->lower_file_mutex);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 154e25f13d77..6abaf75163f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -680,11 +680,31 @@ static int ext2_check_descriptors (struct super_block * sb)
680static loff_t ext2_max_size(int bits) 680static loff_t ext2_max_size(int bits)
681{ 681{
682 loff_t res = EXT2_NDIR_BLOCKS; 682 loff_t res = EXT2_NDIR_BLOCKS;
683 /* This constant is calculated to be the largest file size for a 683 int meta_blocks;
684 * dense, 4k-blocksize file such that the total number of 684 loff_t upper_limit;
685
686 /* This is calculated to be the largest file size for a
687 * dense, file such that the total number of
685 * sectors in the file, including data and all indirect blocks, 688 * sectors in the file, including data and all indirect blocks,
686 * does not exceed 2^32. */ 689 * does not exceed 2^32 -1
687 const loff_t upper_limit = 0x1ff7fffd000LL; 690 * __u32 i_blocks representing the total number of
691 * 512 bytes blocks of the file
692 */
693 upper_limit = (1LL << 32) - 1;
694
695 /* total blocks in file system block size */
696 upper_limit >>= (bits - 9);
697
698
699 /* indirect blocks */
700 meta_blocks = 1;
701 /* double indirect blocks */
702 meta_blocks += 1 + (1LL << (bits-2));
703 /* tripple indirect blocks */
704 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
705
706 upper_limit -= meta_blocks;
707 upper_limit <<= bits;
688 708
689 res += 1LL << (bits-2); 709 res += 1LL << (bits-2);
690 res += 1LL << (2*(bits-2)); 710 res += 1LL << (2*(bits-2));
@@ -692,6 +712,10 @@ static loff_t ext2_max_size(int bits)
692 res <<= bits; 712 res <<= bits;
693 if (res > upper_limit) 713 if (res > upper_limit)
694 res = upper_limit; 714 res = upper_limit;
715
716 if (res > MAX_LFS_FILESIZE)
717 res = MAX_LFS_FILESIZE;
718
695 return res; 719 return res;
696} 720}
697 721
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index de55da9e28ba..f3675cc630e9 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1436,11 +1436,31 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1436static loff_t ext3_max_size(int bits) 1436static loff_t ext3_max_size(int bits)
1437{ 1437{
1438 loff_t res = EXT3_NDIR_BLOCKS; 1438 loff_t res = EXT3_NDIR_BLOCKS;
1439 /* This constant is calculated to be the largest file size for a 1439 int meta_blocks;
1440 * dense, 4k-blocksize file such that the total number of 1440 loff_t upper_limit;
1441
1442 /* This is calculated to be the largest file size for a
1443 * dense, file such that the total number of
1441 * sectors in the file, including data and all indirect blocks, 1444 * sectors in the file, including data and all indirect blocks,
1442 * does not exceed 2^32. */ 1445 * does not exceed 2^32 -1
1443 const loff_t upper_limit = 0x1ff7fffd000LL; 1446 * __u32 i_blocks representing the total number of
1447 * 512 bytes blocks of the file
1448 */
1449 upper_limit = (1LL << 32) - 1;
1450
1451 /* total blocks in file system block size */
1452 upper_limit >>= (bits - 9);
1453
1454
1455 /* indirect blocks */
1456 meta_blocks = 1;
1457 /* double indirect blocks */
1458 meta_blocks += 1 + (1LL << (bits-2));
1459 /* tripple indirect blocks */
1460 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1461
1462 upper_limit -= meta_blocks;
1463 upper_limit <<= bits;
1444 1464
1445 res += 1LL << (bits-2); 1465 res += 1LL << (bits-2);
1446 res += 1LL << (2*(bits-2)); 1466 res += 1LL << (2*(bits-2));
@@ -1448,6 +1468,10 @@ static loff_t ext3_max_size(int bits)
1448 res <<= bits; 1468 res <<= bits;
1449 if (res > upper_limit) 1469 if (res > upper_limit)
1450 res = upper_limit; 1470 res = upper_limit;
1471
1472 if (res > MAX_LFS_FILESIZE)
1473 res = MAX_LFS_FILESIZE;
1474
1451 return res; 1475 return res;
1452} 1476}
1453 1477
@@ -1676,7 +1700,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1676 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 1700 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1677 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); 1701 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1678 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 1702 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1679 if (EXT3_INODE_SIZE(sb) == 0) 1703 if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
1680 goto cantfind_ext3; 1704 goto cantfind_ext3;
1681 sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); 1705 sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
1682 if (sbi->s_inodes_per_block == 0) 1706 if (sbi->s_inodes_per_block == 0)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ae6e7e502ac9..ac6fa8ca0a2f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6 6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o 9 ext4_jbd2.o migrate.o mballoc.o
10 10
11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o 12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 71ee95e534fd..ac75ea953d83 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -29,7 +29,7 @@
29 * Calculate the block group number and offset, given a block number 29 * Calculate the block group number and offset, given a block number
30 */ 30 */
31void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 31void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
32 unsigned long *blockgrpp, ext4_grpblk_t *offsetp) 32 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
33{ 33{
34 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 34 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
35 ext4_grpblk_t offset; 35 ext4_grpblk_t offset;
@@ -46,7 +46,7 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
46/* Initializes an uninitialized block bitmap if given, and returns the 46/* Initializes an uninitialized block bitmap if given, and returns the
47 * number of blocks free in the group. */ 47 * number of blocks free in the group. */
48unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 48unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
49 int block_group, struct ext4_group_desc *gdp) 49 ext4_group_t block_group, struct ext4_group_desc *gdp)
50{ 50{
51 unsigned long start; 51 unsigned long start;
52 int bit, bit_max; 52 int bit, bit_max;
@@ -60,7 +60,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
60 * essentially implementing a per-group read-only flag. */ 60 * essentially implementing a per-group read-only flag. */
61 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 61 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
62 ext4_error(sb, __FUNCTION__, 62 ext4_error(sb, __FUNCTION__,
63 "Checksum bad for group %u\n", block_group); 63 "Checksum bad for group %lu\n", block_group);
64 gdp->bg_free_blocks_count = 0; 64 gdp->bg_free_blocks_count = 0;
65 gdp->bg_free_inodes_count = 0; 65 gdp->bg_free_inodes_count = 0;
66 gdp->bg_itable_unused = 0; 66 gdp->bg_itable_unused = 0;
@@ -153,7 +153,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
153 * group descriptor 153 * group descriptor
154 */ 154 */
155struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 155struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
156 unsigned int block_group, 156 ext4_group_t block_group,
157 struct buffer_head ** bh) 157 struct buffer_head ** bh)
158{ 158{
159 unsigned long group_desc; 159 unsigned long group_desc;
@@ -164,7 +164,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
164 if (block_group >= sbi->s_groups_count) { 164 if (block_group >= sbi->s_groups_count) {
165 ext4_error (sb, "ext4_get_group_desc", 165 ext4_error (sb, "ext4_get_group_desc",
166 "block_group >= groups_count - " 166 "block_group >= groups_count - "
167 "block_group = %d, groups_count = %lu", 167 "block_group = %lu, groups_count = %lu",
168 block_group, sbi->s_groups_count); 168 block_group, sbi->s_groups_count);
169 169
170 return NULL; 170 return NULL;
@@ -176,7 +176,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
176 if (!sbi->s_group_desc[group_desc]) { 176 if (!sbi->s_group_desc[group_desc]) {
177 ext4_error (sb, "ext4_get_group_desc", 177 ext4_error (sb, "ext4_get_group_desc",
178 "Group descriptor not loaded - " 178 "Group descriptor not loaded - "
179 "block_group = %d, group_desc = %lu, desc = %lu", 179 "block_group = %lu, group_desc = %lu, desc = %lu",
180 block_group, group_desc, offset); 180 block_group, group_desc, offset);
181 return NULL; 181 return NULL;
182 } 182 }
@@ -189,18 +189,70 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
189 return desc; 189 return desc;
190} 190}
191 191
192static int ext4_valid_block_bitmap(struct super_block *sb,
193 struct ext4_group_desc *desc,
194 unsigned int block_group,
195 struct buffer_head *bh)
196{
197 ext4_grpblk_t offset;
198 ext4_grpblk_t next_zero_bit;
199 ext4_fsblk_t bitmap_blk;
200 ext4_fsblk_t group_first_block;
201
202 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
203 /* with FLEX_BG, the inode/block bitmaps and itable
204 * blocks may not be in the group at all
205 * so the bitmap validation will be skipped for those groups
206 * or it has to also read the block group where the bitmaps
207 * are located to verify they are set.
208 */
209 return 1;
210 }
211 group_first_block = ext4_group_first_block_no(sb, block_group);
212
213 /* check whether block bitmap block number is set */
214 bitmap_blk = ext4_block_bitmap(sb, desc);
215 offset = bitmap_blk - group_first_block;
216 if (!ext4_test_bit(offset, bh->b_data))
217 /* bad block bitmap */
218 goto err_out;
219
220 /* check whether the inode bitmap block number is set */
221 bitmap_blk = ext4_inode_bitmap(sb, desc);
222 offset = bitmap_blk - group_first_block;
223 if (!ext4_test_bit(offset, bh->b_data))
224 /* bad block bitmap */
225 goto err_out;
226
227 /* check whether the inode table block number is set */
228 bitmap_blk = ext4_inode_table(sb, desc);
229 offset = bitmap_blk - group_first_block;
230 next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
231 offset + EXT4_SB(sb)->s_itb_per_group,
232 offset);
233 if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
234 /* good bitmap for inode tables */
235 return 1;
236
237err_out:
238 ext4_error(sb, __FUNCTION__,
239 "Invalid block bitmap - "
240 "block_group = %d, block = %llu",
241 block_group, bitmap_blk);
242 return 0;
243}
192/** 244/**
193 * read_block_bitmap() 245 * read_block_bitmap()
194 * @sb: super block 246 * @sb: super block
195 * @block_group: given block group 247 * @block_group: given block group
196 * 248 *
197 * Read the bitmap for a given block_group, reading into the specified 249 * Read the bitmap for a given block_group,and validate the
198 * slot in the superblock's bitmap cache. 250 * bits for block/inode/inode tables are set in the bitmaps
199 * 251 *
200 * Return buffer_head on success or NULL in case of failure. 252 * Return buffer_head on success or NULL in case of failure.
201 */ 253 */
202struct buffer_head * 254struct buffer_head *
203read_block_bitmap(struct super_block *sb, unsigned int block_group) 255read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
204{ 256{
205 struct ext4_group_desc * desc; 257 struct ext4_group_desc * desc;
206 struct buffer_head * bh = NULL; 258 struct buffer_head * bh = NULL;
@@ -210,25 +262,36 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
210 if (!desc) 262 if (!desc)
211 return NULL; 263 return NULL;
212 bitmap_blk = ext4_block_bitmap(sb, desc); 264 bitmap_blk = ext4_block_bitmap(sb, desc);
265 bh = sb_getblk(sb, bitmap_blk);
266 if (unlikely(!bh)) {
267 ext4_error(sb, __FUNCTION__,
268 "Cannot read block bitmap - "
269 "block_group = %d, block_bitmap = %llu",
270 (int)block_group, (unsigned long long)bitmap_blk);
271 return NULL;
272 }
273 if (bh_uptodate_or_lock(bh))
274 return bh;
275
213 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 276 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
214 bh = sb_getblk(sb, bitmap_blk); 277 ext4_init_block_bitmap(sb, bh, block_group, desc);
215 if (!buffer_uptodate(bh)) { 278 set_buffer_uptodate(bh);
216 lock_buffer(bh); 279 unlock_buffer(bh);
217 if (!buffer_uptodate(bh)) { 280 return bh;
218 ext4_init_block_bitmap(sb, bh, block_group,
219 desc);
220 set_buffer_uptodate(bh);
221 }
222 unlock_buffer(bh);
223 }
224 } else {
225 bh = sb_bread(sb, bitmap_blk);
226 } 281 }
227 if (!bh) 282 if (bh_submit_read(bh) < 0) {
228 ext4_error (sb, __FUNCTION__, 283 put_bh(bh);
284 ext4_error(sb, __FUNCTION__,
229 "Cannot read block bitmap - " 285 "Cannot read block bitmap - "
230 "block_group = %d, block_bitmap = %llu", 286 "block_group = %d, block_bitmap = %llu",
231 block_group, bitmap_blk); 287 (int)block_group, (unsigned long long)bitmap_blk);
288 return NULL;
289 }
290 if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
291 put_bh(bh);
292 return NULL;
293 }
294
232 return bh; 295 return bh;
233} 296}
234/* 297/*
@@ -320,7 +383,7 @@ restart:
320 */ 383 */
321static int 384static int
322goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, 385goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
323 unsigned int group, struct super_block * sb) 386 ext4_group_t group, struct super_block *sb)
324{ 387{
325 ext4_fsblk_t group_first_block, group_last_block; 388 ext4_fsblk_t group_first_block, group_last_block;
326 389
@@ -463,7 +526,7 @@ static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
463 * when setting the reservation window size through ioctl before the file 526 * when setting the reservation window size through ioctl before the file
464 * is open for write (needs block allocation). 527 * is open for write (needs block allocation).
465 * 528 *
466 * Needs truncate_mutex protection prior to call this function. 529 * Needs down_write(i_data_sem) protection prior to call this function.
467 */ 530 */
468void ext4_init_block_alloc_info(struct inode *inode) 531void ext4_init_block_alloc_info(struct inode *inode)
469{ 532{
@@ -514,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
514 struct ext4_reserve_window_node *rsv; 577 struct ext4_reserve_window_node *rsv;
515 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; 578 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
516 579
580 ext4_mb_discard_inode_preallocations(inode);
581
517 if (!block_i) 582 if (!block_i)
518 return; 583 return;
519 584
@@ -540,7 +605,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
540{ 605{
541 struct buffer_head *bitmap_bh = NULL; 606 struct buffer_head *bitmap_bh = NULL;
542 struct buffer_head *gd_bh; 607 struct buffer_head *gd_bh;
543 unsigned long block_group; 608 ext4_group_t block_group;
544 ext4_grpblk_t bit; 609 ext4_grpblk_t bit;
545 unsigned long i; 610 unsigned long i;
546 unsigned long overflow; 611 unsigned long overflow;
@@ -587,11 +652,13 @@ do_more:
587 in_range(ext4_inode_bitmap(sb, desc), block, count) || 652 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
588 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 653 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
589 in_range(block + count - 1, ext4_inode_table(sb, desc), 654 in_range(block + count - 1, ext4_inode_table(sb, desc),
590 sbi->s_itb_per_group)) 655 sbi->s_itb_per_group)) {
591 ext4_error (sb, "ext4_free_blocks", 656 ext4_error (sb, "ext4_free_blocks",
592 "Freeing blocks in system zones - " 657 "Freeing blocks in system zones - "
593 "Block = %llu, count = %lu", 658 "Block = %llu, count = %lu",
594 block, count); 659 block, count);
660 goto error_return;
661 }
595 662
596 /* 663 /*
597 * We are about to start releasing blocks in the bitmap, 664 * We are about to start releasing blocks in the bitmap,
@@ -720,19 +787,29 @@ error_return:
720 * @inode: inode 787 * @inode: inode
721 * @block: start physical block to free 788 * @block: start physical block to free
722 * @count: number of blocks to count 789 * @count: number of blocks to count
790 * @metadata: Are these metadata blocks
723 */ 791 */
724void ext4_free_blocks(handle_t *handle, struct inode *inode, 792void ext4_free_blocks(handle_t *handle, struct inode *inode,
725 ext4_fsblk_t block, unsigned long count) 793 ext4_fsblk_t block, unsigned long count,
794 int metadata)
726{ 795{
727 struct super_block * sb; 796 struct super_block * sb;
728 unsigned long dquot_freed_blocks; 797 unsigned long dquot_freed_blocks;
729 798
799 /* this isn't the right place to decide whether block is metadata
800 * inode.c/extents.c knows better, but for safety ... */
801 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
802 ext4_should_journal_data(inode))
803 metadata = 1;
804
730 sb = inode->i_sb; 805 sb = inode->i_sb;
731 if (!sb) { 806
732 printk ("ext4_free_blocks: nonexistent device"); 807 if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
733 return; 808 ext4_free_blocks_sb(handle, sb, block, count,
734 } 809 &dquot_freed_blocks);
735 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 810 else
811 ext4_mb_free_blocks(handle, inode, block, count,
812 metadata, &dquot_freed_blocks);
736 if (dquot_freed_blocks) 813 if (dquot_freed_blocks)
737 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); 814 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
738 return; 815 return;
@@ -920,9 +997,10 @@ claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
920 * ext4_journal_release_buffer(), else we'll run out of credits. 997 * ext4_journal_release_buffer(), else we'll run out of credits.
921 */ 998 */
922static ext4_grpblk_t 999static ext4_grpblk_t
923ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group, 1000ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
924 struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal, 1001 ext4_group_t group, struct buffer_head *bitmap_bh,
925 unsigned long *count, struct ext4_reserve_window *my_rsv) 1002 ext4_grpblk_t grp_goal, unsigned long *count,
1003 struct ext4_reserve_window *my_rsv)
926{ 1004{
927 ext4_fsblk_t group_first_block; 1005 ext4_fsblk_t group_first_block;
928 ext4_grpblk_t start, end; 1006 ext4_grpblk_t start, end;
@@ -1156,7 +1234,7 @@ static int find_next_reservable_window(
1156 */ 1234 */
1157static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, 1235static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1158 ext4_grpblk_t grp_goal, struct super_block *sb, 1236 ext4_grpblk_t grp_goal, struct super_block *sb,
1159 unsigned int group, struct buffer_head *bitmap_bh) 1237 ext4_group_t group, struct buffer_head *bitmap_bh)
1160{ 1238{
1161 struct ext4_reserve_window_node *search_head; 1239 struct ext4_reserve_window_node *search_head;
1162 ext4_fsblk_t group_first_block, group_end_block, start_block; 1240 ext4_fsblk_t group_first_block, group_end_block, start_block;
@@ -1354,7 +1432,7 @@ static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1354 */ 1432 */
1355static ext4_grpblk_t 1433static ext4_grpblk_t
1356ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, 1434ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1357 unsigned int group, struct buffer_head *bitmap_bh, 1435 ext4_group_t group, struct buffer_head *bitmap_bh,
1358 ext4_grpblk_t grp_goal, 1436 ext4_grpblk_t grp_goal,
1359 struct ext4_reserve_window_node * my_rsv, 1437 struct ext4_reserve_window_node * my_rsv,
1360 unsigned long *count, int *errp) 1438 unsigned long *count, int *errp)
@@ -1510,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1510} 1588}
1511 1589
1512/** 1590/**
1513 * ext4_new_blocks() -- core block(s) allocation function 1591 * ext4_new_blocks_old() -- core block(s) allocation function
1514 * @handle: handle to this transaction 1592 * @handle: handle to this transaction
1515 * @inode: file inode 1593 * @inode: file inode
1516 * @goal: given target block(filesystem wide) 1594 * @goal: given target block(filesystem wide)
@@ -1523,17 +1601,17 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1523 * any specific goal block. 1601 * any specific goal block.
1524 * 1602 *
1525 */ 1603 */
1526ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1604ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1527 ext4_fsblk_t goal, unsigned long *count, int *errp) 1605 ext4_fsblk_t goal, unsigned long *count, int *errp)
1528{ 1606{
1529 struct buffer_head *bitmap_bh = NULL; 1607 struct buffer_head *bitmap_bh = NULL;
1530 struct buffer_head *gdp_bh; 1608 struct buffer_head *gdp_bh;
1531 unsigned long group_no; 1609 ext4_group_t group_no;
1532 int goal_group; 1610 ext4_group_t goal_group;
1533 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ 1611 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1534 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ 1612 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1535 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ 1613 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1536 int bgi; /* blockgroup iteration index */ 1614 ext4_group_t bgi; /* blockgroup iteration index */
1537 int fatal = 0, err; 1615 int fatal = 0, err;
1538 int performed_allocation = 0; 1616 int performed_allocation = 0;
1539 ext4_grpblk_t free_blocks; /* number of free blocks in a group */ 1617 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
@@ -1544,10 +1622,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1544 struct ext4_reserve_window_node *my_rsv = NULL; 1622 struct ext4_reserve_window_node *my_rsv = NULL;
1545 struct ext4_block_alloc_info *block_i; 1623 struct ext4_block_alloc_info *block_i;
1546 unsigned short windowsz = 0; 1624 unsigned short windowsz = 0;
1547#ifdef EXT4FS_DEBUG 1625 ext4_group_t ngroups;
1548 static int goal_hits, goal_attempts;
1549#endif
1550 unsigned long ngroups;
1551 unsigned long num = *count; 1626 unsigned long num = *count;
1552 1627
1553 *errp = -ENOSPC; 1628 *errp = -ENOSPC;
@@ -1567,7 +1642,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1567 1642
1568 sbi = EXT4_SB(sb); 1643 sbi = EXT4_SB(sb);
1569 es = EXT4_SB(sb)->s_es; 1644 es = EXT4_SB(sb)->s_es;
1570 ext4_debug("goal=%lu.\n", goal); 1645 ext4_debug("goal=%llu.\n", goal);
1571 /* 1646 /*
1572 * Allocate a block from reservation only when 1647 * Allocate a block from reservation only when
1573 * filesystem is mounted with reservation(default,-o reservation), and 1648 * filesystem is mounted with reservation(default,-o reservation), and
@@ -1677,7 +1752,7 @@ retry_alloc:
1677 1752
1678allocated: 1753allocated:
1679 1754
1680 ext4_debug("using block group %d(%d)\n", 1755 ext4_debug("using block group %lu(%d)\n",
1681 group_no, gdp->bg_free_blocks_count); 1756 group_no, gdp->bg_free_blocks_count);
1682 1757
1683 BUFFER_TRACE(gdp_bh, "get_write_access"); 1758 BUFFER_TRACE(gdp_bh, "get_write_access");
@@ -1692,11 +1767,13 @@ allocated:
1692 in_range(ret_block, ext4_inode_table(sb, gdp), 1767 in_range(ret_block, ext4_inode_table(sb, gdp),
1693 EXT4_SB(sb)->s_itb_per_group) || 1768 EXT4_SB(sb)->s_itb_per_group) ||
1694 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), 1769 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1695 EXT4_SB(sb)->s_itb_per_group)) 1770 EXT4_SB(sb)->s_itb_per_group)) {
1696 ext4_error(sb, "ext4_new_block", 1771 ext4_error(sb, "ext4_new_block",
1697 "Allocating block in system zone - " 1772 "Allocating block in system zone - "
1698 "blocks from %llu, length %lu", 1773 "blocks from %llu, length %lu",
1699 ret_block, num); 1774 ret_block, num);
1775 goto out;
1776 }
1700 1777
1701 performed_allocation = 1; 1778 performed_allocation = 1;
1702 1779
@@ -1743,9 +1820,6 @@ allocated:
1743 * list of some description. We don't know in advance whether 1820 * list of some description. We don't know in advance whether
1744 * the caller wants to use it as metadata or data. 1821 * the caller wants to use it as metadata or data.
1745 */ 1822 */
1746 ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
1747 ret_block, goal_hits, goal_attempts);
1748
1749 spin_lock(sb_bgl_lock(sbi, group_no)); 1823 spin_lock(sb_bgl_lock(sbi, group_no));
1750 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1751 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 1825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
@@ -1787,13 +1861,46 @@ out:
1787} 1861}
1788 1862
1789ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, 1863ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1790 ext4_fsblk_t goal, int *errp) 1864 ext4_fsblk_t goal, int *errp)
1791{ 1865{
1792 unsigned long count = 1; 1866 struct ext4_allocation_request ar;
1867 ext4_fsblk_t ret;
1793 1868
1794 return ext4_new_blocks(handle, inode, goal, &count, errp); 1869 if (!test_opt(inode->i_sb, MBALLOC)) {
1870 unsigned long count = 1;
1871 ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
1872 return ret;
1873 }
1874
1875 memset(&ar, 0, sizeof(ar));
1876 ar.inode = inode;
1877 ar.goal = goal;
1878 ar.len = 1;
1879 ret = ext4_mb_new_blocks(handle, &ar, errp);
1880 return ret;
1881}
1882
1883ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1884 ext4_fsblk_t goal, unsigned long *count, int *errp)
1885{
1886 struct ext4_allocation_request ar;
1887 ext4_fsblk_t ret;
1888
1889 if (!test_opt(inode->i_sb, MBALLOC)) {
1890 ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
1891 return ret;
1892 }
1893
1894 memset(&ar, 0, sizeof(ar));
1895 ar.inode = inode;
1896 ar.goal = goal;
1897 ar.len = *count;
1898 ret = ext4_mb_new_blocks(handle, &ar, errp);
1899 *count = ar.len;
1900 return ret;
1795} 1901}
1796 1902
1903
1797/** 1904/**
1798 * ext4_count_free_blocks() -- count filesystem free blocks 1905 * ext4_count_free_blocks() -- count filesystem free blocks
1799 * @sb: superblock 1906 * @sb: superblock
@@ -1804,8 +1911,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1804{ 1911{
1805 ext4_fsblk_t desc_count; 1912 ext4_fsblk_t desc_count;
1806 struct ext4_group_desc *gdp; 1913 struct ext4_group_desc *gdp;
1807 int i; 1914 ext4_group_t i;
1808 unsigned long ngroups = EXT4_SB(sb)->s_groups_count; 1915 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
1809#ifdef EXT4FS_DEBUG 1916#ifdef EXT4FS_DEBUG
1810 struct ext4_super_block *es; 1917 struct ext4_super_block *es;
1811 ext4_fsblk_t bitmap_count; 1918 ext4_fsblk_t bitmap_count;
@@ -1829,14 +1936,14 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1829 continue; 1936 continue;
1830 1937
1831 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 1938 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
1832 printk("group %d: stored = %d, counted = %lu\n", 1939 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1833 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 1940 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1834 bitmap_count += x; 1941 bitmap_count += x;
1835 } 1942 }
1836 brelse(bitmap_bh); 1943 brelse(bitmap_bh);
1837 printk("ext4_count_free_blocks: stored = %llu" 1944 printk("ext4_count_free_blocks: stored = %llu"
1838 ", computed = %llu, %llu\n", 1945 ", computed = %llu, %llu\n",
1839 EXT4_FREE_BLOCKS_COUNT(es), 1946 ext4_free_blocks_count(es),
1840 desc_count, bitmap_count); 1947 desc_count, bitmap_count);
1841 return bitmap_count; 1948 return bitmap_count;
1842#else 1949#else
@@ -1853,7 +1960,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1853#endif 1960#endif
1854} 1961}
1855 1962
1856static inline int test_root(int a, int b) 1963static inline int test_root(ext4_group_t a, int b)
1857{ 1964{
1858 int num = b; 1965 int num = b;
1859 1966
@@ -1862,7 +1969,7 @@ static inline int test_root(int a, int b)
1862 return num == a; 1969 return num == a;
1863} 1970}
1864 1971
1865static int ext4_group_sparse(int group) 1972static int ext4_group_sparse(ext4_group_t group)
1866{ 1973{
1867 if (group <= 1) 1974 if (group <= 1)
1868 return 1; 1975 return 1;
@@ -1880,7 +1987,7 @@ static int ext4_group_sparse(int group)
1880 * Return the number of blocks used by the superblock (primary or backup) 1987 * Return the number of blocks used by the superblock (primary or backup)
1881 * in this group. Currently this will be only 0 or 1. 1988 * in this group. Currently this will be only 0 or 1.
1882 */ 1989 */
1883int ext4_bg_has_super(struct super_block *sb, int group) 1990int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
1884{ 1991{
1885 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 1992 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1886 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && 1993 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1889,18 +1996,20 @@ int ext4_bg_has_super(struct super_block *sb, int group)
1889 return 1; 1996 return 1;
1890} 1997}
1891 1998
1892static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group) 1999static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
2000 ext4_group_t group)
1893{ 2001{
1894 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); 2002 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1895 unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb); 2003 ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
1896 unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1; 2004 ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
1897 2005
1898 if (group == first || group == first + 1 || group == last) 2006 if (group == first || group == first + 1 || group == last)
1899 return 1; 2007 return 1;
1900 return 0; 2008 return 0;
1901} 2009}
1902 2010
1903static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group) 2011static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
2012 ext4_group_t group)
1904{ 2013{
1905 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 2014 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1906 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && 2015 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1918,7 +2027,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
1918 * (primary or backup) in this group. In the future there may be a 2027 * (primary or backup) in this group. In the future there may be a
1919 * different number of descriptor blocks in each group. 2028 * different number of descriptor blocks in each group.
1920 */ 2029 */
1921unsigned long ext4_bg_num_gdb(struct super_block *sb, int group) 2030unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
1922{ 2031{
1923 unsigned long first_meta_bg = 2032 unsigned long first_meta_bg =
1924 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); 2033 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f612bef98315..33888bb58144 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = le16_to_cpu(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
@@ -124,7 +124,7 @@ static int ext4_readdir(struct file * filp,
124 offset = filp->f_pos & (sb->s_blocksize - 1); 124 offset = filp->f_pos & (sb->s_blocksize - 1);
125 125
126 while (!error && !stored && filp->f_pos < inode->i_size) { 126 while (!error && !stored && filp->f_pos < inode->i_size) {
127 unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 127 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
128 struct buffer_head map_bh; 128 struct buffer_head map_bh;
129 struct buffer_head *bh = NULL; 129 struct buffer_head *bh = NULL;
130 130
@@ -172,10 +172,10 @@ revalidate:
172 * least that it is non-zero. A 172 * least that it is non-zero. A
173 * failure will be detected in the 173 * failure will be detected in the
174 * dirent test below. */ 174 * dirent test below. */
175 if (le16_to_cpu(de->rec_len) < 175 if (ext4_rec_len_from_disk(de->rec_len)
176 EXT4_DIR_REC_LEN(1)) 176 < EXT4_DIR_REC_LEN(1))
177 break; 177 break;
178 i += le16_to_cpu(de->rec_len); 178 i += ext4_rec_len_from_disk(de->rec_len);
179 } 179 }
180 offset = i; 180 offset = i;
181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -197,7 +197,7 @@ revalidate:
197 ret = stored; 197 ret = stored;
198 goto out; 198 goto out;
199 } 199 }
200 offset += le16_to_cpu(de->rec_len); 200 offset += ext4_rec_len_from_disk(de->rec_len);
201 if (le32_to_cpu(de->inode)) { 201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section 202 /* We might block in the next section
203 * if the data destination is 203 * if the data destination is
@@ -219,7 +219,7 @@ revalidate:
219 goto revalidate; 219 goto revalidate;
220 stored ++; 220 stored ++;
221 } 221 }
222 filp->f_pos += le16_to_cpu(de->rec_len); 222 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
223 } 223 }
224 offset = 0; 224 offset = 0;
225 brelse (bh); 225 brelse (bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 85287742f2ae..bc7081f1fbe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -61,7 +61,7 @@ static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
61 * idx_pblock: 61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t 62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */ 63 */
64static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) 64ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{ 65{
66 ext4_fsblk_t block; 66 ext4_fsblk_t block;
67 67
@@ -75,7 +75,7 @@ static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
75 * stores a large physical block number into an extent struct, 75 * stores a large physical block number into an extent struct,
76 * breaking it into parts 76 * breaking it into parts
77 */ 77 */
78static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) 78void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{ 79{
80 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); 80 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
@@ -144,7 +144,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
144 144
145static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, 145static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
146 struct ext4_ext_path *path, 146 struct ext4_ext_path *path,
147 ext4_fsblk_t block) 147 ext4_lblk_t block)
148{ 148{
149 struct ext4_inode_info *ei = EXT4_I(inode); 149 struct ext4_inode_info *ei = EXT4_I(inode);
150 ext4_fsblk_t bg_start; 150 ext4_fsblk_t bg_start;
@@ -367,13 +367,14 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
367 * the header must be checked before calling this 367 * the header must be checked before calling this
368 */ 368 */
369static void 369static void
370ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block) 370ext4_ext_binsearch_idx(struct inode *inode,
371 struct ext4_ext_path *path, ext4_lblk_t block)
371{ 372{
372 struct ext4_extent_header *eh = path->p_hdr; 373 struct ext4_extent_header *eh = path->p_hdr;
373 struct ext4_extent_idx *r, *l, *m; 374 struct ext4_extent_idx *r, *l, *m;
374 375
375 376
376 ext_debug("binsearch for %d(idx): ", block); 377 ext_debug("binsearch for %u(idx): ", block);
377 378
378 l = EXT_FIRST_INDEX(eh) + 1; 379 l = EXT_FIRST_INDEX(eh) + 1;
379 r = EXT_LAST_INDEX(eh); 380 r = EXT_LAST_INDEX(eh);
@@ -425,7 +426,8 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
425 * the header must be checked before calling this 426 * the header must be checked before calling this
426 */ 427 */
427static void 428static void
428ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) 429ext4_ext_binsearch(struct inode *inode,
430 struct ext4_ext_path *path, ext4_lblk_t block)
429{ 431{
430 struct ext4_extent_header *eh = path->p_hdr; 432 struct ext4_extent_header *eh = path->p_hdr;
431 struct ext4_extent *r, *l, *m; 433 struct ext4_extent *r, *l, *m;
@@ -438,7 +440,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
438 return; 440 return;
439 } 441 }
440 442
441 ext_debug("binsearch for %d: ", block); 443 ext_debug("binsearch for %u: ", block);
442 444
443 l = EXT_FIRST_EXTENT(eh) + 1; 445 l = EXT_FIRST_EXTENT(eh) + 1;
444 r = EXT_LAST_EXTENT(eh); 446 r = EXT_LAST_EXTENT(eh);
@@ -494,7 +496,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
494} 496}
495 497
496struct ext4_ext_path * 498struct ext4_ext_path *
497ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) 499ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
500 struct ext4_ext_path *path)
498{ 501{
499 struct ext4_extent_header *eh; 502 struct ext4_extent_header *eh;
500 struct buffer_head *bh; 503 struct buffer_head *bh;
@@ -763,7 +766,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
763 while (k--) { 766 while (k--) {
764 oldblock = newblock; 767 oldblock = newblock;
765 newblock = ablocks[--a]; 768 newblock = ablocks[--a];
766 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock); 769 bh = sb_getblk(inode->i_sb, newblock);
767 if (!bh) { 770 if (!bh) {
768 err = -EIO; 771 err = -EIO;
769 goto cleanup; 772 goto cleanup;
@@ -783,9 +786,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
783 fidx->ei_block = border; 786 fidx->ei_block = border;
784 ext4_idx_store_pblock(fidx, oldblock); 787 ext4_idx_store_pblock(fidx, oldblock);
785 788
786 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i, 789 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
787 newblock, (unsigned long) le32_to_cpu(border), 790 i, newblock, le32_to_cpu(border), oldblock);
788 oldblock);
789 /* copy indexes */ 791 /* copy indexes */
790 m = 0; 792 m = 0;
791 path[i].p_idx++; 793 path[i].p_idx++;
@@ -851,7 +853,7 @@ cleanup:
851 for (i = 0; i < depth; i++) { 853 for (i = 0; i < depth; i++) {
852 if (!ablocks[i]) 854 if (!ablocks[i])
853 continue; 855 continue;
854 ext4_free_blocks(handle, inode, ablocks[i], 1); 856 ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
855 } 857 }
856 } 858 }
857 kfree(ablocks); 859 kfree(ablocks);
@@ -979,8 +981,8 @@ repeat:
979 /* refill path */ 981 /* refill path */
980 ext4_ext_drop_refs(path); 982 ext4_ext_drop_refs(path);
981 path = ext4_ext_find_extent(inode, 983 path = ext4_ext_find_extent(inode,
982 le32_to_cpu(newext->ee_block), 984 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
983 path); 985 path);
984 if (IS_ERR(path)) 986 if (IS_ERR(path))
985 err = PTR_ERR(path); 987 err = PTR_ERR(path);
986 } else { 988 } else {
@@ -992,8 +994,8 @@ repeat:
992 /* refill path */ 994 /* refill path */
993 ext4_ext_drop_refs(path); 995 ext4_ext_drop_refs(path);
994 path = ext4_ext_find_extent(inode, 996 path = ext4_ext_find_extent(inode,
995 le32_to_cpu(newext->ee_block), 997 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
996 path); 998 path);
997 if (IS_ERR(path)) { 999 if (IS_ERR(path)) {
998 err = PTR_ERR(path); 1000 err = PTR_ERR(path);
999 goto out; 1001 goto out;
@@ -1015,13 +1017,157 @@ out:
1015} 1017}
1016 1018
1017/* 1019/*
1020 * search the closest allocated block to the left for *logical
1021 * and returns it at @logical + it's physical address at @phys
1022 * if *logical is the smallest allocated block, the function
1023 * returns 0 at @phys
1024 * return value contains 0 (success) or error code
1025 */
1026int
1027ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1028 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1029{
1030 struct ext4_extent_idx *ix;
1031 struct ext4_extent *ex;
1032 int depth, ee_len;
1033
1034 BUG_ON(path == NULL);
1035 depth = path->p_depth;
1036 *phys = 0;
1037
1038 if (depth == 0 && path->p_ext == NULL)
1039 return 0;
1040
1041 /* usually extent in the path covers blocks smaller
1042 * then *logical, but it can be that extent is the
1043 * first one in the file */
1044
1045 ex = path[depth].p_ext;
1046 ee_len = ext4_ext_get_actual_len(ex);
1047 if (*logical < le32_to_cpu(ex->ee_block)) {
1048 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1049 while (--depth >= 0) {
1050 ix = path[depth].p_idx;
1051 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1052 }
1053 return 0;
1054 }
1055
1056 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1057
1058 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1059 *phys = ext_pblock(ex) + ee_len - 1;
1060 return 0;
1061}
1062
1063/*
1064 * search the closest allocated block to the right for *logical
1065 * and returns it at @logical + it's physical address at @phys
1066 * if *logical is the smallest allocated block, the function
1067 * returns 0 at @phys
1068 * return value contains 0 (success) or error code
1069 */
1070int
1071ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1072 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1073{
1074 struct buffer_head *bh = NULL;
1075 struct ext4_extent_header *eh;
1076 struct ext4_extent_idx *ix;
1077 struct ext4_extent *ex;
1078 ext4_fsblk_t block;
1079 int depth, ee_len;
1080
1081 BUG_ON(path == NULL);
1082 depth = path->p_depth;
1083 *phys = 0;
1084
1085 if (depth == 0 && path->p_ext == NULL)
1086 return 0;
1087
1088 /* usually extent in the path covers blocks smaller
1089 * then *logical, but it can be that extent is the
1090 * first one in the file */
1091
1092 ex = path[depth].p_ext;
1093 ee_len = ext4_ext_get_actual_len(ex);
1094 if (*logical < le32_to_cpu(ex->ee_block)) {
1095 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1096 while (--depth >= 0) {
1097 ix = path[depth].p_idx;
1098 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1099 }
1100 *logical = le32_to_cpu(ex->ee_block);
1101 *phys = ext_pblock(ex);
1102 return 0;
1103 }
1104
1105 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1106
1107 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1108 /* next allocated block in this leaf */
1109 ex++;
1110 *logical = le32_to_cpu(ex->ee_block);
1111 *phys = ext_pblock(ex);
1112 return 0;
1113 }
1114
1115 /* go up and search for index to the right */
1116 while (--depth >= 0) {
1117 ix = path[depth].p_idx;
1118 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1119 break;
1120 }
1121
1122 if (depth < 0) {
1123 /* we've gone up to the root and
1124 * found no index to the right */
1125 return 0;
1126 }
1127
1128 /* we've found index to the right, let's
1129 * follow it and find the closest allocated
1130 * block to the right */
1131 ix++;
1132 block = idx_pblock(ix);
1133 while (++depth < path->p_depth) {
1134 bh = sb_bread(inode->i_sb, block);
1135 if (bh == NULL)
1136 return -EIO;
1137 eh = ext_block_hdr(bh);
1138 if (ext4_ext_check_header(inode, eh, depth)) {
1139 put_bh(bh);
1140 return -EIO;
1141 }
1142 ix = EXT_FIRST_INDEX(eh);
1143 block = idx_pblock(ix);
1144 put_bh(bh);
1145 }
1146
1147 bh = sb_bread(inode->i_sb, block);
1148 if (bh == NULL)
1149 return -EIO;
1150 eh = ext_block_hdr(bh);
1151 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1152 put_bh(bh);
1153 return -EIO;
1154 }
1155 ex = EXT_FIRST_EXTENT(eh);
1156 *logical = le32_to_cpu(ex->ee_block);
1157 *phys = ext_pblock(ex);
1158 put_bh(bh);
1159 return 0;
1160
1161}
1162
1163/*
1018 * ext4_ext_next_allocated_block: 1164 * ext4_ext_next_allocated_block:
1019 * returns allocated block in subsequent extent or EXT_MAX_BLOCK. 1165 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1020 * NOTE: it considers block number from index entry as 1166 * NOTE: it considers block number from index entry as
1021 * allocated block. Thus, index entries have to be consistent 1167 * allocated block. Thus, index entries have to be consistent
1022 * with leaves. 1168 * with leaves.
1023 */ 1169 */
1024static unsigned long 1170static ext4_lblk_t
1025ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1171ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1026{ 1172{
1027 int depth; 1173 int depth;
@@ -1054,7 +1200,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1054 * ext4_ext_next_leaf_block: 1200 * ext4_ext_next_leaf_block:
1055 * returns first allocated block from next leaf or EXT_MAX_BLOCK 1201 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1056 */ 1202 */
1057static unsigned ext4_ext_next_leaf_block(struct inode *inode, 1203static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1058 struct ext4_ext_path *path) 1204 struct ext4_ext_path *path)
1059{ 1205{
1060 int depth; 1206 int depth;
@@ -1072,7 +1218,8 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1072 while (depth >= 0) { 1218 while (depth >= 0) {
1073 if (path[depth].p_idx != 1219 if (path[depth].p_idx !=
1074 EXT_LAST_INDEX(path[depth].p_hdr)) 1220 EXT_LAST_INDEX(path[depth].p_hdr))
1075 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1221 return (ext4_lblk_t)
1222 le32_to_cpu(path[depth].p_idx[1].ei_block);
1076 depth--; 1223 depth--;
1077 } 1224 }
1078 1225
@@ -1085,7 +1232,7 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1085 * then we have to correct all indexes above. 1232 * then we have to correct all indexes above.
1086 * TODO: do we need to correct tree in all cases? 1233 * TODO: do we need to correct tree in all cases?
1087 */ 1234 */
1088int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1235static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1089 struct ext4_ext_path *path) 1236 struct ext4_ext_path *path)
1090{ 1237{
1091 struct ext4_extent_header *eh; 1238 struct ext4_extent_header *eh;
@@ -1171,7 +1318,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1171 if (ext1_ee_len + ext2_ee_len > max_len) 1318 if (ext1_ee_len + ext2_ee_len > max_len)
1172 return 0; 1319 return 0;
1173#ifdef AGGRESSIVE_TEST 1320#ifdef AGGRESSIVE_TEST
1174 if (le16_to_cpu(ex1->ee_len) >= 4) 1321 if (ext1_ee_len >= 4)
1175 return 0; 1322 return 0;
1176#endif 1323#endif
1177 1324
@@ -1239,7 +1386,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1239 struct ext4_extent *newext, 1386 struct ext4_extent *newext,
1240 struct ext4_ext_path *path) 1387 struct ext4_ext_path *path)
1241{ 1388{
1242 unsigned long b1, b2; 1389 ext4_lblk_t b1, b2;
1243 unsigned int depth, len1; 1390 unsigned int depth, len1;
1244 unsigned int ret = 0; 1391 unsigned int ret = 0;
1245 1392
@@ -1260,7 +1407,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1260 goto out; 1407 goto out;
1261 } 1408 }
1262 1409
1263 /* check for wrap through zero */ 1410 /* check for wrap through zero on extent logical start block*/
1264 if (b1 + len1 < b1) { 1411 if (b1 + len1 < b1) {
1265 len1 = EXT_MAX_BLOCK - b1; 1412 len1 = EXT_MAX_BLOCK - b1;
1266 newext->ee_len = cpu_to_le16(len1); 1413 newext->ee_len = cpu_to_le16(len1);
@@ -1290,7 +1437,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1290 struct ext4_extent *ex, *fex; 1437 struct ext4_extent *ex, *fex;
1291 struct ext4_extent *nearex; /* nearest extent */ 1438 struct ext4_extent *nearex; /* nearest extent */
1292 struct ext4_ext_path *npath = NULL; 1439 struct ext4_ext_path *npath = NULL;
1293 int depth, len, err, next; 1440 int depth, len, err;
1441 ext4_lblk_t next;
1294 unsigned uninitialized = 0; 1442 unsigned uninitialized = 0;
1295 1443
1296 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1444 BUG_ON(ext4_ext_get_actual_len(newext) == 0);
@@ -1435,114 +1583,8 @@ cleanup:
1435 return err; 1583 return err;
1436} 1584}
1437 1585
1438int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1439 unsigned long num, ext_prepare_callback func,
1440 void *cbdata)
1441{
1442 struct ext4_ext_path *path = NULL;
1443 struct ext4_ext_cache cbex;
1444 struct ext4_extent *ex;
1445 unsigned long next, start = 0, end = 0;
1446 unsigned long last = block + num;
1447 int depth, exists, err = 0;
1448
1449 BUG_ON(func == NULL);
1450 BUG_ON(inode == NULL);
1451
1452 while (block < last && block != EXT_MAX_BLOCK) {
1453 num = last - block;
1454 /* find extent for this block */
1455 path = ext4_ext_find_extent(inode, block, path);
1456 if (IS_ERR(path)) {
1457 err = PTR_ERR(path);
1458 path = NULL;
1459 break;
1460 }
1461
1462 depth = ext_depth(inode);
1463 BUG_ON(path[depth].p_hdr == NULL);
1464 ex = path[depth].p_ext;
1465 next = ext4_ext_next_allocated_block(path);
1466
1467 exists = 0;
1468 if (!ex) {
1469 /* there is no extent yet, so try to allocate
1470 * all requested space */
1471 start = block;
1472 end = block + num;
1473 } else if (le32_to_cpu(ex->ee_block) > block) {
1474 /* need to allocate space before found extent */
1475 start = block;
1476 end = le32_to_cpu(ex->ee_block);
1477 if (block + num < end)
1478 end = block + num;
1479 } else if (block >= le32_to_cpu(ex->ee_block)
1480 + ext4_ext_get_actual_len(ex)) {
1481 /* need to allocate space after found extent */
1482 start = block;
1483 end = block + num;
1484 if (end >= next)
1485 end = next;
1486 } else if (block >= le32_to_cpu(ex->ee_block)) {
1487 /*
1488 * some part of requested space is covered
1489 * by found extent
1490 */
1491 start = block;
1492 end = le32_to_cpu(ex->ee_block)
1493 + ext4_ext_get_actual_len(ex);
1494 if (block + num < end)
1495 end = block + num;
1496 exists = 1;
1497 } else {
1498 BUG();
1499 }
1500 BUG_ON(end <= start);
1501
1502 if (!exists) {
1503 cbex.ec_block = start;
1504 cbex.ec_len = end - start;
1505 cbex.ec_start = 0;
1506 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1507 } else {
1508 cbex.ec_block = le32_to_cpu(ex->ee_block);
1509 cbex.ec_len = ext4_ext_get_actual_len(ex);
1510 cbex.ec_start = ext_pblock(ex);
1511 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1512 }
1513
1514 BUG_ON(cbex.ec_len == 0);
1515 err = func(inode, path, &cbex, cbdata);
1516 ext4_ext_drop_refs(path);
1517
1518 if (err < 0)
1519 break;
1520 if (err == EXT_REPEAT)
1521 continue;
1522 else if (err == EXT_BREAK) {
1523 err = 0;
1524 break;
1525 }
1526
1527 if (ext_depth(inode) != depth) {
1528 /* depth was changed. we have to realloc path */
1529 kfree(path);
1530 path = NULL;
1531 }
1532
1533 block = cbex.ec_block + cbex.ec_len;
1534 }
1535
1536 if (path) {
1537 ext4_ext_drop_refs(path);
1538 kfree(path);
1539 }
1540
1541 return err;
1542}
1543
1544static void 1586static void
1545ext4_ext_put_in_cache(struct inode *inode, __u32 block, 1587ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1546 __u32 len, ext4_fsblk_t start, int type) 1588 __u32 len, ext4_fsblk_t start, int type)
1547{ 1589{
1548 struct ext4_ext_cache *cex; 1590 struct ext4_ext_cache *cex;
@@ -1561,10 +1603,11 @@ ext4_ext_put_in_cache(struct inode *inode, __u32 block,
1561 */ 1603 */
1562static void 1604static void
1563ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 1605ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1564 unsigned long block) 1606 ext4_lblk_t block)
1565{ 1607{
1566 int depth = ext_depth(inode); 1608 int depth = ext_depth(inode);
1567 unsigned long lblock, len; 1609 unsigned long len;
1610 ext4_lblk_t lblock;
1568 struct ext4_extent *ex; 1611 struct ext4_extent *ex;
1569 1612
1570 ex = path[depth].p_ext; 1613 ex = path[depth].p_ext;
@@ -1576,32 +1619,34 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1576 } else if (block < le32_to_cpu(ex->ee_block)) { 1619 } else if (block < le32_to_cpu(ex->ee_block)) {
1577 lblock = block; 1620 lblock = block;
1578 len = le32_to_cpu(ex->ee_block) - block; 1621 len = le32_to_cpu(ex->ee_block) - block;
1579 ext_debug("cache gap(before): %lu [%lu:%lu]", 1622 ext_debug("cache gap(before): %u [%u:%u]",
1580 (unsigned long) block, 1623 block,
1581 (unsigned long) le32_to_cpu(ex->ee_block), 1624 le32_to_cpu(ex->ee_block),
1582 (unsigned long) ext4_ext_get_actual_len(ex)); 1625 ext4_ext_get_actual_len(ex));
1583 } else if (block >= le32_to_cpu(ex->ee_block) 1626 } else if (block >= le32_to_cpu(ex->ee_block)
1584 + ext4_ext_get_actual_len(ex)) { 1627 + ext4_ext_get_actual_len(ex)) {
1628 ext4_lblk_t next;
1585 lblock = le32_to_cpu(ex->ee_block) 1629 lblock = le32_to_cpu(ex->ee_block)
1586 + ext4_ext_get_actual_len(ex); 1630 + ext4_ext_get_actual_len(ex);
1587 len = ext4_ext_next_allocated_block(path); 1631
1588 ext_debug("cache gap(after): [%lu:%lu] %lu", 1632 next = ext4_ext_next_allocated_block(path);
1589 (unsigned long) le32_to_cpu(ex->ee_block), 1633 ext_debug("cache gap(after): [%u:%u] %u",
1590 (unsigned long) ext4_ext_get_actual_len(ex), 1634 le32_to_cpu(ex->ee_block),
1591 (unsigned long) block); 1635 ext4_ext_get_actual_len(ex),
1592 BUG_ON(len == lblock); 1636 block);
1593 len = len - lblock; 1637 BUG_ON(next == lblock);
1638 len = next - lblock;
1594 } else { 1639 } else {
1595 lblock = len = 0; 1640 lblock = len = 0;
1596 BUG(); 1641 BUG();
1597 } 1642 }
1598 1643
1599 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len); 1644 ext_debug(" -> %u:%lu\n", lblock, len);
1600 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); 1645 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1601} 1646}
1602 1647
1603static int 1648static int
1604ext4_ext_in_cache(struct inode *inode, unsigned long block, 1649ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1605 struct ext4_extent *ex) 1650 struct ext4_extent *ex)
1606{ 1651{
1607 struct ext4_ext_cache *cex; 1652 struct ext4_ext_cache *cex;
@@ -1618,11 +1663,9 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
1618 ex->ee_block = cpu_to_le32(cex->ec_block); 1663 ex->ee_block = cpu_to_le32(cex->ec_block);
1619 ext4_ext_store_pblock(ex, cex->ec_start); 1664 ext4_ext_store_pblock(ex, cex->ec_start);
1620 ex->ee_len = cpu_to_le16(cex->ec_len); 1665 ex->ee_len = cpu_to_le16(cex->ec_len);
1621 ext_debug("%lu cached by %lu:%lu:%llu\n", 1666 ext_debug("%u cached by %u:%u:%llu\n",
1622 (unsigned long) block, 1667 block,
1623 (unsigned long) cex->ec_block, 1668 cex->ec_block, cex->ec_len, cex->ec_start);
1624 (unsigned long) cex->ec_len,
1625 cex->ec_start);
1626 return cex->ec_type; 1669 return cex->ec_type;
1627 } 1670 }
1628 1671
@@ -1636,7 +1679,7 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
1636 * It's used in truncate case only, thus all requests are for 1679 * It's used in truncate case only, thus all requests are for
1637 * last index in the block only. 1680 * last index in the block only.
1638 */ 1681 */
1639int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1682static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1640 struct ext4_ext_path *path) 1683 struct ext4_ext_path *path)
1641{ 1684{
1642 struct buffer_head *bh; 1685 struct buffer_head *bh;
@@ -1657,7 +1700,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1657 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1700 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1658 bh = sb_find_get_block(inode->i_sb, leaf); 1701 bh = sb_find_get_block(inode->i_sb, leaf);
1659 ext4_forget(handle, 1, inode, bh, leaf); 1702 ext4_forget(handle, 1, inode, bh, leaf);
1660 ext4_free_blocks(handle, inode, leaf, 1); 1703 ext4_free_blocks(handle, inode, leaf, 1, 1);
1661 return err; 1704 return err;
1662} 1705}
1663 1706
@@ -1666,7 +1709,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1666 * This routine returns max. credits that the extent tree can consume. 1709 * This routine returns max. credits that the extent tree can consume.
1667 * It should be OK for low-performance paths like ->writepage() 1710 * It should be OK for low-performance paths like ->writepage()
1668 * To allow many writing processes to fit into a single transaction, 1711 * To allow many writing processes to fit into a single transaction,
1669 * the caller should calculate credits under truncate_mutex and 1712 * the caller should calculate credits under i_data_sem and
1670 * pass the actual path. 1713 * pass the actual path.
1671 */ 1714 */
1672int ext4_ext_calc_credits_for_insert(struct inode *inode, 1715int ext4_ext_calc_credits_for_insert(struct inode *inode,
@@ -1714,12 +1757,14 @@ int ext4_ext_calc_credits_for_insert(struct inode *inode,
1714 1757
1715static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 1758static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1716 struct ext4_extent *ex, 1759 struct ext4_extent *ex,
1717 unsigned long from, unsigned long to) 1760 ext4_lblk_t from, ext4_lblk_t to)
1718{ 1761{
1719 struct buffer_head *bh; 1762 struct buffer_head *bh;
1720 unsigned short ee_len = ext4_ext_get_actual_len(ex); 1763 unsigned short ee_len = ext4_ext_get_actual_len(ex);
1721 int i; 1764 int i, metadata = 0;
1722 1765
1766 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1767 metadata = 1;
1723#ifdef EXTENTS_STATS 1768#ifdef EXTENTS_STATS
1724 { 1769 {
1725 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1770 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1738,42 +1783,45 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1738 if (from >= le32_to_cpu(ex->ee_block) 1783 if (from >= le32_to_cpu(ex->ee_block)
1739 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 1784 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1740 /* tail removal */ 1785 /* tail removal */
1741 unsigned long num; 1786 ext4_lblk_t num;
1742 ext4_fsblk_t start; 1787 ext4_fsblk_t start;
1788
1743 num = le32_to_cpu(ex->ee_block) + ee_len - from; 1789 num = le32_to_cpu(ex->ee_block) + ee_len - from;
1744 start = ext_pblock(ex) + ee_len - num; 1790 start = ext_pblock(ex) + ee_len - num;
1745 ext_debug("free last %lu blocks starting %llu\n", num, start); 1791 ext_debug("free last %u blocks starting %llu\n", num, start);
1746 for (i = 0; i < num; i++) { 1792 for (i = 0; i < num; i++) {
1747 bh = sb_find_get_block(inode->i_sb, start + i); 1793 bh = sb_find_get_block(inode->i_sb, start + i);
1748 ext4_forget(handle, 0, inode, bh, start + i); 1794 ext4_forget(handle, 0, inode, bh, start + i);
1749 } 1795 }
1750 ext4_free_blocks(handle, inode, start, num); 1796 ext4_free_blocks(handle, inode, start, num, metadata);
1751 } else if (from == le32_to_cpu(ex->ee_block) 1797 } else if (from == le32_to_cpu(ex->ee_block)
1752 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 1798 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1753 printk("strange request: removal %lu-%lu from %u:%u\n", 1799 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
1754 from, to, le32_to_cpu(ex->ee_block), ee_len); 1800 from, to, le32_to_cpu(ex->ee_block), ee_len);
1755 } else { 1801 } else {
1756 printk("strange request: removal(2) %lu-%lu from %u:%u\n", 1802 printk(KERN_INFO "strange request: removal(2) "
1757 from, to, le32_to_cpu(ex->ee_block), ee_len); 1803 "%u-%u from %u:%u\n",
1804 from, to, le32_to_cpu(ex->ee_block), ee_len);
1758 } 1805 }
1759 return 0; 1806 return 0;
1760} 1807}
1761 1808
1762static int 1809static int
1763ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 1810ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1764 struct ext4_ext_path *path, unsigned long start) 1811 struct ext4_ext_path *path, ext4_lblk_t start)
1765{ 1812{
1766 int err = 0, correct_index = 0; 1813 int err = 0, correct_index = 0;
1767 int depth = ext_depth(inode), credits; 1814 int depth = ext_depth(inode), credits;
1768 struct ext4_extent_header *eh; 1815 struct ext4_extent_header *eh;
1769 unsigned a, b, block, num; 1816 ext4_lblk_t a, b, block;
1770 unsigned long ex_ee_block; 1817 unsigned num;
1818 ext4_lblk_t ex_ee_block;
1771 unsigned short ex_ee_len; 1819 unsigned short ex_ee_len;
1772 unsigned uninitialized = 0; 1820 unsigned uninitialized = 0;
1773 struct ext4_extent *ex; 1821 struct ext4_extent *ex;
1774 1822
1775 /* the header must be checked already in ext4_ext_remove_space() */ 1823 /* the header must be checked already in ext4_ext_remove_space() */
1776 ext_debug("truncate since %lu in leaf\n", start); 1824 ext_debug("truncate since %u in leaf\n", start);
1777 if (!path[depth].p_hdr) 1825 if (!path[depth].p_hdr)
1778 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 1826 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1779 eh = path[depth].p_hdr; 1827 eh = path[depth].p_hdr;
@@ -1904,7 +1952,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
1904 return 1; 1952 return 1;
1905} 1953}
1906 1954
1907int ext4_ext_remove_space(struct inode *inode, unsigned long start) 1955static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
1908{ 1956{
1909 struct super_block *sb = inode->i_sb; 1957 struct super_block *sb = inode->i_sb;
1910 int depth = ext_depth(inode); 1958 int depth = ext_depth(inode);
@@ -1912,7 +1960,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1912 handle_t *handle; 1960 handle_t *handle;
1913 int i = 0, err = 0; 1961 int i = 0, err = 0;
1914 1962
1915 ext_debug("truncate since %lu\n", start); 1963 ext_debug("truncate since %u\n", start);
1916 1964
1917 /* probably first extent we're gonna free will be last in block */ 1965 /* probably first extent we're gonna free will be last in block */
1918 handle = ext4_journal_start(inode, depth + 1); 1966 handle = ext4_journal_start(inode, depth + 1);
@@ -2094,17 +2142,19 @@ void ext4_ext_release(struct super_block *sb)
2094 * b> Splits in two extents: Write is happening at either end of the extent 2142 * b> Splits in two extents: Write is happening at either end of the extent
2095 * c> Splits in three extents: Somone is writing in middle of the extent 2143 * c> Splits in three extents: Somone is writing in middle of the extent
2096 */ 2144 */
2097int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, 2145static int ext4_ext_convert_to_initialized(handle_t *handle,
2098 struct ext4_ext_path *path, 2146 struct inode *inode,
2099 ext4_fsblk_t iblock, 2147 struct ext4_ext_path *path,
2100 unsigned long max_blocks) 2148 ext4_lblk_t iblock,
2149 unsigned long max_blocks)
2101{ 2150{
2102 struct ext4_extent *ex, newex; 2151 struct ext4_extent *ex, newex;
2103 struct ext4_extent *ex1 = NULL; 2152 struct ext4_extent *ex1 = NULL;
2104 struct ext4_extent *ex2 = NULL; 2153 struct ext4_extent *ex2 = NULL;
2105 struct ext4_extent *ex3 = NULL; 2154 struct ext4_extent *ex3 = NULL;
2106 struct ext4_extent_header *eh; 2155 struct ext4_extent_header *eh;
2107 unsigned int allocated, ee_block, ee_len, depth; 2156 ext4_lblk_t ee_block;
2157 unsigned int allocated, ee_len, depth;
2108 ext4_fsblk_t newblock; 2158 ext4_fsblk_t newblock;
2109 int err = 0; 2159 int err = 0;
2110 int ret = 0; 2160 int ret = 0;
@@ -2225,8 +2275,13 @@ out:
2225 return err ? err : allocated; 2275 return err ? err : allocated;
2226} 2276}
2227 2277
2278/*
2279 * Need to be called with
2280 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
2281 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
2282 */
2228int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2283int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2229 ext4_fsblk_t iblock, 2284 ext4_lblk_t iblock,
2230 unsigned long max_blocks, struct buffer_head *bh_result, 2285 unsigned long max_blocks, struct buffer_head *bh_result,
2231 int create, int extend_disksize) 2286 int create, int extend_disksize)
2232{ 2287{
@@ -2236,11 +2291,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2236 ext4_fsblk_t goal, newblock; 2291 ext4_fsblk_t goal, newblock;
2237 int err = 0, depth, ret; 2292 int err = 0, depth, ret;
2238 unsigned long allocated = 0; 2293 unsigned long allocated = 0;
2294 struct ext4_allocation_request ar;
2239 2295
2240 __clear_bit(BH_New, &bh_result->b_state); 2296 __clear_bit(BH_New, &bh_result->b_state);
2241 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock, 2297 ext_debug("blocks %u/%lu requested for inode %u\n",
2242 max_blocks, (unsigned) inode->i_ino); 2298 iblock, max_blocks, inode->i_ino);
2243 mutex_lock(&EXT4_I(inode)->truncate_mutex);
2244 2299
2245 /* check in cache */ 2300 /* check in cache */
2246 goal = ext4_ext_in_cache(inode, iblock, &newex); 2301 goal = ext4_ext_in_cache(inode, iblock, &newex);
@@ -2260,7 +2315,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2260 - le32_to_cpu(newex.ee_block) 2315 - le32_to_cpu(newex.ee_block)
2261 + ext_pblock(&newex); 2316 + ext_pblock(&newex);
2262 /* number of remaining blocks in the extent */ 2317 /* number of remaining blocks in the extent */
2263 allocated = le16_to_cpu(newex.ee_len) - 2318 allocated = ext4_ext_get_actual_len(&newex) -
2264 (iblock - le32_to_cpu(newex.ee_block)); 2319 (iblock - le32_to_cpu(newex.ee_block));
2265 goto out; 2320 goto out;
2266 } else { 2321 } else {
@@ -2288,7 +2343,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2288 2343
2289 ex = path[depth].p_ext; 2344 ex = path[depth].p_ext;
2290 if (ex) { 2345 if (ex) {
2291 unsigned long ee_block = le32_to_cpu(ex->ee_block); 2346 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2292 ext4_fsblk_t ee_start = ext_pblock(ex); 2347 ext4_fsblk_t ee_start = ext_pblock(ex);
2293 unsigned short ee_len; 2348 unsigned short ee_len;
2294 2349
@@ -2302,7 +2357,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2302 newblock = iblock - ee_block + ee_start; 2357 newblock = iblock - ee_block + ee_start;
2303 /* number of remaining blocks in the extent */ 2358 /* number of remaining blocks in the extent */
2304 allocated = ee_len - (iblock - ee_block); 2359 allocated = ee_len - (iblock - ee_block);
2305 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, 2360 ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
2306 ee_block, ee_len, newblock); 2361 ee_block, ee_len, newblock);
2307 2362
2308 /* Do not put uninitialized extent in the cache */ 2363 /* Do not put uninitialized extent in the cache */
@@ -2320,9 +2375,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2320 ret = ext4_ext_convert_to_initialized(handle, inode, 2375 ret = ext4_ext_convert_to_initialized(handle, inode,
2321 path, iblock, 2376 path, iblock,
2322 max_blocks); 2377 max_blocks);
2323 if (ret <= 0) 2378 if (ret <= 0) {
2379 err = ret;
2324 goto out2; 2380 goto out2;
2325 else 2381 } else
2326 allocated = ret; 2382 allocated = ret;
2327 goto outnew; 2383 goto outnew;
2328 } 2384 }
@@ -2347,8 +2403,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2347 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) 2403 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2348 ext4_init_block_alloc_info(inode); 2404 ext4_init_block_alloc_info(inode);
2349 2405
2350 /* allocate new block */ 2406 /* find neighbour allocated blocks */
2351 goal = ext4_ext_find_goal(inode, path, iblock); 2407 ar.lleft = iblock;
2408 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2409 if (err)
2410 goto out2;
2411 ar.lright = iblock;
2412 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2413 if (err)
2414 goto out2;
2352 2415
2353 /* 2416 /*
2354 * See if request is beyond maximum number of blocks we can have in 2417 * See if request is beyond maximum number of blocks we can have in
@@ -2368,10 +2431,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2368 newex.ee_len = cpu_to_le16(max_blocks); 2431 newex.ee_len = cpu_to_le16(max_blocks);
2369 err = ext4_ext_check_overlap(inode, &newex, path); 2432 err = ext4_ext_check_overlap(inode, &newex, path);
2370 if (err) 2433 if (err)
2371 allocated = le16_to_cpu(newex.ee_len); 2434 allocated = ext4_ext_get_actual_len(&newex);
2372 else 2435 else
2373 allocated = max_blocks; 2436 allocated = max_blocks;
2374 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err); 2437
2438 /* allocate new block */
2439 ar.inode = inode;
2440 ar.goal = ext4_ext_find_goal(inode, path, iblock);
2441 ar.logical = iblock;
2442 ar.len = allocated;
2443 if (S_ISREG(inode->i_mode))
2444 ar.flags = EXT4_MB_HINT_DATA;
2445 else
2446 /* disable in-core preallocation for non-regular files */
2447 ar.flags = 0;
2448 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2375 if (!newblock) 2449 if (!newblock)
2376 goto out2; 2450 goto out2;
2377 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2451 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@@ -2379,14 +2453,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2379 2453
2380 /* try to insert new extent into found leaf and return */ 2454 /* try to insert new extent into found leaf and return */
2381 ext4_ext_store_pblock(&newex, newblock); 2455 ext4_ext_store_pblock(&newex, newblock);
2382 newex.ee_len = cpu_to_le16(allocated); 2456 newex.ee_len = cpu_to_le16(ar.len);
2383 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2457 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
2384 ext4_ext_mark_uninitialized(&newex); 2458 ext4_ext_mark_uninitialized(&newex);
2385 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2459 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2386 if (err) { 2460 if (err) {
2387 /* free data blocks we just allocated */ 2461 /* free data blocks we just allocated */
2462 /* not a good idea to call discard here directly,
2463 * but otherwise we'd need to call it every free() */
2464 ext4_mb_discard_inode_preallocations(inode);
2388 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2465 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2389 le16_to_cpu(newex.ee_len)); 2466 ext4_ext_get_actual_len(&newex), 0);
2390 goto out2; 2467 goto out2;
2391 } 2468 }
2392 2469
@@ -2395,6 +2472,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2395 2472
2396 /* previous routine could use block we allocated */ 2473 /* previous routine could use block we allocated */
2397 newblock = ext_pblock(&newex); 2474 newblock = ext_pblock(&newex);
2475 allocated = ext4_ext_get_actual_len(&newex);
2398outnew: 2476outnew:
2399 __set_bit(BH_New, &bh_result->b_state); 2477 __set_bit(BH_New, &bh_result->b_state);
2400 2478
@@ -2414,8 +2492,6 @@ out2:
2414 ext4_ext_drop_refs(path); 2492 ext4_ext_drop_refs(path);
2415 kfree(path); 2493 kfree(path);
2416 } 2494 }
2417 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2418
2419 return err ? err : allocated; 2495 return err ? err : allocated;
2420} 2496}
2421 2497
@@ -2423,7 +2499,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2423{ 2499{
2424 struct address_space *mapping = inode->i_mapping; 2500 struct address_space *mapping = inode->i_mapping;
2425 struct super_block *sb = inode->i_sb; 2501 struct super_block *sb = inode->i_sb;
2426 unsigned long last_block; 2502 ext4_lblk_t last_block;
2427 handle_t *handle; 2503 handle_t *handle;
2428 int err = 0; 2504 int err = 0;
2429 2505
@@ -2445,9 +2521,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2445 if (page) 2521 if (page)
2446 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 2522 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2447 2523
2448 mutex_lock(&EXT4_I(inode)->truncate_mutex); 2524 down_write(&EXT4_I(inode)->i_data_sem);
2449 ext4_ext_invalidate_cache(inode); 2525 ext4_ext_invalidate_cache(inode);
2450 2526
2527 ext4_mb_discard_inode_preallocations(inode);
2528
2451 /* 2529 /*
2452 * TODO: optimization is possible here. 2530 * TODO: optimization is possible here.
2453 * Probably we need not scan at all, 2531 * Probably we need not scan at all,
@@ -2481,7 +2559,7 @@ out_stop:
2481 if (inode->i_nlink) 2559 if (inode->i_nlink)
2482 ext4_orphan_del(handle, inode); 2560 ext4_orphan_del(handle, inode);
2483 2561
2484 mutex_unlock(&EXT4_I(inode)->truncate_mutex); 2562 up_write(&EXT4_I(inode)->i_data_sem);
2485 ext4_journal_stop(handle); 2563 ext4_journal_stop(handle);
2486} 2564}
2487 2565
@@ -2516,7 +2594,8 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2516long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 2594long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2517{ 2595{
2518 handle_t *handle; 2596 handle_t *handle;
2519 ext4_fsblk_t block, max_blocks; 2597 ext4_lblk_t block;
2598 unsigned long max_blocks;
2520 ext4_fsblk_t nblocks = 0; 2599 ext4_fsblk_t nblocks = 0;
2521 int ret = 0; 2600 int ret = 0;
2522 int ret2 = 0; 2601 int ret2 = 0;
@@ -2544,6 +2623,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2544 * modify 1 super block, 1 block bitmap and 1 group descriptor. 2623 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2545 */ 2624 */
2546 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; 2625 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
2626 down_write((&EXT4_I(inode)->i_data_sem));
2547retry: 2627retry:
2548 while (ret >= 0 && ret < max_blocks) { 2628 while (ret >= 0 && ret < max_blocks) {
2549 block = block + ret; 2629 block = block + ret;
@@ -2557,12 +2637,12 @@ retry:
2557 ret = ext4_ext_get_blocks(handle, inode, block, 2637 ret = ext4_ext_get_blocks(handle, inode, block,
2558 max_blocks, &map_bh, 2638 max_blocks, &map_bh,
2559 EXT4_CREATE_UNINITIALIZED_EXT, 0); 2639 EXT4_CREATE_UNINITIALIZED_EXT, 0);
2560 WARN_ON(!ret); 2640 WARN_ON(ret <= 0);
2561 if (!ret) { 2641 if (ret <= 0) {
2562 ext4_error(inode->i_sb, "ext4_fallocate", 2642 ext4_error(inode->i_sb, "ext4_fallocate",
2563 "ext4_ext_get_blocks returned 0! inode#%lu" 2643 "ext4_ext_get_blocks returned error: "
2564 ", block=%llu, max_blocks=%llu", 2644 "inode#%lu, block=%u, max_blocks=%lu",
2565 inode->i_ino, block, max_blocks); 2645 inode->i_ino, block, max_blocks);
2566 ret = -EIO; 2646 ret = -EIO;
2567 ext4_mark_inode_dirty(handle, inode); 2647 ext4_mark_inode_dirty(handle, inode);
2568 ret2 = ext4_journal_stop(handle); 2648 ret2 = ext4_journal_stop(handle);
@@ -2600,6 +2680,7 @@ retry:
2600 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2680 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2601 goto retry; 2681 goto retry;
2602 2682
2683 up_write((&EXT4_I(inode)->i_data_sem));
2603 /* 2684 /*
2604 * Time to update the file size. 2685 * Time to update the file size.
2605 * Update only when preallocation was requested beyond the file size. 2686 * Update only when preallocation was requested beyond the file size.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a81cd66d63b..ac35ec58db55 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -37,9 +37,9 @@ static int ext4_release_file (struct inode * inode, struct file * filp)
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 mutex_lock(&EXT4_I(inode)->truncate_mutex); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_reservation(inode);
42 mutex_unlock(&EXT4_I(inode)->truncate_mutex); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
45 ext4_htree_free_dir_info(filp->private_data); 45 ext4_htree_free_dir_info(filp->private_data);
@@ -56,8 +56,25 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
56 ssize_t ret; 56 ssize_t ret;
57 int err; 57 int err;
58 58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 59 /*
60 * If we have encountered a bitmap-format file, the size limit
61 * is smaller than s_maxbytes, which is for extent-mapped files.
62 */
63
64 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
65 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
66 size_t length = iov_length(iov, nr_segs);
60 67
68 if (pos > sbi->s_bitmap_maxbytes)
69 return -EFBIG;
70
71 if (pos + length > sbi->s_bitmap_maxbytes) {
72 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
73 sbi->s_bitmap_maxbytes - pos);
74 }
75 }
76
77 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
61 /* 78 /*
62 * Skip flushing if there was an error, or if nothing was written. 79 * Skip flushing if there was an error, or if nothing was written.
63 */ 80 */
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 1577910bb58b..7eb0604e7eea 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -14,14 +14,16 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, 14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp); 15 struct ext4_group_desc *gdp);
16struct buffer_head *read_block_bitmap(struct super_block *sb, 16struct buffer_head *read_block_bitmap(struct super_block *sb,
17 unsigned int block_group); 17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb, 18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh, int group, 19 struct buffer_head *bh,
20 ext4_group_t group,
20 struct ext4_group_desc *desc); 21 struct ext4_group_desc *desc);
21#define ext4_free_blocks_after_init(sb, group, desc) \ 22#define ext4_free_blocks_after_init(sb, group, desc) \
22 ext4_init_block_bitmap(sb, NULL, group, desc) 23 ext4_init_block_bitmap(sb, NULL, group, desc)
23extern unsigned ext4_init_inode_bitmap(struct super_block *sb, 24extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
24 struct buffer_head *bh, int group, 25 struct buffer_head *bh,
26 ext4_group_t group,
25 struct ext4_group_desc *desc); 27 struct ext4_group_desc *desc);
26extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); 28extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
27#endif /* _LINUX_EXT4_GROUP_H */ 29#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c61f37fd3f05..575b5215c808 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,8 +64,8 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
64} 64}
65 65
66/* Initializes an uninitialized inode bitmap */ 66/* Initializes an uninitialized inode bitmap */
67unsigned ext4_init_inode_bitmap(struct super_block *sb, 67unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
68 struct buffer_head *bh, int block_group, 68 ext4_group_t block_group,
69 struct ext4_group_desc *gdp) 69 struct ext4_group_desc *gdp)
70{ 70{
71 struct ext4_sb_info *sbi = EXT4_SB(sb); 71 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -75,7 +75,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
75 /* If checksum is bad mark all blocks and inodes use to prevent 75 /* If checksum is bad mark all blocks and inodes use to prevent
76 * allocation, essentially implementing a per-group read-only flag. */ 76 * allocation, essentially implementing a per-group read-only flag. */
77 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 77 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
78 ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n", 78 ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n",
79 block_group); 79 block_group);
80 gdp->bg_free_blocks_count = 0; 80 gdp->bg_free_blocks_count = 0;
81 gdp->bg_free_inodes_count = 0; 81 gdp->bg_free_inodes_count = 0;
@@ -98,7 +98,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
98 * Return buffer_head of bitmap on success or NULL. 98 * Return buffer_head of bitmap on success or NULL.
99 */ 99 */
100static struct buffer_head * 100static struct buffer_head *
101read_inode_bitmap(struct super_block * sb, unsigned long block_group) 101read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
102{ 102{
103 struct ext4_group_desc *desc; 103 struct ext4_group_desc *desc;
104 struct buffer_head *bh = NULL; 104 struct buffer_head *bh = NULL;
@@ -152,7 +152,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
152 unsigned long ino; 152 unsigned long ino;
153 struct buffer_head *bitmap_bh = NULL; 153 struct buffer_head *bitmap_bh = NULL;
154 struct buffer_head *bh2; 154 struct buffer_head *bh2;
155 unsigned long block_group; 155 ext4_group_t block_group;
156 unsigned long bit; 156 unsigned long bit;
157 struct ext4_group_desc * gdp; 157 struct ext4_group_desc * gdp;
158 struct ext4_super_block * es; 158 struct ext4_super_block * es;
@@ -260,12 +260,14 @@ error_return:
260 * For other inodes, search forward from the parent directory\'s block 260 * For other inodes, search forward from the parent directory\'s block
261 * group to find a free inode. 261 * group to find a free inode.
262 */ 262 */
263static int find_group_dir(struct super_block *sb, struct inode *parent) 263static int find_group_dir(struct super_block *sb, struct inode *parent,
264 ext4_group_t *best_group)
264{ 265{
265 int ngroups = EXT4_SB(sb)->s_groups_count; 266 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
266 unsigned int freei, avefreei; 267 unsigned int freei, avefreei;
267 struct ext4_group_desc *desc, *best_desc = NULL; 268 struct ext4_group_desc *desc, *best_desc = NULL;
268 int group, best_group = -1; 269 ext4_group_t group;
270 int ret = -1;
269 271
270 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); 272 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
271 avefreei = freei / ngroups; 273 avefreei = freei / ngroups;
@@ -279,11 +281,12 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
279 if (!best_desc || 281 if (!best_desc ||
280 (le16_to_cpu(desc->bg_free_blocks_count) > 282 (le16_to_cpu(desc->bg_free_blocks_count) >
281 le16_to_cpu(best_desc->bg_free_blocks_count))) { 283 le16_to_cpu(best_desc->bg_free_blocks_count))) {
282 best_group = group; 284 *best_group = group;
283 best_desc = desc; 285 best_desc = desc;
286 ret = 0;
284 } 287 }
285 } 288 }
286 return best_group; 289 return ret;
287} 290}
288 291
289/* 292/*
@@ -314,12 +317,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
314#define INODE_COST 64 317#define INODE_COST 64
315#define BLOCK_COST 256 318#define BLOCK_COST 256
316 319
317static int find_group_orlov(struct super_block *sb, struct inode *parent) 320static int find_group_orlov(struct super_block *sb, struct inode *parent,
321 ext4_group_t *group)
318{ 322{
319 int parent_group = EXT4_I(parent)->i_block_group; 323 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
320 struct ext4_sb_info *sbi = EXT4_SB(sb); 324 struct ext4_sb_info *sbi = EXT4_SB(sb);
321 struct ext4_super_block *es = sbi->s_es; 325 struct ext4_super_block *es = sbi->s_es;
322 int ngroups = sbi->s_groups_count; 326 ext4_group_t ngroups = sbi->s_groups_count;
323 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 327 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
324 unsigned int freei, avefreei; 328 unsigned int freei, avefreei;
325 ext4_fsblk_t freeb, avefreeb; 329 ext4_fsblk_t freeb, avefreeb;
@@ -327,7 +331,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
327 unsigned int ndirs; 331 unsigned int ndirs;
328 int max_debt, max_dirs, min_inodes; 332 int max_debt, max_dirs, min_inodes;
329 ext4_grpblk_t min_blocks; 333 ext4_grpblk_t min_blocks;
330 int group = -1, i; 334 ext4_group_t i;
331 struct ext4_group_desc *desc; 335 struct ext4_group_desc *desc;
332 336
333 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 337 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
@@ -340,13 +344,14 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
340 if ((parent == sb->s_root->d_inode) || 344 if ((parent == sb->s_root->d_inode) ||
341 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 345 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
342 int best_ndir = inodes_per_group; 346 int best_ndir = inodes_per_group;
343 int best_group = -1; 347 ext4_group_t grp;
348 int ret = -1;
344 349
345 get_random_bytes(&group, sizeof(group)); 350 get_random_bytes(&grp, sizeof(grp));
346 parent_group = (unsigned)group % ngroups; 351 parent_group = (unsigned)grp % ngroups;
347 for (i = 0; i < ngroups; i++) { 352 for (i = 0; i < ngroups; i++) {
348 group = (parent_group + i) % ngroups; 353 grp = (parent_group + i) % ngroups;
349 desc = ext4_get_group_desc (sb, group, NULL); 354 desc = ext4_get_group_desc(sb, grp, NULL);
350 if (!desc || !desc->bg_free_inodes_count) 355 if (!desc || !desc->bg_free_inodes_count)
351 continue; 356 continue;
352 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 357 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
@@ -355,11 +360,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
355 continue; 360 continue;
356 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 361 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
357 continue; 362 continue;
358 best_group = group; 363 *group = grp;
364 ret = 0;
359 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 365 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
360 } 366 }
361 if (best_group >= 0) 367 if (ret == 0)
362 return best_group; 368 return ret;
363 goto fallback; 369 goto fallback;
364 } 370 }
365 371
@@ -380,8 +386,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
380 max_debt = 1; 386 max_debt = 1;
381 387
382 for (i = 0; i < ngroups; i++) { 388 for (i = 0; i < ngroups; i++) {
383 group = (parent_group + i) % ngroups; 389 *group = (parent_group + i) % ngroups;
384 desc = ext4_get_group_desc (sb, group, NULL); 390 desc = ext4_get_group_desc(sb, *group, NULL);
385 if (!desc || !desc->bg_free_inodes_count) 391 if (!desc || !desc->bg_free_inodes_count)
386 continue; 392 continue;
387 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 393 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
@@ -390,17 +396,16 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
390 continue; 396 continue;
391 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 397 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
392 continue; 398 continue;
393 return group; 399 return 0;
394 } 400 }
395 401
396fallback: 402fallback:
397 for (i = 0; i < ngroups; i++) { 403 for (i = 0; i < ngroups; i++) {
398 group = (parent_group + i) % ngroups; 404 *group = (parent_group + i) % ngroups;
399 desc = ext4_get_group_desc (sb, group, NULL); 405 desc = ext4_get_group_desc(sb, *group, NULL);
400 if (!desc || !desc->bg_free_inodes_count) 406 if (desc && desc->bg_free_inodes_count &&
401 continue; 407 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
402 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 408 return 0;
403 return group;
404 } 409 }
405 410
406 if (avefreei) { 411 if (avefreei) {
@@ -415,21 +420,22 @@ fallback:
415 return -1; 420 return -1;
416} 421}
417 422
418static int find_group_other(struct super_block *sb, struct inode *parent) 423static int find_group_other(struct super_block *sb, struct inode *parent,
424 ext4_group_t *group)
419{ 425{
420 int parent_group = EXT4_I(parent)->i_block_group; 426 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
421 int ngroups = EXT4_SB(sb)->s_groups_count; 427 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
422 struct ext4_group_desc *desc; 428 struct ext4_group_desc *desc;
423 int group, i; 429 ext4_group_t i;
424 430
425 /* 431 /*
426 * Try to place the inode in its parent directory 432 * Try to place the inode in its parent directory
427 */ 433 */
428 group = parent_group; 434 *group = parent_group;
429 desc = ext4_get_group_desc (sb, group, NULL); 435 desc = ext4_get_group_desc(sb, *group, NULL);
430 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 436 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
431 le16_to_cpu(desc->bg_free_blocks_count)) 437 le16_to_cpu(desc->bg_free_blocks_count))
432 return group; 438 return 0;
433 439
434 /* 440 /*
435 * We're going to place this inode in a different blockgroup from its 441 * We're going to place this inode in a different blockgroup from its
@@ -440,33 +446,33 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
440 * 446 *
441 * So add our directory's i_ino into the starting point for the hash. 447 * So add our directory's i_ino into the starting point for the hash.
442 */ 448 */
443 group = (group + parent->i_ino) % ngroups; 449 *group = (*group + parent->i_ino) % ngroups;
444 450
445 /* 451 /*
446 * Use a quadratic hash to find a group with a free inode and some free 452 * Use a quadratic hash to find a group with a free inode and some free
447 * blocks. 453 * blocks.
448 */ 454 */
449 for (i = 1; i < ngroups; i <<= 1) { 455 for (i = 1; i < ngroups; i <<= 1) {
450 group += i; 456 *group += i;
451 if (group >= ngroups) 457 if (*group >= ngroups)
452 group -= ngroups; 458 *group -= ngroups;
453 desc = ext4_get_group_desc (sb, group, NULL); 459 desc = ext4_get_group_desc(sb, *group, NULL);
454 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 460 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
455 le16_to_cpu(desc->bg_free_blocks_count)) 461 le16_to_cpu(desc->bg_free_blocks_count))
456 return group; 462 return 0;
457 } 463 }
458 464
459 /* 465 /*
460 * That failed: try linear search for a free inode, even if that group 466 * That failed: try linear search for a free inode, even if that group
461 * has no free blocks. 467 * has no free blocks.
462 */ 468 */
463 group = parent_group; 469 *group = parent_group;
464 for (i = 0; i < ngroups; i++) { 470 for (i = 0; i < ngroups; i++) {
465 if (++group >= ngroups) 471 if (++*group >= ngroups)
466 group = 0; 472 *group = 0;
467 desc = ext4_get_group_desc (sb, group, NULL); 473 desc = ext4_get_group_desc(sb, *group, NULL);
468 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 474 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
469 return group; 475 return 0;
470 } 476 }
471 477
472 return -1; 478 return -1;
@@ -487,16 +493,17 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
487 struct super_block *sb; 493 struct super_block *sb;
488 struct buffer_head *bitmap_bh = NULL; 494 struct buffer_head *bitmap_bh = NULL;
489 struct buffer_head *bh2; 495 struct buffer_head *bh2;
490 int group; 496 ext4_group_t group = 0;
491 unsigned long ino = 0; 497 unsigned long ino = 0;
492 struct inode * inode; 498 struct inode * inode;
493 struct ext4_group_desc * gdp = NULL; 499 struct ext4_group_desc * gdp = NULL;
494 struct ext4_super_block * es; 500 struct ext4_super_block * es;
495 struct ext4_inode_info *ei; 501 struct ext4_inode_info *ei;
496 struct ext4_sb_info *sbi; 502 struct ext4_sb_info *sbi;
497 int err = 0; 503 int ret2, err = 0;
498 struct inode *ret; 504 struct inode *ret;
499 int i, free = 0; 505 ext4_group_t i;
506 int free = 0;
500 507
501 /* Cannot create files in a deleted directory */ 508 /* Cannot create files in a deleted directory */
502 if (!dir || !dir->i_nlink) 509 if (!dir || !dir->i_nlink)
@@ -512,14 +519,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
512 es = sbi->s_es; 519 es = sbi->s_es;
513 if (S_ISDIR(mode)) { 520 if (S_ISDIR(mode)) {
514 if (test_opt (sb, OLDALLOC)) 521 if (test_opt (sb, OLDALLOC))
515 group = find_group_dir(sb, dir); 522 ret2 = find_group_dir(sb, dir, &group);
516 else 523 else
517 group = find_group_orlov(sb, dir); 524 ret2 = find_group_orlov(sb, dir, &group);
518 } else 525 } else
519 group = find_group_other(sb, dir); 526 ret2 = find_group_other(sb, dir, &group);
520 527
521 err = -ENOSPC; 528 err = -ENOSPC;
522 if (group == -1) 529 if (ret2 == -1)
523 goto out; 530 goto out;
524 531
525 for (i = 0; i < sbi->s_groups_count; i++) { 532 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -583,7 +590,7 @@ got:
583 ino > EXT4_INODES_PER_GROUP(sb)) { 590 ino > EXT4_INODES_PER_GROUP(sb)) {
584 ext4_error(sb, __FUNCTION__, 591 ext4_error(sb, __FUNCTION__,
585 "reserved inode or inode > inodes count - " 592 "reserved inode or inode > inodes count - "
586 "block_group = %d, inode=%lu", group, 593 "block_group = %lu, inode=%lu", group,
587 ino + group * EXT4_INODES_PER_GROUP(sb)); 594 ino + group * EXT4_INODES_PER_GROUP(sb));
588 err = -EIO; 595 err = -EIO;
589 goto fail; 596 goto fail;
@@ -702,7 +709,6 @@ got:
702 if (!S_ISDIR(mode)) 709 if (!S_ISDIR(mode))
703 ei->i_flags &= ~EXT4_DIRSYNC_FL; 710 ei->i_flags &= ~EXT4_DIRSYNC_FL;
704 ei->i_file_acl = 0; 711 ei->i_file_acl = 0;
705 ei->i_dir_acl = 0;
706 ei->i_dtime = 0; 712 ei->i_dtime = 0;
707 ei->i_block_alloc_info = NULL; 713 ei->i_block_alloc_info = NULL;
708 ei->i_block_group = group; 714 ei->i_block_group = group;
@@ -741,13 +747,10 @@ got:
741 if (test_opt(sb, EXTENTS)) { 747 if (test_opt(sb, EXTENTS)) {
742 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 748 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
743 ext4_ext_tree_init(handle, inode); 749 ext4_ext_tree_init(handle, inode);
744 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 750 err = ext4_update_incompat_feature(handle, sb,
745 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 751 EXT4_FEATURE_INCOMPAT_EXTENTS);
746 if (err) goto fail; 752 if (err)
747 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS); 753 goto fail;
748 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
749 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
750 }
751 } 754 }
752 755
753 ext4_debug("allocating inode %lu\n", inode->i_ino); 756 ext4_debug("allocating inode %lu\n", inode->i_ino);
@@ -777,7 +780,7 @@ fail_drop:
777struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) 780struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
778{ 781{
779 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); 782 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
780 unsigned long block_group; 783 ext4_group_t block_group;
781 int bit; 784 int bit;
782 struct buffer_head *bitmap_bh = NULL; 785 struct buffer_head *bitmap_bh = NULL;
783 struct inode *inode = NULL; 786 struct inode *inode = NULL;
@@ -833,7 +836,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
833{ 836{
834 unsigned long desc_count; 837 unsigned long desc_count;
835 struct ext4_group_desc *gdp; 838 struct ext4_group_desc *gdp;
836 int i; 839 ext4_group_t i;
837#ifdef EXT4FS_DEBUG 840#ifdef EXT4FS_DEBUG
838 struct ext4_super_block *es; 841 struct ext4_super_block *es;
839 unsigned long bitmap_count, x; 842 unsigned long bitmap_count, x;
@@ -854,7 +857,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
854 continue; 857 continue;
855 858
856 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 859 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
857 printk("group %d: stored = %d, counted = %lu\n", 860 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
858 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 861 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
859 bitmap_count += x; 862 bitmap_count += x;
860 } 863 }
@@ -879,7 +882,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
879unsigned long ext4_count_dirs (struct super_block * sb) 882unsigned long ext4_count_dirs (struct super_block * sb)
880{ 883{
881 unsigned long count = 0; 884 unsigned long count = 0;
882 int i; 885 ext4_group_t i;
883 886
884 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 887 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
885 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 888 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5489703d9573..bb717cbb749c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -105,7 +105,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
105 */ 105 */
106static unsigned long blocks_for_truncate(struct inode *inode) 106static unsigned long blocks_for_truncate(struct inode *inode)
107{ 107{
108 unsigned long needed; 108 ext4_lblk_t needed;
109 109
110 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 110 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
111 111
@@ -243,13 +243,6 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
243 p->bh = bh; 243 p->bh = bh;
244} 244}
245 245
246static int verify_chain(Indirect *from, Indirect *to)
247{
248 while (from <= to && from->key == *from->p)
249 from++;
250 return (from > to);
251}
252
253/** 246/**
254 * ext4_block_to_path - parse the block number into array of offsets 247 * ext4_block_to_path - parse the block number into array of offsets
255 * @inode: inode in question (we are only interested in its superblock) 248 * @inode: inode in question (we are only interested in its superblock)
@@ -282,7 +275,8 @@ static int verify_chain(Indirect *from, Indirect *to)
282 */ 275 */
283 276
284static int ext4_block_to_path(struct inode *inode, 277static int ext4_block_to_path(struct inode *inode,
285 long i_block, int offsets[4], int *boundary) 278 ext4_lblk_t i_block,
279 ext4_lblk_t offsets[4], int *boundary)
286{ 280{
287 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 281 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
288 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 282 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -313,7 +307,10 @@ static int ext4_block_to_path(struct inode *inode,
313 offsets[n++] = i_block & (ptrs - 1); 307 offsets[n++] = i_block & (ptrs - 1);
314 final = ptrs; 308 final = ptrs;
315 } else { 309 } else {
316 ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big"); 310 ext4_warning(inode->i_sb, "ext4_block_to_path",
311 "block %lu > max",
312 i_block + direct_blocks +
313 indirect_blocks + double_blocks);
317 } 314 }
318 if (boundary) 315 if (boundary)
319 *boundary = final - 1 - (i_block & (ptrs - 1)); 316 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -344,12 +341,14 @@ static int ext4_block_to_path(struct inode *inode,
344 * (pointer to last triple returned, *@err == 0) 341 * (pointer to last triple returned, *@err == 0)
345 * or when it gets an IO error reading an indirect block 342 * or when it gets an IO error reading an indirect block
346 * (ditto, *@err == -EIO) 343 * (ditto, *@err == -EIO)
347 * or when it notices that chain had been changed while it was reading
348 * (ditto, *@err == -EAGAIN)
349 * or when it reads all @depth-1 indirect blocks successfully and finds 344 * or when it reads all @depth-1 indirect blocks successfully and finds
350 * the whole chain, all way to the data (returns %NULL, *err == 0). 345 * the whole chain, all way to the data (returns %NULL, *err == 0).
346 *
347 * Need to be called with
348 * down_read(&EXT4_I(inode)->i_data_sem)
351 */ 349 */
352static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets, 350static Indirect *ext4_get_branch(struct inode *inode, int depth,
351 ext4_lblk_t *offsets,
353 Indirect chain[4], int *err) 352 Indirect chain[4], int *err)
354{ 353{
355 struct super_block *sb = inode->i_sb; 354 struct super_block *sb = inode->i_sb;
@@ -365,9 +364,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
365 bh = sb_bread(sb, le32_to_cpu(p->key)); 364 bh = sb_bread(sb, le32_to_cpu(p->key));
366 if (!bh) 365 if (!bh)
367 goto failure; 366 goto failure;
368 /* Reader: pointers */
369 if (!verify_chain(chain, p))
370 goto changed;
371 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 367 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
372 /* Reader: end */ 368 /* Reader: end */
373 if (!p->key) 369 if (!p->key)
@@ -375,10 +371,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
375 } 371 }
376 return NULL; 372 return NULL;
377 373
378changed:
379 brelse(bh);
380 *err = -EAGAIN;
381 goto no_block;
382failure: 374failure:
383 *err = -EIO; 375 *err = -EIO;
384no_block: 376no_block:
@@ -445,7 +437,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
445 * stores it in *@goal and returns zero. 437 * stores it in *@goal and returns zero.
446 */ 438 */
447 439
448static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block, 440static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
449 Indirect chain[4], Indirect *partial) 441 Indirect chain[4], Indirect *partial)
450{ 442{
451 struct ext4_block_alloc_info *block_i; 443 struct ext4_block_alloc_info *block_i;
@@ -559,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
559 return ret; 551 return ret;
560failed_out: 552failed_out:
561 for (i = 0; i <index; i++) 553 for (i = 0; i <index; i++)
562 ext4_free_blocks(handle, inode, new_blocks[i], 1); 554 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
563 return ret; 555 return ret;
564} 556}
565 557
@@ -590,7 +582,7 @@ failed_out:
590 */ 582 */
591static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 583static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
592 int indirect_blks, int *blks, ext4_fsblk_t goal, 584 int indirect_blks, int *blks, ext4_fsblk_t goal,
593 int *offsets, Indirect *branch) 585 ext4_lblk_t *offsets, Indirect *branch)
594{ 586{
595 int blocksize = inode->i_sb->s_blocksize; 587 int blocksize = inode->i_sb->s_blocksize;
596 int i, n = 0; 588 int i, n = 0;
@@ -658,9 +650,9 @@ failed:
658 ext4_journal_forget(handle, branch[i].bh); 650 ext4_journal_forget(handle, branch[i].bh);
659 } 651 }
660 for (i = 0; i <indirect_blks; i++) 652 for (i = 0; i <indirect_blks; i++)
661 ext4_free_blocks(handle, inode, new_blocks[i], 1); 653 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
662 654
663 ext4_free_blocks(handle, inode, new_blocks[i], num); 655 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
664 656
665 return err; 657 return err;
666} 658}
@@ -680,7 +672,7 @@ failed:
680 * chain to new block and return 0. 672 * chain to new block and return 0.
681 */ 673 */
682static int ext4_splice_branch(handle_t *handle, struct inode *inode, 674static int ext4_splice_branch(handle_t *handle, struct inode *inode,
683 long block, Indirect *where, int num, int blks) 675 ext4_lblk_t block, Indirect *where, int num, int blks)
684{ 676{
685 int i; 677 int i;
686 int err = 0; 678 int err = 0;
@@ -757,9 +749,10 @@ err_out:
757 for (i = 1; i <= num; i++) { 749 for (i = 1; i <= num; i++) {
758 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 750 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
759 ext4_journal_forget(handle, where[i].bh); 751 ext4_journal_forget(handle, where[i].bh);
760 ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); 752 ext4_free_blocks(handle, inode,
753 le32_to_cpu(where[i-1].key), 1, 0);
761 } 754 }
762 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); 755 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
763 756
764 return err; 757 return err;
765} 758}
@@ -782,14 +775,19 @@ err_out:
782 * return > 0, # of blocks mapped or allocated. 775 * return > 0, # of blocks mapped or allocated.
783 * return = 0, if plain lookup failed. 776 * return = 0, if plain lookup failed.
784 * return < 0, error case. 777 * return < 0, error case.
778 *
779 *
780 * Need to be called with
781 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
782 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
785 */ 783 */
786int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 784int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
787 sector_t iblock, unsigned long maxblocks, 785 ext4_lblk_t iblock, unsigned long maxblocks,
788 struct buffer_head *bh_result, 786 struct buffer_head *bh_result,
789 int create, int extend_disksize) 787 int create, int extend_disksize)
790{ 788{
791 int err = -EIO; 789 int err = -EIO;
792 int offsets[4]; 790 ext4_lblk_t offsets[4];
793 Indirect chain[4]; 791 Indirect chain[4];
794 Indirect *partial; 792 Indirect *partial;
795 ext4_fsblk_t goal; 793 ext4_fsblk_t goal;
@@ -803,7 +801,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
803 801
804 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 802 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
805 J_ASSERT(handle != NULL || create == 0); 803 J_ASSERT(handle != NULL || create == 0);
806 depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 804 depth = ext4_block_to_path(inode, iblock, offsets,
805 &blocks_to_boundary);
807 806
808 if (depth == 0) 807 if (depth == 0)
809 goto out; 808 goto out;
@@ -819,18 +818,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
819 while (count < maxblocks && count <= blocks_to_boundary) { 818 while (count < maxblocks && count <= blocks_to_boundary) {
820 ext4_fsblk_t blk; 819 ext4_fsblk_t blk;
821 820
822 if (!verify_chain(chain, partial)) {
823 /*
824 * Indirect block might be removed by
825 * truncate while we were reading it.
826 * Handling of that case: forget what we've
827 * got now. Flag the err as EAGAIN, so it
828 * will reread.
829 */
830 err = -EAGAIN;
831 count = 0;
832 break;
833 }
834 blk = le32_to_cpu(*(chain[depth-1].p + count)); 821 blk = le32_to_cpu(*(chain[depth-1].p + count));
835 822
836 if (blk == first_block + count) 823 if (blk == first_block + count)
@@ -838,44 +825,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
838 else 825 else
839 break; 826 break;
840 } 827 }
841 if (err != -EAGAIN) 828 goto got_it;
842 goto got_it;
843 } 829 }
844 830
845 /* Next simple case - plain lookup or failed read of indirect block */ 831 /* Next simple case - plain lookup or failed read of indirect block */
846 if (!create || err == -EIO) 832 if (!create || err == -EIO)
847 goto cleanup; 833 goto cleanup;
848 834
849 mutex_lock(&ei->truncate_mutex);
850
851 /*
852 * If the indirect block is missing while we are reading
853 * the chain(ext4_get_branch() returns -EAGAIN err), or
854 * if the chain has been changed after we grab the semaphore,
855 * (either because another process truncated this branch, or
856 * another get_block allocated this branch) re-grab the chain to see if
857 * the request block has been allocated or not.
858 *
859 * Since we already block the truncate/other get_block
860 * at this point, we will have the current copy of the chain when we
861 * splice the branch into the tree.
862 */
863 if (err == -EAGAIN || !verify_chain(chain, partial)) {
864 while (partial > chain) {
865 brelse(partial->bh);
866 partial--;
867 }
868 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
869 if (!partial) {
870 count++;
871 mutex_unlock(&ei->truncate_mutex);
872 if (err)
873 goto cleanup;
874 clear_buffer_new(bh_result);
875 goto got_it;
876 }
877 }
878
879 /* 835 /*
880 * Okay, we need to do block allocation. Lazily initialize the block 836 * Okay, we need to do block allocation. Lazily initialize the block
881 * allocation info here if necessary 837 * allocation info here if necessary
@@ -911,13 +867,12 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
911 err = ext4_splice_branch(handle, inode, iblock, 867 err = ext4_splice_branch(handle, inode, iblock,
912 partial, indirect_blks, count); 868 partial, indirect_blks, count);
913 /* 869 /*
914 * i_disksize growing is protected by truncate_mutex. Don't forget to 870 * i_disksize growing is protected by i_data_sem. Don't forget to
915 * protect it if you're about to implement concurrent 871 * protect it if you're about to implement concurrent
916 * ext4_get_block() -bzzz 872 * ext4_get_block() -bzzz
917 */ 873 */
918 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 874 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
919 ei->i_disksize = inode->i_size; 875 ei->i_disksize = inode->i_size;
920 mutex_unlock(&ei->truncate_mutex);
921 if (err) 876 if (err)
922 goto cleanup; 877 goto cleanup;
923 878
@@ -942,6 +897,47 @@ out:
942 897
943#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) 898#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
944 899
900int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
901 unsigned long max_blocks, struct buffer_head *bh,
902 int create, int extend_disksize)
903{
904 int retval;
905 /*
906 * Try to see if we can get the block without requesting
907 * for new file system block.
908 */
909 down_read((&EXT4_I(inode)->i_data_sem));
910 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
911 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
912 bh, 0, 0);
913 } else {
914 retval = ext4_get_blocks_handle(handle,
915 inode, block, max_blocks, bh, 0, 0);
916 }
917 up_read((&EXT4_I(inode)->i_data_sem));
918 if (!create || (retval > 0))
919 return retval;
920
921 /*
922 * We need to allocate new blocks which will result
923 * in i_data update
924 */
925 down_write((&EXT4_I(inode)->i_data_sem));
926 /*
927 * We need to check for EXT4 here because migrate
928 * could have changed the inode type in between
929 */
930 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
931 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
932 bh, create, extend_disksize);
933 } else {
934 retval = ext4_get_blocks_handle(handle, inode, block,
935 max_blocks, bh, create, extend_disksize);
936 }
937 up_write((&EXT4_I(inode)->i_data_sem));
938 return retval;
939}
940
945static int ext4_get_block(struct inode *inode, sector_t iblock, 941static int ext4_get_block(struct inode *inode, sector_t iblock,
946 struct buffer_head *bh_result, int create) 942 struct buffer_head *bh_result, int create)
947{ 943{
@@ -996,7 +992,7 @@ get_block:
996 * `handle' can be NULL if create is zero 992 * `handle' can be NULL if create is zero
997 */ 993 */
998struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 994struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
999 long block, int create, int *errp) 995 ext4_lblk_t block, int create, int *errp)
1000{ 996{
1001 struct buffer_head dummy; 997 struct buffer_head dummy;
1002 int fatal = 0, err; 998 int fatal = 0, err;
@@ -1063,7 +1059,7 @@ err:
1063} 1059}
1064 1060
1065struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1061struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1066 int block, int create, int *err) 1062 ext4_lblk_t block, int create, int *err)
1067{ 1063{
1068 struct buffer_head * bh; 1064 struct buffer_head * bh;
1069 1065
@@ -1446,7 +1442,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1446 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 1442 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1447 * 1443 *
1448 * Same applies to ext4_get_block(). We will deadlock on various things like 1444 * Same applies to ext4_get_block(). We will deadlock on various things like
1449 * lock_journal and i_truncate_mutex. 1445 * lock_journal and i_data_sem
1450 * 1446 *
1451 * Setting PF_MEMALLOC here doesn't work - too many internal memory 1447 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1452 * allocations fail. 1448 * allocations fail.
@@ -1828,7 +1824,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1828{ 1824{
1829 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 1825 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1830 unsigned offset = from & (PAGE_CACHE_SIZE-1); 1826 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1831 unsigned blocksize, iblock, length, pos; 1827 unsigned blocksize, length, pos;
1828 ext4_lblk_t iblock;
1832 struct inode *inode = mapping->host; 1829 struct inode *inode = mapping->host;
1833 struct buffer_head *bh; 1830 struct buffer_head *bh;
1834 int err = 0; 1831 int err = 0;
@@ -1964,7 +1961,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
1964 * (no partially truncated stuff there). */ 1961 * (no partially truncated stuff there). */
1965 1962
1966static Indirect *ext4_find_shared(struct inode *inode, int depth, 1963static Indirect *ext4_find_shared(struct inode *inode, int depth,
1967 int offsets[4], Indirect chain[4], __le32 *top) 1964 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
1968{ 1965{
1969 Indirect *partial, *p; 1966 Indirect *partial, *p;
1970 int k, err; 1967 int k, err;
@@ -2048,15 +2045,15 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2048 for (p = first; p < last; p++) { 2045 for (p = first; p < last; p++) {
2049 u32 nr = le32_to_cpu(*p); 2046 u32 nr = le32_to_cpu(*p);
2050 if (nr) { 2047 if (nr) {
2051 struct buffer_head *bh; 2048 struct buffer_head *tbh;
2052 2049
2053 *p = 0; 2050 *p = 0;
2054 bh = sb_find_get_block(inode->i_sb, nr); 2051 tbh = sb_find_get_block(inode->i_sb, nr);
2055 ext4_forget(handle, 0, inode, bh, nr); 2052 ext4_forget(handle, 0, inode, tbh, nr);
2056 } 2053 }
2057 } 2054 }
2058 2055
2059 ext4_free_blocks(handle, inode, block_to_free, count); 2056 ext4_free_blocks(handle, inode, block_to_free, count, 0);
2060} 2057}
2061 2058
2062/** 2059/**
@@ -2229,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2229 ext4_journal_test_restart(handle, inode); 2226 ext4_journal_test_restart(handle, inode);
2230 } 2227 }
2231 2228
2232 ext4_free_blocks(handle, inode, nr, 1); 2229 ext4_free_blocks(handle, inode, nr, 1, 1);
2233 2230
2234 if (parent_bh) { 2231 if (parent_bh) {
2235 /* 2232 /*
@@ -2289,12 +2286,12 @@ void ext4_truncate(struct inode *inode)
2289 __le32 *i_data = ei->i_data; 2286 __le32 *i_data = ei->i_data;
2290 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 2287 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2291 struct address_space *mapping = inode->i_mapping; 2288 struct address_space *mapping = inode->i_mapping;
2292 int offsets[4]; 2289 ext4_lblk_t offsets[4];
2293 Indirect chain[4]; 2290 Indirect chain[4];
2294 Indirect *partial; 2291 Indirect *partial;
2295 __le32 nr = 0; 2292 __le32 nr = 0;
2296 int n; 2293 int n;
2297 long last_block; 2294 ext4_lblk_t last_block;
2298 unsigned blocksize = inode->i_sb->s_blocksize; 2295 unsigned blocksize = inode->i_sb->s_blocksize;
2299 struct page *page; 2296 struct page *page;
2300 2297
@@ -2320,8 +2317,10 @@ void ext4_truncate(struct inode *inode)
2320 return; 2317 return;
2321 } 2318 }
2322 2319
2323 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 2320 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2324 return ext4_ext_truncate(inode, page); 2321 ext4_ext_truncate(inode, page);
2322 return;
2323 }
2325 2324
2326 handle = start_transaction(inode); 2325 handle = start_transaction(inode);
2327 if (IS_ERR(handle)) { 2326 if (IS_ERR(handle)) {
@@ -2369,7 +2368,7 @@ void ext4_truncate(struct inode *inode)
2369 * From here we block out all ext4_get_block() callers who want to 2368 * From here we block out all ext4_get_block() callers who want to
2370 * modify the block allocation tree. 2369 * modify the block allocation tree.
2371 */ 2370 */
2372 mutex_lock(&ei->truncate_mutex); 2371 down_write(&ei->i_data_sem);
2373 2372
2374 if (n == 1) { /* direct blocks */ 2373 if (n == 1) { /* direct blocks */
2375 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 2374 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2433,7 +2432,7 @@ do_indirects:
2433 2432
2434 ext4_discard_reservation(inode); 2433 ext4_discard_reservation(inode);
2435 2434
2436 mutex_unlock(&ei->truncate_mutex); 2435 up_write(&ei->i_data_sem);
2437 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 2436 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2438 ext4_mark_inode_dirty(handle, inode); 2437 ext4_mark_inode_dirty(handle, inode);
2439 2438
@@ -2460,7 +2459,8 @@ out_stop:
2460static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, 2459static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
2461 unsigned long ino, struct ext4_iloc *iloc) 2460 unsigned long ino, struct ext4_iloc *iloc)
2462{ 2461{
2463 unsigned long desc, group_desc, block_group; 2462 unsigned long desc, group_desc;
2463 ext4_group_t block_group;
2464 unsigned long offset; 2464 unsigned long offset;
2465 ext4_fsblk_t block; 2465 ext4_fsblk_t block;
2466 struct buffer_head *bh; 2466 struct buffer_head *bh;
@@ -2547,7 +2547,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
2547 struct ext4_group_desc *desc; 2547 struct ext4_group_desc *desc;
2548 int inodes_per_buffer; 2548 int inodes_per_buffer;
2549 int inode_offset, i; 2549 int inode_offset, i;
2550 int block_group; 2550 ext4_group_t block_group;
2551 int start; 2551 int start;
2552 2552
2553 block_group = (inode->i_ino - 1) / 2553 block_group = (inode->i_ino - 1) /
@@ -2660,6 +2660,28 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
2660 if (flags & S_DIRSYNC) 2660 if (flags & S_DIRSYNC)
2661 ei->i_flags |= EXT4_DIRSYNC_FL; 2661 ei->i_flags |= EXT4_DIRSYNC_FL;
2662} 2662}
2663static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
2664 struct ext4_inode_info *ei)
2665{
2666 blkcnt_t i_blocks ;
2667 struct inode *inode = &(ei->vfs_inode);
2668 struct super_block *sb = inode->i_sb;
2669
2670 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
2671 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2672 /* we are using combined 48 bit field */
2673 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
2674 le32_to_cpu(raw_inode->i_blocks_lo);
2675 if (ei->i_flags & EXT4_HUGE_FILE_FL) {
2676 /* i_blocks represent file system block size */
2677 return i_blocks << (inode->i_blkbits - 9);
2678 } else {
2679 return i_blocks;
2680 }
2681 } else {
2682 return le32_to_cpu(raw_inode->i_blocks_lo);
2683 }
2684}
2663 2685
2664void ext4_read_inode(struct inode * inode) 2686void ext4_read_inode(struct inode * inode)
2665{ 2687{
@@ -2687,7 +2709,6 @@ void ext4_read_inode(struct inode * inode)
2687 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2709 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2688 } 2710 }
2689 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2711 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2690 inode->i_size = le32_to_cpu(raw_inode->i_size);
2691 2712
2692 ei->i_state = 0; 2713 ei->i_state = 0;
2693 ei->i_dir_start_lookup = 0; 2714 ei->i_dir_start_lookup = 0;
@@ -2709,19 +2730,15 @@ void ext4_read_inode(struct inode * inode)
2709 * recovery code: that's fine, we're about to complete 2730 * recovery code: that's fine, we're about to complete
2710 * the process of deleting those. */ 2731 * the process of deleting those. */
2711 } 2732 }
2712 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2713 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2733 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2714 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 2734 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
2735 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
2715 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 2736 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2716 cpu_to_le32(EXT4_OS_HURD)) 2737 cpu_to_le32(EXT4_OS_HURD)) {
2717 ei->i_file_acl |= 2738 ei->i_file_acl |=
2718 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 2739 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
2719 if (!S_ISREG(inode->i_mode)) {
2720 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2721 } else {
2722 inode->i_size |=
2723 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2724 } 2740 }
2741 inode->i_size = ext4_isize(raw_inode);
2725 ei->i_disksize = inode->i_size; 2742 ei->i_disksize = inode->i_size;
2726 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2743 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2727 ei->i_block_group = iloc.block_group; 2744 ei->i_block_group = iloc.block_group;
@@ -2765,6 +2782,13 @@ void ext4_read_inode(struct inode * inode)
2765 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 2782 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
2766 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 2783 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
2767 2784
2785 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
2786 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2787 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
2788 inode->i_version |=
2789 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
2790 }
2791
2768 if (S_ISREG(inode->i_mode)) { 2792 if (S_ISREG(inode->i_mode)) {
2769 inode->i_op = &ext4_file_inode_operations; 2793 inode->i_op = &ext4_file_inode_operations;
2770 inode->i_fop = &ext4_file_operations; 2794 inode->i_fop = &ext4_file_operations;
@@ -2797,6 +2821,55 @@ bad_inode:
2797 return; 2821 return;
2798} 2822}
2799 2823
2824static int ext4_inode_blocks_set(handle_t *handle,
2825 struct ext4_inode *raw_inode,
2826 struct ext4_inode_info *ei)
2827{
2828 struct inode *inode = &(ei->vfs_inode);
2829 u64 i_blocks = inode->i_blocks;
2830 struct super_block *sb = inode->i_sb;
2831 int err = 0;
2832
2833 if (i_blocks <= ~0U) {
2834 /*
2835 * i_blocks can be represnted in a 32 bit variable
2836 * as multiple of 512 bytes
2837 */
2838 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
2839 raw_inode->i_blocks_high = 0;
2840 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
2841 } else if (i_blocks <= 0xffffffffffffULL) {
2842 /*
2843 * i_blocks can be represented in a 48 bit variable
2844 * as multiple of 512 bytes
2845 */
2846 err = ext4_update_rocompat_feature(handle, sb,
2847 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2848 if (err)
2849 goto err_out;
2850 /* i_block is stored in the split 48 bit fields */
2851 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
2852 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
2853 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
2854 } else {
2855 /*
2856 * i_blocks should be represented in a 48 bit variable
2857 * as multiple of file system block size
2858 */
2859 err = ext4_update_rocompat_feature(handle, sb,
2860 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2861 if (err)
2862 goto err_out;
2863 ei->i_flags |= EXT4_HUGE_FILE_FL;
2864 /* i_block is stored in file system block size */
2865 i_blocks = i_blocks >> (inode->i_blkbits - 9);
2866 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
2867 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
2868 }
2869err_out:
2870 return err;
2871}
2872
2800/* 2873/*
2801 * Post the struct inode info into an on-disk inode location in the 2874 * Post the struct inode info into an on-disk inode location in the
2802 * buffer-cache. This gobbles the caller's reference to the 2875 * buffer-cache. This gobbles the caller's reference to the
@@ -2845,47 +2918,42 @@ static int ext4_do_update_inode(handle_t *handle,
2845 raw_inode->i_gid_high = 0; 2918 raw_inode->i_gid_high = 0;
2846 } 2919 }
2847 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 2920 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2848 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2849 2921
2850 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 2922 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
2851 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 2923 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
2852 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 2924 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
2853 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 2925 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
2854 2926
2855 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 2927 if (ext4_inode_blocks_set(handle, raw_inode, ei))
2928 goto out_brelse;
2856 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 2929 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2857 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 2930 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2858 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 2931 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2859 cpu_to_le32(EXT4_OS_HURD)) 2932 cpu_to_le32(EXT4_OS_HURD))
2860 raw_inode->i_file_acl_high = 2933 raw_inode->i_file_acl_high =
2861 cpu_to_le16(ei->i_file_acl >> 32); 2934 cpu_to_le16(ei->i_file_acl >> 32);
2862 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 2935 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
2863 if (!S_ISREG(inode->i_mode)) { 2936 ext4_isize_set(raw_inode, ei->i_disksize);
2864 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 2937 if (ei->i_disksize > 0x7fffffffULL) {
2865 } else { 2938 struct super_block *sb = inode->i_sb;
2866 raw_inode->i_size_high = 2939 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
2867 cpu_to_le32(ei->i_disksize >> 32); 2940 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
2868 if (ei->i_disksize > 0x7fffffffULL) { 2941 EXT4_SB(sb)->s_es->s_rev_level ==
2869 struct super_block *sb = inode->i_sb; 2942 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
2870 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 2943 /* If this is the first large file
2871 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 2944 * created, add a flag to the superblock.
2872 EXT4_SB(sb)->s_es->s_rev_level == 2945 */
2873 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 2946 err = ext4_journal_get_write_access(handle,
2874 /* If this is the first large file 2947 EXT4_SB(sb)->s_sbh);
2875 * created, add a flag to the superblock. 2948 if (err)
2876 */ 2949 goto out_brelse;
2877 err = ext4_journal_get_write_access(handle, 2950 ext4_update_dynamic_rev(sb);
2878 EXT4_SB(sb)->s_sbh); 2951 EXT4_SET_RO_COMPAT_FEATURE(sb,
2879 if (err)
2880 goto out_brelse;
2881 ext4_update_dynamic_rev(sb);
2882 EXT4_SET_RO_COMPAT_FEATURE(sb,
2883 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 2952 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
2884 sb->s_dirt = 1; 2953 sb->s_dirt = 1;
2885 handle->h_sync = 1; 2954 handle->h_sync = 1;
2886 err = ext4_journal_dirty_metadata(handle, 2955 err = ext4_journal_dirty_metadata(handle,
2887 EXT4_SB(sb)->s_sbh); 2956 EXT4_SB(sb)->s_sbh);
2888 }
2889 } 2957 }
2890 } 2958 }
2891 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 2959 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -2903,8 +2971,14 @@ static int ext4_do_update_inode(handle_t *handle,
2903 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 2971 } else for (block = 0; block < EXT4_N_BLOCKS; block++)
2904 raw_inode->i_block[block] = ei->i_data[block]; 2972 raw_inode->i_block[block] = ei->i_data[block];
2905 2973
2906 if (ei->i_extra_isize) 2974 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
2975 if (ei->i_extra_isize) {
2976 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
2977 raw_inode->i_version_hi =
2978 cpu_to_le32(inode->i_version >> 32);
2907 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 2979 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2980 }
2981
2908 2982
2909 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 2983 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2910 rc = ext4_journal_dirty_metadata(handle, bh); 2984 rc = ext4_journal_dirty_metadata(handle, bh);
@@ -3024,6 +3098,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3024 ext4_journal_stop(handle); 3098 ext4_journal_stop(handle);
3025 } 3099 }
3026 3100
3101 if (attr->ia_valid & ATTR_SIZE) {
3102 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
3103 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3104
3105 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
3106 error = -EFBIG;
3107 goto err_out;
3108 }
3109 }
3110 }
3111
3027 if (S_ISREG(inode->i_mode) && 3112 if (S_ISREG(inode->i_mode) &&
3028 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 3113 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3029 handle_t *handle; 3114 handle_t *handle;
@@ -3120,6 +3205,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
3120{ 3205{
3121 int err = 0; 3206 int err = 0;
3122 3207
3208 if (test_opt(inode->i_sb, I_VERSION))
3209 inode_inc_iversion(inode);
3210
3123 /* the do_update_inode consumes one bh->b_count */ 3211 /* the do_update_inode consumes one bh->b_count */
3124 get_bh(iloc->bh); 3212 get_bh(iloc->bh);
3125 3213
@@ -3158,8 +3246,10 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3158 * Expand an inode by new_extra_isize bytes. 3246 * Expand an inode by new_extra_isize bytes.
3159 * Returns 0 on success or negative error number on failure. 3247 * Returns 0 on success or negative error number on failure.
3160 */ 3248 */
3161int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, 3249static int ext4_expand_extra_isize(struct inode *inode,
3162 struct ext4_iloc iloc, handle_t *handle) 3250 unsigned int new_extra_isize,
3251 struct ext4_iloc iloc,
3252 handle_t *handle)
3163{ 3253{
3164 struct ext4_inode *raw_inode; 3254 struct ext4_inode *raw_inode;
3165 struct ext4_xattr_ibody_header *header; 3255 struct ext4_xattr_ibody_header *header;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e7f894bdb420..2ed7c37f897e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -199,7 +199,7 @@ flags_err:
199 * need to allocate reservation structure for this inode 199 * need to allocate reservation structure for this inode
200 * before set the window size 200 * before set the window size
201 */ 201 */
202 mutex_lock(&ei->truncate_mutex); 202 down_write(&ei->i_data_sem);
203 if (!ei->i_block_alloc_info) 203 if (!ei->i_block_alloc_info)
204 ext4_init_block_alloc_info(inode); 204 ext4_init_block_alloc_info(inode);
205 205
@@ -207,7 +207,7 @@ flags_err:
207 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; 207 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
208 rsv->rsv_goal_size = rsv_window_size; 208 rsv->rsv_goal_size = rsv_window_size;
209 } 209 }
210 mutex_unlock(&ei->truncate_mutex); 210 up_write(&ei->i_data_sem);
211 return 0; 211 return 0;
212 } 212 }
213 case EXT4_IOC_GROUP_EXTEND: { 213 case EXT4_IOC_GROUP_EXTEND: {
@@ -254,6 +254,9 @@ flags_err:
254 return err; 254 return err;
255 } 255 }
256 256
257 case EXT4_IOC_MIGRATE:
258 return ext4_ext_migrate(inode, filp, cmd, arg);
259
257 default: 260 default:
258 return -ENOTTY; 261 return -ENOTTY;
259 } 262 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 000000000000..76e5fedc0a0b
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,4552 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public Licens
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */
18
19
20/*
21 * mballoc.c contains the multiblocks allocation routines
22 */
23
24#include <linux/time.h>
25#include <linux/fs.h>
26#include <linux/namei.h>
27#include <linux/ext4_jbd2.h>
28#include <linux/ext4_fs.h>
29#include <linux/quotaops.h>
30#include <linux/buffer_head.h>
31#include <linux/module.h>
32#include <linux/swap.h>
33#include <linux/proc_fs.h>
34#include <linux/pagemap.h>
35#include <linux/seq_file.h>
36#include <linux/version.h>
37#include "group.h"
38
39/*
40 * MUSTDO:
41 * - test ext4_ext_search_left() and ext4_ext_search_right()
42 * - search for metadata in few groups
43 *
44 * TODO v4:
45 * - normalization should take into account whether file is still open
46 * - discard preallocations if no free space left (policy?)
47 * - don't normalize tails
48 * - quota
49 * - reservation for superuser
50 *
51 * TODO v3:
52 * - bitmap read-ahead (proposed by Oleg Drokin aka green)
53 * - track min/max extents in each group for better group selection
54 * - mb_mark_used() may allocate chunk right after splitting buddy
55 * - tree of groups sorted by number of free blocks
56 * - error handling
57 */
58
59/*
60 * The allocation request involve request for multiple number of blocks
61 * near to the goal(block) value specified.
62 *
63 * During initialization phase of the allocator we decide to use the group
64 * preallocation or inode preallocation depending on the size file. The
65 * size of the file could be the resulting file size we would have after
66 * allocation or the current file size which ever is larger. If the size is
67 * less that sbi->s_mb_stream_request we select the group
68 * preallocation. The default value of s_mb_stream_request is 16
69 * blocks. This can also be tuned via
70 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
71 * of number of blocks.
72 *
73 * The main motivation for having small file use group preallocation is to
74 * ensure that we have small file closer in the disk.
75 *
76 * First stage the allocator looks at the inode prealloc list
77 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
78 * this particular inode. The inode prealloc space is represented as:
79 *
80 * pa_lstart -> the logical start block for this prealloc space
81 * pa_pstart -> the physical start block for this prealloc space
82 * pa_len -> lenght for this prealloc space
83 * pa_free -> free space available in this prealloc space
84 *
85 * The inode preallocation space is used looking at the _logical_ start
86 * block. If only the logical file block falls within the range of prealloc
87 * space we will consume the particular prealloc space. This make sure that
88 * that the we have contiguous physical blocks representing the file blocks
89 *
90 * The important thing to be noted in case of inode prealloc space is that
91 * we don't modify the values associated to inode prealloc space except
92 * pa_free.
93 *
94 * If we are not able to find blocks in the inode prealloc space and if we
95 * have the group allocation flag set then we look at the locality group
96 * prealloc space. These are per CPU prealloc list repreasented as
97 *
98 * ext4_sb_info.s_locality_groups[smp_processor_id()]
99 *
100 * The reason for having a per cpu locality group is to reduce the contention
101 * between CPUs. It is possible to get scheduled at this point.
102 *
103 * The locality group prealloc space is used looking at whether we have
104 * enough free space (pa_free) withing the prealloc space.
105 *
106 * If we can't allocate blocks via inode prealloc or/and locality group
107 * prealloc then we look at the buddy cache. The buddy cache is represented
108 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
109 * mapped to the buddy and bitmap information regarding different
110 * groups. The buddy information is attached to buddy cache inode so that
111 * we can access them through the page cache. The information regarding
112 * each group is loaded via ext4_mb_load_buddy. The information involve
113 * block bitmap and buddy information. The information are stored in the
114 * inode as:
115 *
116 * { page }
117 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
118 *
119 *
120 * one block each for bitmap and buddy information. So for each group we
121 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
122 * blocksize) blocks. So it can have information regarding groups_per_page
123 * which is blocks_per_page/2
124 *
125 * The buddy cache inode is not stored on disk. The inode is thrown
126 * away when the filesystem is unmounted.
127 *
128 * We look for count number of blocks in the buddy cache. If we were able
129 * to locate that many free blocks we return with additional information
130 * regarding rest of the contiguous physical block available
131 *
132 * Before allocating blocks via buddy cache we normalize the request
133 * blocks. This ensure we ask for more blocks that we needed. The extra
134 * blocks that we get after allocation is added to the respective prealloc
135 * list. In case of inode preallocation we follow a list of heuristics
136 * based on file size. This can be found in ext4_mb_normalize_request. If
137 * we are doing a group prealloc we try to normalize the request to
138 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
139 * 512 blocks. This can be tuned via
140 * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
141 * terms of number of blocks. If we have mounted the file system with -O
142 * stripe=<value> option the group prealloc request is normalized to the
143 * stripe value (sbi->s_stripe)
144 *
145 * The regular allocator(using the buddy cache) support few tunables.
146 *
147 * /proc/fs/ext4/<partition>/min_to_scan
148 * /proc/fs/ext4/<partition>/max_to_scan
149 * /proc/fs/ext4/<partition>/order2_req
150 *
151 * The regular allocator use buddy scan only if the request len is power of
152 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
153 * value of s_mb_order2_reqs can be tuned via
154 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to
155 * stripe size (sbi->s_stripe), we try to search for contigous block in
156 * stripe size. This should result in better allocation on RAID setup. If
157 * not we search in the specific group using bitmap for best extents. The
158 * tunable min_to_scan and max_to_scan controll the behaviour here.
159 * min_to_scan indicate how long the mballoc __must__ look for a best
160 * extent and max_to_scanindicate how long the mballoc __can__ look for a
161 * best extent in the found extents. Searching for the blocks starts with
162 * the group specified as the goal value in allocation context via
163 * ac_g_ex. Each group is first checked based on the criteria whether it
164 * can used for allocation. ext4_mb_good_group explains how the groups are
165 * checked.
166 *
167 * Both the prealloc space are getting populated as above. So for the first
168 * request we will hit the buddy cache which will result in this prealloc
169 * space getting filled. The prealloc space is then later used for the
170 * subsequent request.
171 */
172
173/*
174 * mballoc operates on the following data:
175 * - on-disk bitmap
176 * - in-core buddy (actually includes buddy and bitmap)
177 * - preallocation descriptors (PAs)
178 *
179 * there are two types of preallocations:
180 * - inode
181 * assiged to specific inode and can be used for this inode only.
182 * it describes part of inode's space preallocated to specific
183 * physical blocks. any block from that preallocated can be used
184 * independent. the descriptor just tracks number of blocks left
185 * unused. so, before taking some block from descriptor, one must
186 * make sure corresponded logical block isn't allocated yet. this
187 * also means that freeing any block within descriptor's range
188 * must discard all preallocated blocks.
189 * - locality group
190 * assigned to specific locality group which does not translate to
191 * permanent set of inodes: inode can join and leave group. space
192 * from this type of preallocation can be used for any inode. thus
193 * it's consumed from the beginning to the end.
194 *
195 * relation between them can be expressed as:
196 * in-core buddy = on-disk bitmap + preallocation descriptors
197 *
198 * this mean blocks mballoc considers used are:
199 * - allocated blocks (persistent)
200 * - preallocated blocks (non-persistent)
201 *
202 * consistency in mballoc world means that at any time a block is either
203 * free or used in ALL structures. notice: "any time" should not be read
204 * literally -- time is discrete and delimited by locks.
205 *
206 * to keep it simple, we don't use block numbers, instead we count number of
207 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
208 *
209 * all operations can be expressed as:
210 * - init buddy: buddy = on-disk + PAs
211 * - new PA: buddy += N; PA = N
212 * - use inode PA: on-disk += N; PA -= N
213 * - discard inode PA buddy -= on-disk - PA; PA = 0
214 * - use locality group PA on-disk += N; PA -= N
215 * - discard locality group PA buddy -= PA; PA = 0
216 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
217 * is used in real operation because we can't know actual used
218 * bits from PA, only from on-disk bitmap
219 *
220 * if we follow this strict logic, then all operations above should be atomic.
221 * given some of them can block, we'd have to use something like semaphores
222 * killing performance on high-end SMP hardware. let's try to relax it using
223 * the following knowledge:
224 * 1) if buddy is referenced, it's already initialized
225 * 2) while block is used in buddy and the buddy is referenced,
226 * nobody can re-allocate that block
227 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
228 * bit set and PA claims same block, it's OK. IOW, one can set bit in
229 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded
230 * block
231 *
232 * so, now we're building a concurrency table:
233 * - init buddy vs.
234 * - new PA
235 * blocks for PA are allocated in the buddy, buddy must be referenced
236 * until PA is linked to allocation group to avoid concurrent buddy init
237 * - use inode PA
238 * we need to make sure that either on-disk bitmap or PA has uptodate data
239 * given (3) we care that PA-=N operation doesn't interfere with init
240 * - discard inode PA
241 * the simplest way would be to have buddy initialized by the discard
242 * - use locality group PA
243 * again PA-=N must be serialized with init
244 * - discard locality group PA
245 * the simplest way would be to have buddy initialized by the discard
246 * - new PA vs.
247 * - use inode PA
248 * i_data_sem serializes them
249 * - discard inode PA
250 * discard process must wait until PA isn't used by another process
251 * - use locality group PA
252 * some mutex should serialize them
253 * - discard locality group PA
254 * discard process must wait until PA isn't used by another process
255 * - use inode PA
256 * - use inode PA
257 * i_data_sem or another mutex should serializes them
258 * - discard inode PA
259 * discard process must wait until PA isn't used by another process
260 * - use locality group PA
261 * nothing wrong here -- they're different PAs covering different blocks
262 * - discard locality group PA
263 * discard process must wait until PA isn't used by another process
264 *
265 * now we're ready to make few consequences:
266 * - PA is referenced and while it is no discard is possible
267 * - PA is referenced until block isn't marked in on-disk bitmap
268 * - PA changes only after on-disk bitmap
269 * - discard must not compete with init. either init is done before
270 * any discard or they're serialized somehow
271 * - buddy init as sum of on-disk bitmap and PAs is done atomically
272 *
273 * a special case when we've used PA to emptiness. no need to modify buddy
274 * in this case, but we should care about concurrent init
275 *
276 */
277
278 /*
279 * Logic in few words:
280 *
281 * - allocation:
282 * load group
283 * find blocks
284 * mark bits in on-disk bitmap
285 * release group
286 *
287 * - use preallocation:
288 * find proper PA (per-inode or group)
289 * load group
290 * mark bits in on-disk bitmap
291 * release group
292 * release PA
293 *
294 * - free:
295 * load group
296 * mark bits in on-disk bitmap
297 * release group
298 *
299 * - discard preallocations in group:
300 * mark PAs deleted
301 * move them onto local list
302 * load on-disk bitmap
303 * load group
304 * remove PA from object (inode or locality group)
305 * mark free blocks in-core
306 *
307 * - discard inode's preallocations:
308 */
309
310/*
311 * Locking rules
312 *
313 * Locks:
314 * - bitlock on a group (group)
315 * - object (inode/locality) (object)
316 * - per-pa lock (pa)
317 *
318 * Paths:
319 * - new pa
320 * object
321 * group
322 *
323 * - find and use pa:
324 * pa
325 *
326 * - release consumed pa:
327 * pa
328 * group
329 * object
330 *
331 * - generate in-core bitmap:
332 * group
333 * pa
334 *
335 * - discard all for given object (inode, locality group):
336 * object
337 * pa
338 * group
339 *
340 * - discard all for given group:
341 * group
342 * pa
343 * group
344 * object
345 *
346 */
347
348/*
349 * with AGGRESSIVE_CHECK allocator runs consistency checks over
350 * structures. these checks slow things down a lot
351 */
352#define AGGRESSIVE_CHECK__
353
354/*
355 * with DOUBLE_CHECK defined mballoc creates persistent in-core
356 * bitmaps, maintains and uses them to check for double allocations
357 */
358#define DOUBLE_CHECK__
359
360/*
361 */
362#define MB_DEBUG__
363#ifdef MB_DEBUG
364#define mb_debug(fmt, a...) printk(fmt, ##a)
365#else
366#define mb_debug(fmt, a...)
367#endif
368
369/*
370 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
371 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
372 */
373#define EXT4_MB_HISTORY
374#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
375#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
376#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
377#define EXT4_MB_HISTORY_FREE 8 /* free */
378
379#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
380 EXT4_MB_HISTORY_PREALLOC)
381
382/*
383 * How long mballoc can look for a best extent (in found extents)
384 */
385#define MB_DEFAULT_MAX_TO_SCAN 200
386
387/*
388 * How long mballoc must look for a best extent
389 */
390#define MB_DEFAULT_MIN_TO_SCAN 10
391
392/*
393 * How many groups mballoc will scan looking for the best chunk
394 */
395#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
396
397/*
398 * with 'ext4_mb_stats' allocator will collect stats that will be
399 * shown at umount. The collecting costs though!
400 */
401#define MB_DEFAULT_STATS 1
402
403/*
404 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
405 * by the stream allocator, which purpose is to pack requests
406 * as close each to other as possible to produce smooth I/O traffic
407 * We use locality group prealloc space for stream request.
408 * We can tune the same via /proc/fs/ext4/<parition>/stream_req
409 */
410#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
411
412/*
413 * for which requests use 2^N search using buddies
414 */
415#define MB_DEFAULT_ORDER2_REQS 2
416
417/*
418 * default group prealloc size 512 blocks
419 */
420#define MB_DEFAULT_GROUP_PREALLOC 512
421
422static struct kmem_cache *ext4_pspace_cachep;
423
424#ifdef EXT4_BB_MAX_BLOCKS
425#undef EXT4_BB_MAX_BLOCKS
426#endif
427#define EXT4_BB_MAX_BLOCKS 30
428
429struct ext4_free_metadata {
430 ext4_group_t group;
431 unsigned short num;
432 ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
433 struct list_head list;
434};
435
436struct ext4_group_info {
437 unsigned long bb_state;
438 unsigned long bb_tid;
439 struct ext4_free_metadata *bb_md_cur;
440 unsigned short bb_first_free;
441 unsigned short bb_free;
442 unsigned short bb_fragments;
443 struct list_head bb_prealloc_list;
444#ifdef DOUBLE_CHECK
445 void *bb_bitmap;
446#endif
447 unsigned short bb_counters[];
448};
449
450#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
451#define EXT4_GROUP_INFO_LOCKED_BIT 1
452
453#define EXT4_MB_GRP_NEED_INIT(grp) \
454 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
455
456
457struct ext4_prealloc_space {
458 struct list_head pa_inode_list;
459 struct list_head pa_group_list;
460 union {
461 struct list_head pa_tmp_list;
462 struct rcu_head pa_rcu;
463 } u;
464 spinlock_t pa_lock;
465 atomic_t pa_count;
466 unsigned pa_deleted;
467 ext4_fsblk_t pa_pstart; /* phys. block */
468 ext4_lblk_t pa_lstart; /* log. block */
469 unsigned short pa_len; /* len of preallocated chunk */
470 unsigned short pa_free; /* how many blocks are free */
471 unsigned short pa_linear; /* consumed in one direction
472 * strictly, for grp prealloc */
473 spinlock_t *pa_obj_lock;
474 struct inode *pa_inode; /* hack, for history only */
475};
476
477
478struct ext4_free_extent {
479 ext4_lblk_t fe_logical;
480 ext4_grpblk_t fe_start;
481 ext4_group_t fe_group;
482 int fe_len;
483};
484
485/*
486 * Locality group:
487 * we try to group all related changes together
488 * so that writeback can flush/allocate them together as well
489 */
490struct ext4_locality_group {
491 /* for allocator */
492 struct mutex lg_mutex; /* to serialize allocates */
493 struct list_head lg_prealloc_list;/* list of preallocations */
494 spinlock_t lg_prealloc_lock;
495};
496
497struct ext4_allocation_context {
498 struct inode *ac_inode;
499 struct super_block *ac_sb;
500
501 /* original request */
502 struct ext4_free_extent ac_o_ex;
503
504 /* goal request (after normalization) */
505 struct ext4_free_extent ac_g_ex;
506
507 /* the best found extent */
508 struct ext4_free_extent ac_b_ex;
509
510 /* copy of the bext found extent taken before preallocation efforts */
511 struct ext4_free_extent ac_f_ex;
512
513 /* number of iterations done. we have to track to limit searching */
514 unsigned long ac_ex_scanned;
515 __u16 ac_groups_scanned;
516 __u16 ac_found;
517 __u16 ac_tail;
518 __u16 ac_buddy;
519 __u16 ac_flags; /* allocation hints */
520 __u8 ac_status;
521 __u8 ac_criteria;
522 __u8 ac_repeats;
523 __u8 ac_2order; /* if request is to allocate 2^N blocks and
524 * N > 0, the field stores N, otherwise 0 */
525 __u8 ac_op; /* operation, for history only */
526 struct page *ac_bitmap_page;
527 struct page *ac_buddy_page;
528 struct ext4_prealloc_space *ac_pa;
529 struct ext4_locality_group *ac_lg;
530};
531
532#define AC_STATUS_CONTINUE 1
533#define AC_STATUS_FOUND 2
534#define AC_STATUS_BREAK 3
535
536struct ext4_mb_history {
537 struct ext4_free_extent orig; /* orig allocation */
538 struct ext4_free_extent goal; /* goal allocation */
539 struct ext4_free_extent result; /* result allocation */
540 unsigned pid;
541 unsigned ino;
542 __u16 found; /* how many extents have been found */
543 __u16 groups; /* how many groups have been scanned */
544 __u16 tail; /* what tail broke some buddy */
545 __u16 buddy; /* buddy the tail ^^^ broke */
546 __u16 flags;
547 __u8 cr:3; /* which phase the result extent was found at */
548 __u8 op:4;
549 __u8 merged:1;
550};
551
552struct ext4_buddy {
553 struct page *bd_buddy_page;
554 void *bd_buddy;
555 struct page *bd_bitmap_page;
556 void *bd_bitmap;
557 struct ext4_group_info *bd_info;
558 struct super_block *bd_sb;
559 __u16 bd_blkbits;
560 ext4_group_t bd_group;
561};
562#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
563#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
564
565#ifndef EXT4_MB_HISTORY
566static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
567{
568 return;
569}
570#else
571static void ext4_mb_store_history(struct ext4_allocation_context *ac);
572#endif
573
574#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
575
576static struct proc_dir_entry *proc_root_ext4;
577struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
578ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
579 ext4_fsblk_t goal, unsigned long *count, int *errp);
580
581static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
582 ext4_group_t group);
583static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
584static void ext4_mb_free_committed_blocks(struct super_block *);
585static void ext4_mb_return_to_preallocation(struct inode *inode,
586 struct ext4_buddy *e4b, sector_t block,
587 int count);
588static void ext4_mb_put_pa(struct ext4_allocation_context *,
589 struct super_block *, struct ext4_prealloc_space *pa);
590static int ext4_mb_init_per_dev_proc(struct super_block *sb);
591static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
592
593
594static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
595{
596 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
597
598 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
599}
600
601static inline void ext4_unlock_group(struct super_block *sb,
602 ext4_group_t group)
603{
604 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
605
606 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
607}
608
609static inline int ext4_is_group_locked(struct super_block *sb,
610 ext4_group_t group)
611{
612 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
613
614 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
615 &(grinfo->bb_state));
616}
617
618static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
619 struct ext4_free_extent *fex)
620{
621 ext4_fsblk_t block;
622
623 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
624 + fex->fe_start
625 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
626 return block;
627}
628
629#if BITS_PER_LONG == 64
630#define mb_correct_addr_and_bit(bit, addr) \
631{ \
632 bit += ((unsigned long) addr & 7UL) << 3; \
633 addr = (void *) ((unsigned long) addr & ~7UL); \
634}
635#elif BITS_PER_LONG == 32
636#define mb_correct_addr_and_bit(bit, addr) \
637{ \
638 bit += ((unsigned long) addr & 3UL) << 3; \
639 addr = (void *) ((unsigned long) addr & ~3UL); \
640}
641#else
642#error "how many bits you are?!"
643#endif
644
645static inline int mb_test_bit(int bit, void *addr)
646{
647 /*
648 * ext4_test_bit on architecture like powerpc
649 * needs unsigned long aligned address
650 */
651 mb_correct_addr_and_bit(bit, addr);
652 return ext4_test_bit(bit, addr);
653}
654
655static inline void mb_set_bit(int bit, void *addr)
656{
657 mb_correct_addr_and_bit(bit, addr);
658 ext4_set_bit(bit, addr);
659}
660
661static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
662{
663 mb_correct_addr_and_bit(bit, addr);
664 ext4_set_bit_atomic(lock, bit, addr);
665}
666
667static inline void mb_clear_bit(int bit, void *addr)
668{
669 mb_correct_addr_and_bit(bit, addr);
670 ext4_clear_bit(bit, addr);
671}
672
673static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
674{
675 mb_correct_addr_and_bit(bit, addr);
676 ext4_clear_bit_atomic(lock, bit, addr);
677}
678
679static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
680{
681 char *bb;
682
683 /* FIXME!! is this needed */
684 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
685 BUG_ON(max == NULL);
686
687 if (order > e4b->bd_blkbits + 1) {
688 *max = 0;
689 return NULL;
690 }
691
692 /* at order 0 we see each particular block */
693 *max = 1 << (e4b->bd_blkbits + 3);
694 if (order == 0)
695 return EXT4_MB_BITMAP(e4b);
696
697 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
698 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
699
700 return bb;
701}
702
703#ifdef DOUBLE_CHECK
704static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
705 int first, int count)
706{
707 int i;
708 struct super_block *sb = e4b->bd_sb;
709
710 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
711 return;
712 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
713 for (i = 0; i < count; i++) {
714 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
715 ext4_fsblk_t blocknr;
716 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
717 blocknr += first + i;
718 blocknr +=
719 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
720
721 ext4_error(sb, __FUNCTION__, "double-free of inode"
722 " %lu's block %llu(bit %u in group %lu)\n",
723 inode ? inode->i_ino : 0, blocknr,
724 first + i, e4b->bd_group);
725 }
726 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
727 }
728}
729
730static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
731{
732 int i;
733
734 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
735 return;
736 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
737 for (i = 0; i < count; i++) {
738 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
739 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
740 }
741}
742
743static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
744{
745 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
746 unsigned char *b1, *b2;
747 int i;
748 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
749 b2 = (unsigned char *) bitmap;
750 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
751 if (b1[i] != b2[i]) {
752 printk("corruption in group %lu at byte %u(%u):"
753 " %x in copy != %x on disk/prealloc\n",
754 e4b->bd_group, i, i * 8, b1[i], b2[i]);
755 BUG();
756 }
757 }
758 }
759}
760
761#else
762static inline void mb_free_blocks_double(struct inode *inode,
763 struct ext4_buddy *e4b, int first, int count)
764{
765 return;
766}
767static inline void mb_mark_used_double(struct ext4_buddy *e4b,
768 int first, int count)
769{
770 return;
771}
772static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
773{
774 return;
775}
776#endif
777
778#ifdef AGGRESSIVE_CHECK
779
780#define MB_CHECK_ASSERT(assert) \
781do { \
782 if (!(assert)) { \
783 printk(KERN_EMERG \
784 "Assertion failure in %s() at %s:%d: \"%s\"\n", \
785 function, file, line, # assert); \
786 BUG(); \
787 } \
788} while (0)
789
790static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
791 const char *function, int line)
792{
793 struct super_block *sb = e4b->bd_sb;
794 int order = e4b->bd_blkbits + 1;
795 int max;
796 int max2;
797 int i;
798 int j;
799 int k;
800 int count;
801 struct ext4_group_info *grp;
802 int fragments = 0;
803 int fstart;
804 struct list_head *cur;
805 void *buddy;
806 void *buddy2;
807
808 if (!test_opt(sb, MBALLOC))
809 return 0;
810
811 {
812 static int mb_check_counter;
813 if (mb_check_counter++ % 100 != 0)
814 return 0;
815 }
816
817 while (order > 1) {
818 buddy = mb_find_buddy(e4b, order, &max);
819 MB_CHECK_ASSERT(buddy);
820 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
821 MB_CHECK_ASSERT(buddy2);
822 MB_CHECK_ASSERT(buddy != buddy2);
823 MB_CHECK_ASSERT(max * 2 == max2);
824
825 count = 0;
826 for (i = 0; i < max; i++) {
827
828 if (mb_test_bit(i, buddy)) {
829 /* only single bit in buddy2 may be 1 */
830 if (!mb_test_bit(i << 1, buddy2)) {
831 MB_CHECK_ASSERT(
832 mb_test_bit((i<<1)+1, buddy2));
833 } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
834 MB_CHECK_ASSERT(
835 mb_test_bit(i << 1, buddy2));
836 }
837 continue;
838 }
839
840 /* both bits in buddy2 must be 0 */
841 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
842 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
843
844 for (j = 0; j < (1 << order); j++) {
845 k = (i * (1 << order)) + j;
846 MB_CHECK_ASSERT(
847 !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
848 }
849 count++;
850 }
851 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
852 order--;
853 }
854
855 fstart = -1;
856 buddy = mb_find_buddy(e4b, 0, &max);
857 for (i = 0; i < max; i++) {
858 if (!mb_test_bit(i, buddy)) {
859 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
860 if (fstart == -1) {
861 fragments++;
862 fstart = i;
863 }
864 continue;
865 }
866 fstart = -1;
867 /* check used bits only */
868 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
869 buddy2 = mb_find_buddy(e4b, j, &max2);
870 k = i >> j;
871 MB_CHECK_ASSERT(k < max2);
872 MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
873 }
874 }
875 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
876 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
877
878 grp = ext4_get_group_info(sb, e4b->bd_group);
879 buddy = mb_find_buddy(e4b, 0, &max);
880 list_for_each(cur, &grp->bb_prealloc_list) {
881 ext4_group_t groupnr;
882 struct ext4_prealloc_space *pa;
883 pa = list_entry(cur, struct ext4_prealloc_space, group_list);
884 ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k);
885 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
886 for (i = 0; i < pa->len; i++)
887 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
888 }
889 return 0;
890}
891#undef MB_CHECK_ASSERT
892#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
893 __FILE__, __FUNCTION__, __LINE__)
894#else
895#define mb_check_buddy(e4b)
896#endif
897
898/* FIXME!! need more doc */
899static void ext4_mb_mark_free_simple(struct super_block *sb,
900 void *buddy, unsigned first, int len,
901 struct ext4_group_info *grp)
902{
903 struct ext4_sb_info *sbi = EXT4_SB(sb);
904 unsigned short min;
905 unsigned short max;
906 unsigned short chunk;
907 unsigned short border;
908
909 BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
910
911 border = 2 << sb->s_blocksize_bits;
912
913 while (len > 0) {
914 /* find how many blocks can be covered since this position */
915 max = ffs(first | border) - 1;
916
917 /* find how many blocks of power 2 we need to mark */
918 min = fls(len) - 1;
919
920 if (max < min)
921 min = max;
922 chunk = 1 << min;
923
924 /* mark multiblock chunks only */
925 grp->bb_counters[min]++;
926 if (min > 0)
927 mb_clear_bit(first >> min,
928 buddy + sbi->s_mb_offsets[min]);
929
930 len -= chunk;
931 first += chunk;
932 }
933}
934
935static void ext4_mb_generate_buddy(struct super_block *sb,
936 void *buddy, void *bitmap, ext4_group_t group)
937{
938 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
939 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
940 unsigned short i = 0;
941 unsigned short first;
942 unsigned short len;
943 unsigned free = 0;
944 unsigned fragments = 0;
945 unsigned long long period = get_cycles();
946
947 /* initialize buddy from bitmap which is aggregation
948 * of on-disk bitmap and preallocations */
949 i = ext4_find_next_zero_bit(bitmap, max, 0);
950 grp->bb_first_free = i;
951 while (i < max) {
952 fragments++;
953 first = i;
954 i = ext4_find_next_bit(bitmap, max, i);
955 len = i - first;
956 free += len;
957 if (len > 1)
958 ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
959 else
960 grp->bb_counters[0]++;
961 if (i < max)
962 i = ext4_find_next_zero_bit(bitmap, max, i);
963 }
964 grp->bb_fragments = fragments;
965
966 if (free != grp->bb_free) {
967 printk(KERN_DEBUG
968 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
969 group, free, grp->bb_free);
970 grp->bb_free = free;
971 }
972
973 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
974
975 period = get_cycles() - period;
976 spin_lock(&EXT4_SB(sb)->s_bal_lock);
977 EXT4_SB(sb)->s_mb_buddies_generated++;
978 EXT4_SB(sb)->s_mb_generation_time += period;
979 spin_unlock(&EXT4_SB(sb)->s_bal_lock);
980}
981
982/* The buddy information is attached the buddy cache inode
983 * for convenience. The information regarding each group
984 * is loaded via ext4_mb_load_buddy. The information involve
985 * block bitmap and buddy information. The information are
986 * stored in the inode as
987 *
988 * { page }
989 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
990 *
991 *
992 * one block each for bitmap and buddy information.
993 * So for each group we take up 2 blocks. A page can
994 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
995 * So it can have information regarding groups_per_page which
996 * is blocks_per_page/2
997 */
998
999static int ext4_mb_init_cache(struct page *page, char *incore)
1000{
1001 int blocksize;
1002 int blocks_per_page;
1003 int groups_per_page;
1004 int err = 0;
1005 int i;
1006 ext4_group_t first_group;
1007 int first_block;
1008 struct super_block *sb;
1009 struct buffer_head *bhs;
1010 struct buffer_head **bh;
1011 struct inode *inode;
1012 char *data;
1013 char *bitmap;
1014
1015 mb_debug("init page %lu\n", page->index);
1016
1017 inode = page->mapping->host;
1018 sb = inode->i_sb;
1019 blocksize = 1 << inode->i_blkbits;
1020 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
1021
1022 groups_per_page = blocks_per_page >> 1;
1023 if (groups_per_page == 0)
1024 groups_per_page = 1;
1025
1026 /* allocate buffer_heads to read bitmaps */
1027 if (groups_per_page > 1) {
1028 err = -ENOMEM;
1029 i = sizeof(struct buffer_head *) * groups_per_page;
1030 bh = kzalloc(i, GFP_NOFS);
1031 if (bh == NULL)
1032 goto out;
1033 } else
1034 bh = &bhs;
1035
1036 first_group = page->index * blocks_per_page / 2;
1037
1038 /* read all groups the page covers into the cache */
1039 for (i = 0; i < groups_per_page; i++) {
1040 struct ext4_group_desc *desc;
1041
1042 if (first_group + i >= EXT4_SB(sb)->s_groups_count)
1043 break;
1044
1045 err = -EIO;
1046 desc = ext4_get_group_desc(sb, first_group + i, NULL);
1047 if (desc == NULL)
1048 goto out;
1049
1050 err = -ENOMEM;
1051 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
1052 if (bh[i] == NULL)
1053 goto out;
1054
1055 if (bh_uptodate_or_lock(bh[i]))
1056 continue;
1057
1058 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1059 ext4_init_block_bitmap(sb, bh[i],
1060 first_group + i, desc);
1061 set_buffer_uptodate(bh[i]);
1062 unlock_buffer(bh[i]);
1063 continue;
1064 }
1065 get_bh(bh[i]);
1066 bh[i]->b_end_io = end_buffer_read_sync;
1067 submit_bh(READ, bh[i]);
1068 mb_debug("read bitmap for group %lu\n", first_group + i);
1069 }
1070
1071 /* wait for I/O completion */
1072 for (i = 0; i < groups_per_page && bh[i]; i++)
1073 wait_on_buffer(bh[i]);
1074
1075 err = -EIO;
1076 for (i = 0; i < groups_per_page && bh[i]; i++)
1077 if (!buffer_uptodate(bh[i]))
1078 goto out;
1079
1080 first_block = page->index * blocks_per_page;
1081 for (i = 0; i < blocks_per_page; i++) {
1082 int group;
1083 struct ext4_group_info *grinfo;
1084
1085 group = (first_block + i) >> 1;
1086 if (group >= EXT4_SB(sb)->s_groups_count)
1087 break;
1088
1089 /*
1090 * data carry information regarding this
1091 * particular group in the format specified
1092 * above
1093 *
1094 */
1095 data = page_address(page) + (i * blocksize);
1096 bitmap = bh[group - first_group]->b_data;
1097
1098 /*
1099 * We place the buddy block and bitmap block
1100 * close together
1101 */
1102 if ((first_block + i) & 1) {
1103 /* this is block of buddy */
1104 BUG_ON(incore == NULL);
1105 mb_debug("put buddy for group %u in page %lu/%x\n",
1106 group, page->index, i * blocksize);
1107 memset(data, 0xff, blocksize);
1108 grinfo = ext4_get_group_info(sb, group);
1109 grinfo->bb_fragments = 0;
1110 memset(grinfo->bb_counters, 0,
1111 sizeof(unsigned short)*(sb->s_blocksize_bits+2));
1112 /*
1113 * incore got set to the group block bitmap below
1114 */
1115 ext4_mb_generate_buddy(sb, data, incore, group);
1116 incore = NULL;
1117 } else {
1118 /* this is block of bitmap */
1119 BUG_ON(incore != NULL);
1120 mb_debug("put bitmap for group %u in page %lu/%x\n",
1121 group, page->index, i * blocksize);
1122
1123 /* see comments in ext4_mb_put_pa() */
1124 ext4_lock_group(sb, group);
1125 memcpy(data, bitmap, blocksize);
1126
1127 /* mark all preallocated blks used in in-core bitmap */
1128 ext4_mb_generate_from_pa(sb, data, group);
1129 ext4_unlock_group(sb, group);
1130
1131 /* set incore so that the buddy information can be
1132 * generated using this
1133 */
1134 incore = data;
1135 }
1136 }
1137 SetPageUptodate(page);
1138
1139out:
1140 if (bh) {
1141 for (i = 0; i < groups_per_page && bh[i]; i++)
1142 brelse(bh[i]);
1143 if (bh != &bhs)
1144 kfree(bh);
1145 }
1146 return err;
1147}
1148
1149static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1150 struct ext4_buddy *e4b)
1151{
1152 struct ext4_sb_info *sbi = EXT4_SB(sb);
1153 struct inode *inode = sbi->s_buddy_cache;
1154 int blocks_per_page;
1155 int block;
1156 int pnum;
1157 int poff;
1158 struct page *page;
1159
1160 mb_debug("load group %lu\n", group);
1161
1162 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1163
1164 e4b->bd_blkbits = sb->s_blocksize_bits;
1165 e4b->bd_info = ext4_get_group_info(sb, group);
1166 e4b->bd_sb = sb;
1167 e4b->bd_group = group;
1168 e4b->bd_buddy_page = NULL;
1169 e4b->bd_bitmap_page = NULL;
1170
1171 /*
1172 * the buddy cache inode stores the block bitmap
1173 * and buddy information in consecutive blocks.
1174 * So for each group we need two blocks.
1175 */
1176 block = group * 2;
1177 pnum = block / blocks_per_page;
1178 poff = block % blocks_per_page;
1179
1180 /* we could use find_or_create_page(), but it locks page
1181 * what we'd like to avoid in fast path ... */
1182 page = find_get_page(inode->i_mapping, pnum);
1183 if (page == NULL || !PageUptodate(page)) {
1184 if (page)
1185 page_cache_release(page);
1186 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1187 if (page) {
1188 BUG_ON(page->mapping != inode->i_mapping);
1189 if (!PageUptodate(page)) {
1190 ext4_mb_init_cache(page, NULL);
1191 mb_cmp_bitmaps(e4b, page_address(page) +
1192 (poff * sb->s_blocksize));
1193 }
1194 unlock_page(page);
1195 }
1196 }
1197 if (page == NULL || !PageUptodate(page))
1198 goto err;
1199 e4b->bd_bitmap_page = page;
1200 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1201 mark_page_accessed(page);
1202
1203 block++;
1204 pnum = block / blocks_per_page;
1205 poff = block % blocks_per_page;
1206
1207 page = find_get_page(inode->i_mapping, pnum);
1208 if (page == NULL || !PageUptodate(page)) {
1209 if (page)
1210 page_cache_release(page);
1211 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1212 if (page) {
1213 BUG_ON(page->mapping != inode->i_mapping);
1214 if (!PageUptodate(page))
1215 ext4_mb_init_cache(page, e4b->bd_bitmap);
1216
1217 unlock_page(page);
1218 }
1219 }
1220 if (page == NULL || !PageUptodate(page))
1221 goto err;
1222 e4b->bd_buddy_page = page;
1223 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1224 mark_page_accessed(page);
1225
1226 BUG_ON(e4b->bd_bitmap_page == NULL);
1227 BUG_ON(e4b->bd_buddy_page == NULL);
1228
1229 return 0;
1230
1231err:
1232 if (e4b->bd_bitmap_page)
1233 page_cache_release(e4b->bd_bitmap_page);
1234 if (e4b->bd_buddy_page)
1235 page_cache_release(e4b->bd_buddy_page);
1236 e4b->bd_buddy = NULL;
1237 e4b->bd_bitmap = NULL;
1238 return -EIO;
1239}
1240
1241static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1242{
1243 if (e4b->bd_bitmap_page)
1244 page_cache_release(e4b->bd_bitmap_page);
1245 if (e4b->bd_buddy_page)
1246 page_cache_release(e4b->bd_buddy_page);
1247}
1248
1249
1250static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1251{
1252 int order = 1;
1253 void *bb;
1254
1255 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
1256 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1257
1258 bb = EXT4_MB_BUDDY(e4b);
1259 while (order <= e4b->bd_blkbits + 1) {
1260 block = block >> 1;
1261 if (!mb_test_bit(block, bb)) {
1262 /* this block is part of buddy of order 'order' */
1263 return order;
1264 }
1265 bb += 1 << (e4b->bd_blkbits - order);
1266 order++;
1267 }
1268 return 0;
1269}
1270
1271static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1272{
1273 __u32 *addr;
1274
1275 len = cur + len;
1276 while (cur < len) {
1277 if ((cur & 31) == 0 && (len - cur) >= 32) {
1278 /* fast path: clear whole word at once */
1279 addr = bm + (cur >> 3);
1280 *addr = 0;
1281 cur += 32;
1282 continue;
1283 }
1284 mb_clear_bit_atomic(lock, cur, bm);
1285 cur++;
1286 }
1287}
1288
1289static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1290{
1291 __u32 *addr;
1292
1293 len = cur + len;
1294 while (cur < len) {
1295 if ((cur & 31) == 0 && (len - cur) >= 32) {
1296 /* fast path: set whole word at once */
1297 addr = bm + (cur >> 3);
1298 *addr = 0xffffffff;
1299 cur += 32;
1300 continue;
1301 }
1302 mb_set_bit_atomic(lock, cur, bm);
1303 cur++;
1304 }
1305}
1306
1307static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1308 int first, int count)
1309{
1310 int block = 0;
1311 int max = 0;
1312 int order;
1313 void *buddy;
1314 void *buddy2;
1315 struct super_block *sb = e4b->bd_sb;
1316
1317 BUG_ON(first + count > (sb->s_blocksize << 3));
1318 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
1319 mb_check_buddy(e4b);
1320 mb_free_blocks_double(inode, e4b, first, count);
1321
1322 e4b->bd_info->bb_free += count;
1323 if (first < e4b->bd_info->bb_first_free)
1324 e4b->bd_info->bb_first_free = first;
1325
1326 /* let's maintain fragments counter */
1327 if (first != 0)
1328 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
1329 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1330 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
1331 if (block && max)
1332 e4b->bd_info->bb_fragments--;
1333 else if (!block && !max)
1334 e4b->bd_info->bb_fragments++;
1335
1336 /* let's maintain buddy itself */
1337 while (count-- > 0) {
1338 block = first++;
1339 order = 0;
1340
1341 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1342 ext4_fsblk_t blocknr;
1343 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
1344 blocknr += block;
1345 blocknr +=
1346 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1347
1348 ext4_error(sb, __FUNCTION__, "double-free of inode"
1349 " %lu's block %llu(bit %u in group %lu)\n",
1350 inode ? inode->i_ino : 0, blocknr, block,
1351 e4b->bd_group);
1352 }
1353 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1354 e4b->bd_info->bb_counters[order]++;
1355
1356 /* start of the buddy */
1357 buddy = mb_find_buddy(e4b, order, &max);
1358
1359 do {
1360 block &= ~1UL;
1361 if (mb_test_bit(block, buddy) ||
1362 mb_test_bit(block + 1, buddy))
1363 break;
1364
1365 /* both the buddies are free, try to coalesce them */
1366 buddy2 = mb_find_buddy(e4b, order + 1, &max);
1367
1368 if (!buddy2)
1369 break;
1370
1371 if (order > 0) {
1372 /* for special purposes, we don't set
1373 * free bits in bitmap */
1374 mb_set_bit(block, buddy);
1375 mb_set_bit(block + 1, buddy);
1376 }
1377 e4b->bd_info->bb_counters[order]--;
1378 e4b->bd_info->bb_counters[order]--;
1379
1380 block = block >> 1;
1381 order++;
1382 e4b->bd_info->bb_counters[order]++;
1383
1384 mb_clear_bit(block, buddy2);
1385 buddy = buddy2;
1386 } while (1);
1387 }
1388 mb_check_buddy(e4b);
1389
1390 return 0;
1391}
1392
1393static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1394 int needed, struct ext4_free_extent *ex)
1395{
1396 int next = block;
1397 int max;
1398 int ord;
1399 void *buddy;
1400
1401 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
1402 BUG_ON(ex == NULL);
1403
1404 buddy = mb_find_buddy(e4b, order, &max);
1405 BUG_ON(buddy == NULL);
1406 BUG_ON(block >= max);
1407 if (mb_test_bit(block, buddy)) {
1408 ex->fe_len = 0;
1409 ex->fe_start = 0;
1410 ex->fe_group = 0;
1411 return 0;
1412 }
1413
1414 /* FIXME dorp order completely ? */
1415 if (likely(order == 0)) {
1416 /* find actual order */
1417 order = mb_find_order_for_block(e4b, block);
1418 block = block >> order;
1419 }
1420
1421 ex->fe_len = 1 << order;
1422 ex->fe_start = block << order;
1423 ex->fe_group = e4b->bd_group;
1424
1425 /* calc difference from given start */
1426 next = next - ex->fe_start;
1427 ex->fe_len -= next;
1428 ex->fe_start += next;
1429
1430 while (needed > ex->fe_len &&
1431 (buddy = mb_find_buddy(e4b, order, &max))) {
1432
1433 if (block + 1 >= max)
1434 break;
1435
1436 next = (block + 1) * (1 << order);
1437 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1438 break;
1439
1440 ord = mb_find_order_for_block(e4b, next);
1441
1442 order = ord;
1443 block = next >> order;
1444 ex->fe_len += 1 << order;
1445 }
1446
1447 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1448 return ex->fe_len;
1449}
1450
1451static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1452{
1453 int ord;
1454 int mlen = 0;
1455 int max = 0;
1456 int cur;
1457 int start = ex->fe_start;
1458 int len = ex->fe_len;
1459 unsigned ret = 0;
1460 int len0 = len;
1461 void *buddy;
1462
1463 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1464 BUG_ON(e4b->bd_group != ex->fe_group);
1465 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
1466 mb_check_buddy(e4b);
1467 mb_mark_used_double(e4b, start, len);
1468
1469 e4b->bd_info->bb_free -= len;
1470 if (e4b->bd_info->bb_first_free == start)
1471 e4b->bd_info->bb_first_free += len;
1472
1473 /* let's maintain fragments counter */
1474 if (start != 0)
1475 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
1476 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1477 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
1478 if (mlen && max)
1479 e4b->bd_info->bb_fragments++;
1480 else if (!mlen && !max)
1481 e4b->bd_info->bb_fragments--;
1482
1483 /* let's maintain buddy itself */
1484 while (len) {
1485 ord = mb_find_order_for_block(e4b, start);
1486
1487 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1488 /* the whole chunk may be allocated at once! */
1489 mlen = 1 << ord;
1490 buddy = mb_find_buddy(e4b, ord, &max);
1491 BUG_ON((start >> ord) >= max);
1492 mb_set_bit(start >> ord, buddy);
1493 e4b->bd_info->bb_counters[ord]--;
1494 start += mlen;
1495 len -= mlen;
1496 BUG_ON(len < 0);
1497 continue;
1498 }
1499
1500 /* store for history */
1501 if (ret == 0)
1502 ret = len | (ord << 16);
1503
1504 /* we have to split large buddy */
1505 BUG_ON(ord <= 0);
1506 buddy = mb_find_buddy(e4b, ord, &max);
1507 mb_set_bit(start >> ord, buddy);
1508 e4b->bd_info->bb_counters[ord]--;
1509
1510 ord--;
1511 cur = (start >> ord) & ~1U;
1512 buddy = mb_find_buddy(e4b, ord, &max);
1513 mb_clear_bit(cur, buddy);
1514 mb_clear_bit(cur + 1, buddy);
1515 e4b->bd_info->bb_counters[ord]++;
1516 e4b->bd_info->bb_counters[ord]++;
1517 }
1518
1519 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
1520 EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1521 mb_check_buddy(e4b);
1522
1523 return ret;
1524}
1525
1526/*
1527 * Must be called under group lock!
1528 */
1529static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1530 struct ext4_buddy *e4b)
1531{
1532 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1533 int ret;
1534
1535 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1536 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1537
1538 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1539 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1540 ret = mb_mark_used(e4b, &ac->ac_b_ex);
1541
1542 /* preallocation can change ac_b_ex, thus we store actually
1543 * allocated blocks for history */
1544 ac->ac_f_ex = ac->ac_b_ex;
1545
1546 ac->ac_status = AC_STATUS_FOUND;
1547 ac->ac_tail = ret & 0xffff;
1548 ac->ac_buddy = ret >> 16;
1549
1550 /* XXXXXXX: SUCH A HORRIBLE **CK */
1551 /*FIXME!! Why ? */
1552 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1553 get_page(ac->ac_bitmap_page);
1554 ac->ac_buddy_page = e4b->bd_buddy_page;
1555 get_page(ac->ac_buddy_page);
1556
1557 /* store last allocated for subsequent stream allocation */
1558 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1559 spin_lock(&sbi->s_md_lock);
1560 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1561 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1562 spin_unlock(&sbi->s_md_lock);
1563 }
1564}
1565
1566/*
1567 * regular allocator, for general purposes allocation
1568 */
1569
1570static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1571 struct ext4_buddy *e4b,
1572 int finish_group)
1573{
1574 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1575 struct ext4_free_extent *bex = &ac->ac_b_ex;
1576 struct ext4_free_extent *gex = &ac->ac_g_ex;
1577 struct ext4_free_extent ex;
1578 int max;
1579
1580 /*
1581 * We don't want to scan for a whole year
1582 */
1583 if (ac->ac_found > sbi->s_mb_max_to_scan &&
1584 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1585 ac->ac_status = AC_STATUS_BREAK;
1586 return;
1587 }
1588
1589 /*
1590 * Haven't found good chunk so far, let's continue
1591 */
1592 if (bex->fe_len < gex->fe_len)
1593 return;
1594
1595 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1596 && bex->fe_group == e4b->bd_group) {
1597 /* recheck chunk's availability - we don't know
1598 * when it was found (within this lock-unlock
1599 * period or not) */
1600 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
1601 if (max >= gex->fe_len) {
1602 ext4_mb_use_best_found(ac, e4b);
1603 return;
1604 }
1605 }
1606}
1607
1608/*
1609 * The routine checks whether found extent is good enough. If it is,
1610 * then the extent gets marked used and flag is set to the context
1611 * to stop scanning. Otherwise, the extent is compared with the
1612 * previous found extent and if new one is better, then it's stored
1613 * in the context. Later, the best found extent will be used, if
1614 * mballoc can't find good enough extent.
1615 *
1616 * FIXME: real allocation policy is to be designed yet!
1617 */
1618static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1619 struct ext4_free_extent *ex,
1620 struct ext4_buddy *e4b)
1621{
1622 struct ext4_free_extent *bex = &ac->ac_b_ex;
1623 struct ext4_free_extent *gex = &ac->ac_g_ex;
1624
1625 BUG_ON(ex->fe_len <= 0);
1626 BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1627 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1628 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1629
1630 ac->ac_found++;
1631
1632 /*
1633 * The special case - take what you catch first
1634 */
1635 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1636 *bex = *ex;
1637 ext4_mb_use_best_found(ac, e4b);
1638 return;
1639 }
1640
1641 /*
1642 * Let's check whether the chuck is good enough
1643 */
1644 if (ex->fe_len == gex->fe_len) {
1645 *bex = *ex;
1646 ext4_mb_use_best_found(ac, e4b);
1647 return;
1648 }
1649
1650 /*
1651 * If this is first found extent, just store it in the context
1652 */
1653 if (bex->fe_len == 0) {
1654 *bex = *ex;
1655 return;
1656 }
1657
1658 /*
1659 * If new found extent is better, store it in the context
1660 */
1661 if (bex->fe_len < gex->fe_len) {
1662 /* if the request isn't satisfied, any found extent
1663 * larger than previous best one is better */
1664 if (ex->fe_len > bex->fe_len)
1665 *bex = *ex;
1666 } else if (ex->fe_len > gex->fe_len) {
1667 /* if the request is satisfied, then we try to find
1668 * an extent that still satisfy the request, but is
1669 * smaller than previous one */
1670 if (ex->fe_len < bex->fe_len)
1671 *bex = *ex;
1672 }
1673
1674 ext4_mb_check_limits(ac, e4b, 0);
1675}
1676
1677static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1678 struct ext4_buddy *e4b)
1679{
1680 struct ext4_free_extent ex = ac->ac_b_ex;
1681 ext4_group_t group = ex.fe_group;
1682 int max;
1683 int err;
1684
1685 BUG_ON(ex.fe_len <= 0);
1686 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1687 if (err)
1688 return err;
1689
1690 ext4_lock_group(ac->ac_sb, group);
1691 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
1692
1693 if (max > 0) {
1694 ac->ac_b_ex = ex;
1695 ext4_mb_use_best_found(ac, e4b);
1696 }
1697
1698 ext4_unlock_group(ac->ac_sb, group);
1699 ext4_mb_release_desc(e4b);
1700
1701 return 0;
1702}
1703
1704static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1705 struct ext4_buddy *e4b)
1706{
1707 ext4_group_t group = ac->ac_g_ex.fe_group;
1708 int max;
1709 int err;
1710 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1711 struct ext4_super_block *es = sbi->s_es;
1712 struct ext4_free_extent ex;
1713
1714 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1715 return 0;
1716
1717 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1718 if (err)
1719 return err;
1720
1721 ext4_lock_group(ac->ac_sb, group);
1722 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
1723 ac->ac_g_ex.fe_len, &ex);
1724
1725 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1726 ext4_fsblk_t start;
1727
1728 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
1729 ex.fe_start + le32_to_cpu(es->s_first_data_block);
1730 /* use do_div to get remainder (would be 64-bit modulo) */
1731 if (do_div(start, sbi->s_stripe) == 0) {
1732 ac->ac_found++;
1733 ac->ac_b_ex = ex;
1734 ext4_mb_use_best_found(ac, e4b);
1735 }
1736 } else if (max >= ac->ac_g_ex.fe_len) {
1737 BUG_ON(ex.fe_len <= 0);
1738 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1739 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1740 ac->ac_found++;
1741 ac->ac_b_ex = ex;
1742 ext4_mb_use_best_found(ac, e4b);
1743 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1744 /* Sometimes, caller may want to merge even small
1745 * number of blocks to an existing extent */
1746 BUG_ON(ex.fe_len <= 0);
1747 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1748 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1749 ac->ac_found++;
1750 ac->ac_b_ex = ex;
1751 ext4_mb_use_best_found(ac, e4b);
1752 }
1753 ext4_unlock_group(ac->ac_sb, group);
1754 ext4_mb_release_desc(e4b);
1755
1756 return 0;
1757}
1758
1759/*
1760 * The routine scans buddy structures (not bitmap!) from given order
1761 * to max order and tries to find big enough chunk to satisfy the req
1762 */
1763static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1764 struct ext4_buddy *e4b)
1765{
1766 struct super_block *sb = ac->ac_sb;
1767 struct ext4_group_info *grp = e4b->bd_info;
1768 void *buddy;
1769 int i;
1770 int k;
1771 int max;
1772
1773 BUG_ON(ac->ac_2order <= 0);
1774 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1775 if (grp->bb_counters[i] == 0)
1776 continue;
1777
1778 buddy = mb_find_buddy(e4b, i, &max);
1779 BUG_ON(buddy == NULL);
1780
1781 k = ext4_find_next_zero_bit(buddy, max, 0);
1782 BUG_ON(k >= max);
1783
1784 ac->ac_found++;
1785
1786 ac->ac_b_ex.fe_len = 1 << i;
1787 ac->ac_b_ex.fe_start = k << i;
1788 ac->ac_b_ex.fe_group = e4b->bd_group;
1789
1790 ext4_mb_use_best_found(ac, e4b);
1791
1792 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1793
1794 if (EXT4_SB(sb)->s_mb_stats)
1795 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1796
1797 break;
1798 }
1799}
1800
1801/*
1802 * The routine scans the group and measures all found extents.
1803 * In order to optimize scanning, caller must pass number of
1804 * free blocks in the group, so the routine can know upper limit.
1805 */
1806static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1807 struct ext4_buddy *e4b)
1808{
1809 struct super_block *sb = ac->ac_sb;
1810 void *bitmap = EXT4_MB_BITMAP(e4b);
1811 struct ext4_free_extent ex;
1812 int i;
1813 int free;
1814
1815 free = e4b->bd_info->bb_free;
1816 BUG_ON(free <= 0);
1817
1818 i = e4b->bd_info->bb_first_free;
1819
1820 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1821 i = ext4_find_next_zero_bit(bitmap,
1822 EXT4_BLOCKS_PER_GROUP(sb), i);
1823 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
1824 BUG_ON(free != 0);
1825 break;
1826 }
1827
1828 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1829 BUG_ON(ex.fe_len <= 0);
1830 BUG_ON(free < ex.fe_len);
1831
1832 ext4_mb_measure_extent(ac, &ex, e4b);
1833
1834 i += ex.fe_len;
1835 free -= ex.fe_len;
1836 }
1837
1838 ext4_mb_check_limits(ac, e4b, 1);
1839}
1840
1841/*
1842 * This is a special case for storages like raid5
1843 * we try to find stripe-aligned chunks for stripe-size requests
1844 * XXX should do so at least for multiples of stripe size as well
1845 */
1846static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1847 struct ext4_buddy *e4b)
1848{
1849 struct super_block *sb = ac->ac_sb;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 void *bitmap = EXT4_MB_BITMAP(e4b);
1852 struct ext4_free_extent ex;
1853 ext4_fsblk_t first_group_block;
1854 ext4_fsblk_t a;
1855 ext4_grpblk_t i;
1856 int max;
1857
1858 BUG_ON(sbi->s_stripe == 0);
1859
1860 /* find first stripe-aligned block in group */
1861 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
1862 + le32_to_cpu(sbi->s_es->s_first_data_block);
1863 a = first_group_block + sbi->s_stripe - 1;
1864 do_div(a, sbi->s_stripe);
1865 i = (a * sbi->s_stripe) - first_group_block;
1866
1867 while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
1868 if (!mb_test_bit(i, bitmap)) {
1869 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1870 if (max >= sbi->s_stripe) {
1871 ac->ac_found++;
1872 ac->ac_b_ex = ex;
1873 ext4_mb_use_best_found(ac, e4b);
1874 break;
1875 }
1876 }
1877 i += sbi->s_stripe;
1878 }
1879}
1880
1881static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1882 ext4_group_t group, int cr)
1883{
1884 unsigned free, fragments;
1885 unsigned i, bits;
1886 struct ext4_group_desc *desc;
1887 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1888
1889 BUG_ON(cr < 0 || cr >= 4);
1890 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
1891
1892 free = grp->bb_free;
1893 fragments = grp->bb_fragments;
1894 if (free == 0)
1895 return 0;
1896 if (fragments == 0)
1897 return 0;
1898
1899 switch (cr) {
1900 case 0:
1901 BUG_ON(ac->ac_2order == 0);
1902 /* If this group is uninitialized, skip it initially */
1903 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1904 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1905 return 0;
1906
1907 bits = ac->ac_sb->s_blocksize_bits + 1;
1908 for (i = ac->ac_2order; i <= bits; i++)
1909 if (grp->bb_counters[i] > 0)
1910 return 1;
1911 break;
1912 case 1:
1913 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1914 return 1;
1915 break;
1916 case 2:
1917 if (free >= ac->ac_g_ex.fe_len)
1918 return 1;
1919 break;
1920 case 3:
1921 return 1;
1922 default:
1923 BUG();
1924 }
1925
1926 return 0;
1927}
1928
1929static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1930{
1931 ext4_group_t group;
1932 ext4_group_t i;
1933 int cr;
1934 int err = 0;
1935 int bsbits;
1936 struct ext4_sb_info *sbi;
1937 struct super_block *sb;
1938 struct ext4_buddy e4b;
1939 loff_t size, isize;
1940
1941 sb = ac->ac_sb;
1942 sbi = EXT4_SB(sb);
1943 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1944
1945 /* first, try the goal */
1946 err = ext4_mb_find_by_goal(ac, &e4b);
1947 if (err || ac->ac_status == AC_STATUS_FOUND)
1948 goto out;
1949
1950 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
1951 goto out;
1952
1953 /*
1954 * ac->ac2_order is set only if the fe_len is a power of 2
1955 * if ac2_order is set we also set criteria to 0 so that we
1956 * try exact allocation using buddy.
1957 */
1958 i = fls(ac->ac_g_ex.fe_len);
1959 ac->ac_2order = 0;
1960 /*
1961 * We search using buddy data only if the order of the request
1962 * is greater than equal to the sbi_s_mb_order2_reqs
1963 * You can tune it via /proc/fs/ext4/<partition>/order2_req
1964 */
1965 if (i >= sbi->s_mb_order2_reqs) {
1966 /*
1967 * This should tell if fe_len is exactly power of 2
1968 */
1969 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
1970 ac->ac_2order = i - 1;
1971 }
1972
1973 bsbits = ac->ac_sb->s_blocksize_bits;
1974 /* if stream allocation is enabled, use global goal */
1975 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1976 isize = i_size_read(ac->ac_inode) >> bsbits;
1977 if (size < isize)
1978 size = isize;
1979
1980 if (size < sbi->s_mb_stream_request &&
1981 (ac->ac_flags & EXT4_MB_HINT_DATA)) {
1982 /* TBD: may be hot point */
1983 spin_lock(&sbi->s_md_lock);
1984 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1985 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1986 spin_unlock(&sbi->s_md_lock);
1987 }
1988
1989 /* searching for the right group start from the goal value specified */
1990 group = ac->ac_g_ex.fe_group;
1991
1992 /* Let's just scan groups to find more-less suitable blocks */
1993 cr = ac->ac_2order ? 0 : 1;
1994 /*
1995 * cr == 0 try to get exact allocation,
1996 * cr == 3 try to get anything
1997 */
1998repeat:
1999 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2000 ac->ac_criteria = cr;
2001 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
2002 struct ext4_group_info *grp;
2003 struct ext4_group_desc *desc;
2004
2005 if (group == EXT4_SB(sb)->s_groups_count)
2006 group = 0;
2007
2008 /* quick check to skip empty groups */
2009 grp = ext4_get_group_info(ac->ac_sb, group);
2010 if (grp->bb_free == 0)
2011 continue;
2012
2013 /*
2014 * if the group is already init we check whether it is
2015 * a good group and if not we don't load the buddy
2016 */
2017 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2018 /*
2019 * we need full data about the group
2020 * to make a good selection
2021 */
2022 err = ext4_mb_load_buddy(sb, group, &e4b);
2023 if (err)
2024 goto out;
2025 ext4_mb_release_desc(&e4b);
2026 }
2027
2028 /*
2029 * If the particular group doesn't satisfy our
2030 * criteria we continue with the next group
2031 */
2032 if (!ext4_mb_good_group(ac, group, cr))
2033 continue;
2034
2035 err = ext4_mb_load_buddy(sb, group, &e4b);
2036 if (err)
2037 goto out;
2038
2039 ext4_lock_group(sb, group);
2040 if (!ext4_mb_good_group(ac, group, cr)) {
2041 /* someone did allocation from this group */
2042 ext4_unlock_group(sb, group);
2043 ext4_mb_release_desc(&e4b);
2044 continue;
2045 }
2046
2047 ac->ac_groups_scanned++;
2048 desc = ext4_get_group_desc(sb, group, NULL);
2049 if (cr == 0 || (desc->bg_flags &
2050 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
2051 ac->ac_2order != 0))
2052 ext4_mb_simple_scan_group(ac, &e4b);
2053 else if (cr == 1 &&
2054 ac->ac_g_ex.fe_len == sbi->s_stripe)
2055 ext4_mb_scan_aligned(ac, &e4b);
2056 else
2057 ext4_mb_complex_scan_group(ac, &e4b);
2058
2059 ext4_unlock_group(sb, group);
2060 ext4_mb_release_desc(&e4b);
2061
2062 if (ac->ac_status != AC_STATUS_CONTINUE)
2063 break;
2064 }
2065 }
2066
2067 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2068 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2069 /*
2070 * We've been searching too long. Let's try to allocate
2071 * the best chunk we've found so far
2072 */
2073
2074 ext4_mb_try_best_found(ac, &e4b);
2075 if (ac->ac_status != AC_STATUS_FOUND) {
2076 /*
2077 * Someone more lucky has already allocated it.
2078 * The only thing we can do is just take first
2079 * found block(s)
2080 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2081 */
2082 ac->ac_b_ex.fe_group = 0;
2083 ac->ac_b_ex.fe_start = 0;
2084 ac->ac_b_ex.fe_len = 0;
2085 ac->ac_status = AC_STATUS_CONTINUE;
2086 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2087 cr = 3;
2088 atomic_inc(&sbi->s_mb_lost_chunks);
2089 goto repeat;
2090 }
2091 }
2092out:
2093 return err;
2094}
2095
2096#ifdef EXT4_MB_HISTORY
2097struct ext4_mb_proc_session {
2098 struct ext4_mb_history *history;
2099 struct super_block *sb;
2100 int start;
2101 int max;
2102};
2103
2104static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2105 struct ext4_mb_history *hs,
2106 int first)
2107{
2108 if (hs == s->history + s->max)
2109 hs = s->history;
2110 if (!first && hs == s->history + s->start)
2111 return NULL;
2112 while (hs->orig.fe_len == 0) {
2113 hs++;
2114 if (hs == s->history + s->max)
2115 hs = s->history;
2116 if (hs == s->history + s->start)
2117 return NULL;
2118 }
2119 return hs;
2120}
2121
2122static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2123{
2124 struct ext4_mb_proc_session *s = seq->private;
2125 struct ext4_mb_history *hs;
2126 int l = *pos;
2127
2128 if (l == 0)
2129 return SEQ_START_TOKEN;
2130 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2131 if (!hs)
2132 return NULL;
2133 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2134 return hs;
2135}
2136
2137static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2138 loff_t *pos)
2139{
2140 struct ext4_mb_proc_session *s = seq->private;
2141 struct ext4_mb_history *hs = v;
2142
2143 ++*pos;
2144 if (v == SEQ_START_TOKEN)
2145 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2146 else
2147 return ext4_mb_history_skip_empty(s, ++hs, 0);
2148}
2149
2150static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2151{
2152 char buf[25], buf2[25], buf3[25], *fmt;
2153 struct ext4_mb_history *hs = v;
2154
2155 if (v == SEQ_START_TOKEN) {
2156 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2157 "%-5s %-2s %-5s %-5s %-5s %-6s\n",
2158 "pid", "inode", "original", "goal", "result", "found",
2159 "grps", "cr", "flags", "merge", "tail", "broken");
2160 return 0;
2161 }
2162
2163 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2164 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2165 "%-5u %-5s %-5u %-6u\n";
2166 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
2167 hs->result.fe_start, hs->result.fe_len,
2168 hs->result.fe_logical);
2169 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
2170 hs->orig.fe_start, hs->orig.fe_len,
2171 hs->orig.fe_logical);
2172 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
2173 hs->goal.fe_start, hs->goal.fe_len,
2174 hs->goal.fe_logical);
2175 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2176 hs->found, hs->groups, hs->cr, hs->flags,
2177 hs->merged ? "M" : "", hs->tail,
2178 hs->buddy ? 1 << hs->buddy : 0);
2179 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2180 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2181 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
2182 hs->result.fe_start, hs->result.fe_len,
2183 hs->result.fe_logical);
2184 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
2185 hs->orig.fe_start, hs->orig.fe_len,
2186 hs->orig.fe_logical);
2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2188 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2189 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
2190 hs->result.fe_start, hs->result.fe_len);
2191 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2192 hs->pid, hs->ino, buf2);
2193 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2194 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
2195 hs->result.fe_start, hs->result.fe_len);
2196 seq_printf(seq, "%-5u %-8u %-23s free\n",
2197 hs->pid, hs->ino, buf2);
2198 }
2199 return 0;
2200}
2201
2202static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2203{
2204}
2205
2206static struct seq_operations ext4_mb_seq_history_ops = {
2207 .start = ext4_mb_seq_history_start,
2208 .next = ext4_mb_seq_history_next,
2209 .stop = ext4_mb_seq_history_stop,
2210 .show = ext4_mb_seq_history_show,
2211};
2212
2213static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2214{
2215 struct super_block *sb = PDE(inode)->data;
2216 struct ext4_sb_info *sbi = EXT4_SB(sb);
2217 struct ext4_mb_proc_session *s;
2218 int rc;
2219 int size;
2220
2221 s = kmalloc(sizeof(*s), GFP_KERNEL);
2222 if (s == NULL)
2223 return -ENOMEM;
2224 s->sb = sb;
2225 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2226 s->history = kmalloc(size, GFP_KERNEL);
2227 if (s->history == NULL) {
2228 kfree(s);
2229 return -ENOMEM;
2230 }
2231
2232 spin_lock(&sbi->s_mb_history_lock);
2233 memcpy(s->history, sbi->s_mb_history, size);
2234 s->max = sbi->s_mb_history_max;
2235 s->start = sbi->s_mb_history_cur % s->max;
2236 spin_unlock(&sbi->s_mb_history_lock);
2237
2238 rc = seq_open(file, &ext4_mb_seq_history_ops);
2239 if (rc == 0) {
2240 struct seq_file *m = (struct seq_file *)file->private_data;
2241 m->private = s;
2242 } else {
2243 kfree(s->history);
2244 kfree(s);
2245 }
2246 return rc;
2247
2248}
2249
2250static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2251{
2252 struct seq_file *seq = (struct seq_file *)file->private_data;
2253 struct ext4_mb_proc_session *s = seq->private;
2254 kfree(s->history);
2255 kfree(s);
2256 return seq_release(inode, file);
2257}
2258
2259static ssize_t ext4_mb_seq_history_write(struct file *file,
2260 const char __user *buffer,
2261 size_t count, loff_t *ppos)
2262{
2263 struct seq_file *seq = (struct seq_file *)file->private_data;
2264 struct ext4_mb_proc_session *s = seq->private;
2265 struct super_block *sb = s->sb;
2266 char str[32];
2267 int value;
2268
2269 if (count >= sizeof(str)) {
2270 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2271 "mb_history", (int)sizeof(str));
2272 return -EOVERFLOW;
2273 }
2274
2275 if (copy_from_user(str, buffer, count))
2276 return -EFAULT;
2277
2278 value = simple_strtol(str, NULL, 0);
2279 if (value < 0)
2280 return -ERANGE;
2281 EXT4_SB(sb)->s_mb_history_filter = value;
2282
2283 return count;
2284}
2285
2286static struct file_operations ext4_mb_seq_history_fops = {
2287 .owner = THIS_MODULE,
2288 .open = ext4_mb_seq_history_open,
2289 .read = seq_read,
2290 .write = ext4_mb_seq_history_write,
2291 .llseek = seq_lseek,
2292 .release = ext4_mb_seq_history_release,
2293};
2294
2295static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2296{
2297 struct super_block *sb = seq->private;
2298 struct ext4_sb_info *sbi = EXT4_SB(sb);
2299 ext4_group_t group;
2300
2301 if (*pos < 0 || *pos >= sbi->s_groups_count)
2302 return NULL;
2303
2304 group = *pos + 1;
2305 return (void *) group;
2306}
2307
2308static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2309{
2310 struct super_block *sb = seq->private;
2311 struct ext4_sb_info *sbi = EXT4_SB(sb);
2312 ext4_group_t group;
2313
2314 ++*pos;
2315 if (*pos < 0 || *pos >= sbi->s_groups_count)
2316 return NULL;
2317 group = *pos + 1;
2318 return (void *) group;;
2319}
2320
2321static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2322{
2323 struct super_block *sb = seq->private;
2324 long group = (long) v;
2325 int i;
2326 int err;
2327 struct ext4_buddy e4b;
2328 struct sg {
2329 struct ext4_group_info info;
2330 unsigned short counters[16];
2331 } sg;
2332
2333 group--;
2334 if (group == 0)
2335 seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2336 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2337 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2338 "group", "free", "frags", "first",
2339 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2340 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2341
2342 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2343 sizeof(struct ext4_group_info);
2344 err = ext4_mb_load_buddy(sb, group, &e4b);
2345 if (err) {
2346 seq_printf(seq, "#%-5lu: I/O error\n", group);
2347 return 0;
2348 }
2349 ext4_lock_group(sb, group);
2350 memcpy(&sg, ext4_get_group_info(sb, group), i);
2351 ext4_unlock_group(sb, group);
2352 ext4_mb_release_desc(&e4b);
2353
2354 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
2355 sg.info.bb_fragments, sg.info.bb_first_free);
2356 for (i = 0; i <= 13; i++)
2357 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2358 sg.info.bb_counters[i] : 0);
2359 seq_printf(seq, " ]\n");
2360
2361 return 0;
2362}
2363
2364static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2365{
2366}
2367
2368static struct seq_operations ext4_mb_seq_groups_ops = {
2369 .start = ext4_mb_seq_groups_start,
2370 .next = ext4_mb_seq_groups_next,
2371 .stop = ext4_mb_seq_groups_stop,
2372 .show = ext4_mb_seq_groups_show,
2373};
2374
2375static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2376{
2377 struct super_block *sb = PDE(inode)->data;
2378 int rc;
2379
2380 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2381 if (rc == 0) {
2382 struct seq_file *m = (struct seq_file *)file->private_data;
2383 m->private = sb;
2384 }
2385 return rc;
2386
2387}
2388
2389static struct file_operations ext4_mb_seq_groups_fops = {
2390 .owner = THIS_MODULE,
2391 .open = ext4_mb_seq_groups_open,
2392 .read = seq_read,
2393 .llseek = seq_lseek,
2394 .release = seq_release,
2395};
2396
2397static void ext4_mb_history_release(struct super_block *sb)
2398{
2399 struct ext4_sb_info *sbi = EXT4_SB(sb);
2400
2401 remove_proc_entry("mb_groups", sbi->s_mb_proc);
2402 remove_proc_entry("mb_history", sbi->s_mb_proc);
2403
2404 kfree(sbi->s_mb_history);
2405}
2406
2407static void ext4_mb_history_init(struct super_block *sb)
2408{
2409 struct ext4_sb_info *sbi = EXT4_SB(sb);
2410 int i;
2411
2412 if (sbi->s_mb_proc != NULL) {
2413 struct proc_dir_entry *p;
2414 p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
2415 if (p) {
2416 p->proc_fops = &ext4_mb_seq_history_fops;
2417 p->data = sb;
2418 }
2419 p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
2420 if (p) {
2421 p->proc_fops = &ext4_mb_seq_groups_fops;
2422 p->data = sb;
2423 }
2424 }
2425
2426 sbi->s_mb_history_max = 1000;
2427 sbi->s_mb_history_cur = 0;
2428 spin_lock_init(&sbi->s_mb_history_lock);
2429 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2430 sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
2431 if (likely(sbi->s_mb_history != NULL))
2432 memset(sbi->s_mb_history, 0, i);
2433 /* if we can't allocate history, then we simple won't use it */
2434}
2435
2436static void ext4_mb_store_history(struct ext4_allocation_context *ac)
2437{
2438 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2439 struct ext4_mb_history h;
2440
2441 if (unlikely(sbi->s_mb_history == NULL))
2442 return;
2443
2444 if (!(ac->ac_op & sbi->s_mb_history_filter))
2445 return;
2446
2447 h.op = ac->ac_op;
2448 h.pid = current->pid;
2449 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2450 h.orig = ac->ac_o_ex;
2451 h.result = ac->ac_b_ex;
2452 h.flags = ac->ac_flags;
2453 h.found = ac->ac_found;
2454 h.groups = ac->ac_groups_scanned;
2455 h.cr = ac->ac_criteria;
2456 h.tail = ac->ac_tail;
2457 h.buddy = ac->ac_buddy;
2458 h.merged = 0;
2459 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2460 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2461 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2462 h.merged = 1;
2463 h.goal = ac->ac_g_ex;
2464 h.result = ac->ac_f_ex;
2465 }
2466
2467 spin_lock(&sbi->s_mb_history_lock);
2468 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2469 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2470 sbi->s_mb_history_cur = 0;
2471 spin_unlock(&sbi->s_mb_history_lock);
2472}
2473
2474#else
2475#define ext4_mb_history_release(sb)
2476#define ext4_mb_history_init(sb)
2477#endif
2478
2479static int ext4_mb_init_backend(struct super_block *sb)
2480{
2481 ext4_group_t i;
2482 int j, len, metalen;
2483 struct ext4_sb_info *sbi = EXT4_SB(sb);
2484 int num_meta_group_infos =
2485 (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2486 EXT4_DESC_PER_BLOCK_BITS(sb);
2487 struct ext4_group_info **meta_group_info;
2488
2489 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2490 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2491 * So a two level scheme suffices for now. */
2492 sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
2493 num_meta_group_infos, GFP_KERNEL);
2494 if (sbi->s_group_info == NULL) {
2495 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2496 return -ENOMEM;
2497 }
2498 sbi->s_buddy_cache = new_inode(sb);
2499 if (sbi->s_buddy_cache == NULL) {
2500 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2501 goto err_freesgi;
2502 }
2503 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2504
2505 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2506 for (i = 0; i < num_meta_group_infos; i++) {
2507 if ((i + 1) == num_meta_group_infos)
2508 metalen = sizeof(*meta_group_info) *
2509 (sbi->s_groups_count -
2510 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2511 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2512 if (meta_group_info == NULL) {
2513 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2514 "buddy group\n");
2515 goto err_freemeta;
2516 }
2517 sbi->s_group_info[i] = meta_group_info;
2518 }
2519
2520 /*
2521 * calculate needed size. if change bb_counters size,
2522 * don't forget about ext4_mb_generate_buddy()
2523 */
2524 len = sizeof(struct ext4_group_info);
2525 len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
2526 for (i = 0; i < sbi->s_groups_count; i++) {
2527 struct ext4_group_desc *desc;
2528
2529 meta_group_info =
2530 sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2531 j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
2532
2533 meta_group_info[j] = kzalloc(len, GFP_KERNEL);
2534 if (meta_group_info[j] == NULL) {
2535 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2536 i--;
2537 goto err_freebuddy;
2538 }
2539 desc = ext4_get_group_desc(sb, i, NULL);
2540 if (desc == NULL) {
2541 printk(KERN_ERR
2542 "EXT4-fs: can't read descriptor %lu\n", i);
2543 goto err_freebuddy;
2544 }
2545 memset(meta_group_info[j], 0, len);
2546 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2547 &(meta_group_info[j]->bb_state));
2548
2549 /*
2550 * initialize bb_free to be able to skip
2551 * empty groups without initialization
2552 */
2553 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2554 meta_group_info[j]->bb_free =
2555 ext4_free_blocks_after_init(sb, i, desc);
2556 } else {
2557 meta_group_info[j]->bb_free =
2558 le16_to_cpu(desc->bg_free_blocks_count);
2559 }
2560
2561 INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
2562
2563#ifdef DOUBLE_CHECK
2564 {
2565 struct buffer_head *bh;
2566 meta_group_info[j]->bb_bitmap =
2567 kmalloc(sb->s_blocksize, GFP_KERNEL);
2568 BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
2569 bh = read_block_bitmap(sb, i);
2570 BUG_ON(bh == NULL);
2571 memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
2572 sb->s_blocksize);
2573 put_bh(bh);
2574 }
2575#endif
2576
2577 }
2578
2579 return 0;
2580
2581err_freebuddy:
2582 while (i >= 0) {
2583 kfree(ext4_get_group_info(sb, i));
2584 i--;
2585 }
2586 i = num_meta_group_infos;
2587err_freemeta:
2588 while (--i >= 0)
2589 kfree(sbi->s_group_info[i]);
2590 iput(sbi->s_buddy_cache);
2591err_freesgi:
2592 kfree(sbi->s_group_info);
2593 return -ENOMEM;
2594}
2595
2596int ext4_mb_init(struct super_block *sb, int needs_recovery)
2597{
2598 struct ext4_sb_info *sbi = EXT4_SB(sb);
2599 unsigned i;
2600 unsigned offset;
2601 unsigned max;
2602
2603 if (!test_opt(sb, MBALLOC))
2604 return 0;
2605
2606 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2607
2608 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2609 if (sbi->s_mb_offsets == NULL) {
2610 clear_opt(sbi->s_mount_opt, MBALLOC);
2611 return -ENOMEM;
2612 }
2613 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2614 if (sbi->s_mb_maxs == NULL) {
2615 clear_opt(sbi->s_mount_opt, MBALLOC);
2616 kfree(sbi->s_mb_maxs);
2617 return -ENOMEM;
2618 }
2619
2620 /* order 0 is regular bitmap */
2621 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2622 sbi->s_mb_offsets[0] = 0;
2623
2624 i = 1;
2625 offset = 0;
2626 max = sb->s_blocksize << 2;
2627 do {
2628 sbi->s_mb_offsets[i] = offset;
2629 sbi->s_mb_maxs[i] = max;
2630 offset += 1 << (sb->s_blocksize_bits - i);
2631 max = max >> 1;
2632 i++;
2633 } while (i <= sb->s_blocksize_bits + 1);
2634
2635 /* init file for buddy data */
2636 i = ext4_mb_init_backend(sb);
2637 if (i) {
2638 clear_opt(sbi->s_mount_opt, MBALLOC);
2639 kfree(sbi->s_mb_offsets);
2640 kfree(sbi->s_mb_maxs);
2641 return i;
2642 }
2643
2644 spin_lock_init(&sbi->s_md_lock);
2645 INIT_LIST_HEAD(&sbi->s_active_transaction);
2646 INIT_LIST_HEAD(&sbi->s_closed_transaction);
2647 INIT_LIST_HEAD(&sbi->s_committed_transaction);
2648 spin_lock_init(&sbi->s_bal_lock);
2649
2650 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2651 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2652 sbi->s_mb_stats = MB_DEFAULT_STATS;
2653 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2654 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2655 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2656 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2657
2658 i = sizeof(struct ext4_locality_group) * NR_CPUS;
2659 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2660 if (sbi->s_locality_groups == NULL) {
2661 clear_opt(sbi->s_mount_opt, MBALLOC);
2662 kfree(sbi->s_mb_offsets);
2663 kfree(sbi->s_mb_maxs);
2664 return -ENOMEM;
2665 }
2666 for (i = 0; i < NR_CPUS; i++) {
2667 struct ext4_locality_group *lg;
2668 lg = &sbi->s_locality_groups[i];
2669 mutex_init(&lg->lg_mutex);
2670 INIT_LIST_HEAD(&lg->lg_prealloc_list);
2671 spin_lock_init(&lg->lg_prealloc_lock);
2672 }
2673
2674 ext4_mb_init_per_dev_proc(sb);
2675 ext4_mb_history_init(sb);
2676
2677 printk("EXT4-fs: mballoc enabled\n");
2678 return 0;
2679}
2680
2681/* need to called with ext4 group lock (ext4_lock_group) */
2682static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2683{
2684 struct ext4_prealloc_space *pa;
2685 struct list_head *cur, *tmp;
2686 int count = 0;
2687
2688 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2689 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2690 list_del(&pa->pa_group_list);
2691 count++;
2692 kfree(pa);
2693 }
2694 if (count)
2695 mb_debug("mballoc: %u PAs left\n", count);
2696
2697}
2698
2699int ext4_mb_release(struct super_block *sb)
2700{
2701 ext4_group_t i;
2702 int num_meta_group_infos;
2703 struct ext4_group_info *grinfo;
2704 struct ext4_sb_info *sbi = EXT4_SB(sb);
2705
2706 if (!test_opt(sb, MBALLOC))
2707 return 0;
2708
2709 /* release freed, non-committed blocks */
2710 spin_lock(&sbi->s_md_lock);
2711 list_splice_init(&sbi->s_closed_transaction,
2712 &sbi->s_committed_transaction);
2713 list_splice_init(&sbi->s_active_transaction,
2714 &sbi->s_committed_transaction);
2715 spin_unlock(&sbi->s_md_lock);
2716 ext4_mb_free_committed_blocks(sb);
2717
2718 if (sbi->s_group_info) {
2719 for (i = 0; i < sbi->s_groups_count; i++) {
2720 grinfo = ext4_get_group_info(sb, i);
2721#ifdef DOUBLE_CHECK
2722 kfree(grinfo->bb_bitmap);
2723#endif
2724 ext4_lock_group(sb, i);
2725 ext4_mb_cleanup_pa(grinfo);
2726 ext4_unlock_group(sb, i);
2727 kfree(grinfo);
2728 }
2729 num_meta_group_infos = (sbi->s_groups_count +
2730 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2731 EXT4_DESC_PER_BLOCK_BITS(sb);
2732 for (i = 0; i < num_meta_group_infos; i++)
2733 kfree(sbi->s_group_info[i]);
2734 kfree(sbi->s_group_info);
2735 }
2736 kfree(sbi->s_mb_offsets);
2737 kfree(sbi->s_mb_maxs);
2738 if (sbi->s_buddy_cache)
2739 iput(sbi->s_buddy_cache);
2740 if (sbi->s_mb_stats) {
2741 printk(KERN_INFO
2742 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
2743 atomic_read(&sbi->s_bal_allocated),
2744 atomic_read(&sbi->s_bal_reqs),
2745 atomic_read(&sbi->s_bal_success));
2746 printk(KERN_INFO
2747 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
2748 "%u 2^N hits, %u breaks, %u lost\n",
2749 atomic_read(&sbi->s_bal_ex_scanned),
2750 atomic_read(&sbi->s_bal_goals),
2751 atomic_read(&sbi->s_bal_2orders),
2752 atomic_read(&sbi->s_bal_breaks),
2753 atomic_read(&sbi->s_mb_lost_chunks));
2754 printk(KERN_INFO
2755 "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
2756 sbi->s_mb_buddies_generated++,
2757 sbi->s_mb_generation_time);
2758 printk(KERN_INFO
2759 "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
2760 atomic_read(&sbi->s_mb_preallocated),
2761 atomic_read(&sbi->s_mb_discarded));
2762 }
2763
2764 kfree(sbi->s_locality_groups);
2765
2766 ext4_mb_history_release(sb);
2767 ext4_mb_destroy_per_dev_proc(sb);
2768
2769 return 0;
2770}
2771
2772static void ext4_mb_free_committed_blocks(struct super_block *sb)
2773{
2774 struct ext4_sb_info *sbi = EXT4_SB(sb);
2775 int err;
2776 int i;
2777 int count = 0;
2778 int count2 = 0;
2779 struct ext4_free_metadata *md;
2780 struct ext4_buddy e4b;
2781
2782 if (list_empty(&sbi->s_committed_transaction))
2783 return;
2784
2785 /* there is committed blocks to be freed yet */
2786 do {
2787 /* get next array of blocks */
2788 md = NULL;
2789 spin_lock(&sbi->s_md_lock);
2790 if (!list_empty(&sbi->s_committed_transaction)) {
2791 md = list_entry(sbi->s_committed_transaction.next,
2792 struct ext4_free_metadata, list);
2793 list_del(&md->list);
2794 }
2795 spin_unlock(&sbi->s_md_lock);
2796
2797 if (md == NULL)
2798 break;
2799
2800 mb_debug("gonna free %u blocks in group %lu (0x%p):",
2801 md->num, md->group, md);
2802
2803 err = ext4_mb_load_buddy(sb, md->group, &e4b);
2804 /* we expect to find existing buddy because it's pinned */
2805 BUG_ON(err != 0);
2806
2807 /* there are blocks to put in buddy to make them really free */
2808 count += md->num;
2809 count2++;
2810 ext4_lock_group(sb, md->group);
2811 for (i = 0; i < md->num; i++) {
2812 mb_debug(" %u", md->blocks[i]);
2813 err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
2814 BUG_ON(err != 0);
2815 }
2816 mb_debug("\n");
2817 ext4_unlock_group(sb, md->group);
2818
2819 /* balance refcounts from ext4_mb_free_metadata() */
2820 page_cache_release(e4b.bd_buddy_page);
2821 page_cache_release(e4b.bd_bitmap_page);
2822
2823 kfree(md);
2824 ext4_mb_release_desc(&e4b);
2825
2826 } while (md);
2827
2828 mb_debug("freed %u blocks in %u structures\n", count, count2);
2829}
2830
2831#define EXT4_ROOT "ext4"
2832#define EXT4_MB_STATS_NAME "stats"
2833#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2834#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2835#define EXT4_MB_ORDER2_REQ "order2_req"
2836#define EXT4_MB_STREAM_REQ "stream_req"
2837#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2838
2839
2840
2841#define MB_PROC_VALUE_READ(name) \
2842static int ext4_mb_read_##name(char *page, char **start, \
2843 off_t off, int count, int *eof, void *data) \
2844{ \
2845 struct ext4_sb_info *sbi = data; \
2846 int len; \
2847 *eof = 1; \
2848 if (off != 0) \
2849 return 0; \
2850 len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
2851 *start = page; \
2852 return len; \
2853}
2854
2855#define MB_PROC_VALUE_WRITE(name) \
2856static int ext4_mb_write_##name(struct file *file, \
2857 const char __user *buf, unsigned long cnt, void *data) \
2858{ \
2859 struct ext4_sb_info *sbi = data; \
2860 char str[32]; \
2861 long value; \
2862 if (cnt >= sizeof(str)) \
2863 return -EINVAL; \
2864 if (copy_from_user(str, buf, cnt)) \
2865 return -EFAULT; \
2866 value = simple_strtol(str, NULL, 0); \
2867 if (value <= 0) \
2868 return -ERANGE; \
2869 sbi->s_mb_##name = value; \
2870 return cnt; \
2871}
2872
2873MB_PROC_VALUE_READ(stats);
2874MB_PROC_VALUE_WRITE(stats);
2875MB_PROC_VALUE_READ(max_to_scan);
2876MB_PROC_VALUE_WRITE(max_to_scan);
2877MB_PROC_VALUE_READ(min_to_scan);
2878MB_PROC_VALUE_WRITE(min_to_scan);
2879MB_PROC_VALUE_READ(order2_reqs);
2880MB_PROC_VALUE_WRITE(order2_reqs);
2881MB_PROC_VALUE_READ(stream_request);
2882MB_PROC_VALUE_WRITE(stream_request);
2883MB_PROC_VALUE_READ(group_prealloc);
2884MB_PROC_VALUE_WRITE(group_prealloc);
2885
2886#define MB_PROC_HANDLER(name, var) \
2887do { \
2888 proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
2889 if (proc == NULL) { \
2890 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2891 goto err_out; \
2892 } \
2893 proc->data = sbi; \
2894 proc->read_proc = ext4_mb_read_##var ; \
2895 proc->write_proc = ext4_mb_write_##var; \
2896} while (0)
2897
2898static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2899{
2900 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2901 struct ext4_sb_info *sbi = EXT4_SB(sb);
2902 struct proc_dir_entry *proc;
2903 char devname[64];
2904
2905 snprintf(devname, sizeof(devname) - 1, "%s",
2906 bdevname(sb->s_bdev, devname));
2907 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2908
2909 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2910 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2911 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2912 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2913 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2914 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2915
2916 return 0;
2917
2918err_out:
2919 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
2920 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
2921 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
2922 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
2923 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
2924 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
2925 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2926 remove_proc_entry(devname, proc_root_ext4);
2927 sbi->s_mb_proc = NULL;
2928
2929 return -ENOMEM;
2930}
2931
2932static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2933{
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935 char devname[64];
2936
2937 if (sbi->s_mb_proc == NULL)
2938 return -EINVAL;
2939
2940 snprintf(devname, sizeof(devname) - 1, "%s",
2941 bdevname(sb->s_bdev, devname));
2942 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
2943 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
2944 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
2945 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
2946 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
2947 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2948 remove_proc_entry(devname, proc_root_ext4);
2949
2950 return 0;
2951}
2952
2953int __init init_ext4_mballoc(void)
2954{
2955 ext4_pspace_cachep =
2956 kmem_cache_create("ext4_prealloc_space",
2957 sizeof(struct ext4_prealloc_space),
2958 0, SLAB_RECLAIM_ACCOUNT, NULL);
2959 if (ext4_pspace_cachep == NULL)
2960 return -ENOMEM;
2961
2962#ifdef CONFIG_PROC_FS
2963 proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
2964 if (proc_root_ext4 == NULL)
2965 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
2966#endif
2967
2968 return 0;
2969}
2970
2971void exit_ext4_mballoc(void)
2972{
2973 /* XXX: synchronize_rcu(); */
2974 kmem_cache_destroy(ext4_pspace_cachep);
2975#ifdef CONFIG_PROC_FS
2976 remove_proc_entry(EXT4_ROOT, proc_root_fs);
2977#endif
2978}
2979
2980
2981/*
2982 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
2983 * Returns 0 if success or error code
2984 */
2985static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2986 handle_t *handle)
2987{
2988 struct buffer_head *bitmap_bh = NULL;
2989 struct ext4_super_block *es;
2990 struct ext4_group_desc *gdp;
2991 struct buffer_head *gdp_bh;
2992 struct ext4_sb_info *sbi;
2993 struct super_block *sb;
2994 ext4_fsblk_t block;
2995 int err;
2996
2997 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2998 BUG_ON(ac->ac_b_ex.fe_len <= 0);
2999
3000 sb = ac->ac_sb;
3001 sbi = EXT4_SB(sb);
3002 es = sbi->s_es;
3003
3004 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
3005 gdp->bg_free_blocks_count);
3006
3007 err = -EIO;
3008 bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3009 if (!bitmap_bh)
3010 goto out_err;
3011
3012 err = ext4_journal_get_write_access(handle, bitmap_bh);
3013 if (err)
3014 goto out_err;
3015
3016 err = -EIO;
3017 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3018 if (!gdp)
3019 goto out_err;
3020
3021 err = ext4_journal_get_write_access(handle, gdp_bh);
3022 if (err)
3023 goto out_err;
3024
3025 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
3026 + ac->ac_b_ex.fe_start
3027 + le32_to_cpu(es->s_first_data_block);
3028
3029 if (block == ext4_block_bitmap(sb, gdp) ||
3030 block == ext4_inode_bitmap(sb, gdp) ||
3031 in_range(block, ext4_inode_table(sb, gdp),
3032 EXT4_SB(sb)->s_itb_per_group)) {
3033
3034 ext4_error(sb, __FUNCTION__,
3035 "Allocating block in system zone - block = %llu",
3036 block);
3037 }
3038#ifdef AGGRESSIVE_CHECK
3039 {
3040 int i;
3041 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3042 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3043 bitmap_bh->b_data));
3044 }
3045 }
3046#endif
3047 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
3048 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3049
3050 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3051 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3052 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3053 gdp->bg_free_blocks_count =
3054 cpu_to_le16(ext4_free_blocks_after_init(sb,
3055 ac->ac_b_ex.fe_group,
3056 gdp));
3057 }
3058 gdp->bg_free_blocks_count =
3059 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
3060 - ac->ac_b_ex.fe_len);
3061 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3062 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3063 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3064
3065 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
3066 if (err)
3067 goto out_err;
3068 err = ext4_journal_dirty_metadata(handle, gdp_bh);
3069
3070out_err:
3071 sb->s_dirt = 1;
3072 put_bh(bitmap_bh);
3073 return err;
3074}
3075
3076/*
3077 * here we normalize request for locality group
3078 * Group request are normalized to s_strip size if we set the same via mount
3079 * option. If not we set it to s_mb_group_prealloc which can be configured via
3080 * /proc/fs/ext4/<partition>/group_prealloc
3081 *
3082 * XXX: should we try to preallocate more than the group has now?
3083 */
3084static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3085{
3086 struct super_block *sb = ac->ac_sb;
3087 struct ext4_locality_group *lg = ac->ac_lg;
3088
3089 BUG_ON(lg == NULL);
3090 if (EXT4_SB(sb)->s_stripe)
3091 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3092 else
3093 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3094 mb_debug("#%u: goal %lu blocks for locality group\n",
3095 current->pid, ac->ac_g_ex.fe_len);
3096}
3097
3098/*
3099 * Normalization means making request better in terms of
3100 * size and alignment
3101 */
3102static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3103 struct ext4_allocation_request *ar)
3104{
3105 int bsbits, max;
3106 ext4_lblk_t end;
3107 struct list_head *cur;
3108 loff_t size, orig_size, start_off;
3109 ext4_lblk_t start, orig_start;
3110 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3111
3112 /* do normalize only data requests, metadata requests
3113 do not need preallocation */
3114 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3115 return;
3116
3117 /* sometime caller may want exact blocks */
3118 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3119 return;
3120
3121 /* caller may indicate that preallocation isn't
3122 * required (it's a tail, for example) */
3123 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3124 return;
3125
3126 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3127 ext4_mb_normalize_group_request(ac);
3128 return ;
3129 }
3130
3131 bsbits = ac->ac_sb->s_blocksize_bits;
3132
3133 /* first, let's learn actual file size
3134 * given current request is allocated */
3135 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
3136 size = size << bsbits;
3137 if (size < i_size_read(ac->ac_inode))
3138 size = i_size_read(ac->ac_inode);
3139
3140 /* max available blocks in a free group */
3141 max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
3142 EXT4_SB(ac->ac_sb)->s_itb_per_group;
3143
3144#define NRL_CHECK_SIZE(req, size, max,bits) \
3145 (req <= (size) || max <= ((size) >> bits))
3146
3147 /* first, try to predict filesize */
3148 /* XXX: should this table be tunable? */
3149 start_off = 0;
3150 if (size <= 16 * 1024) {
3151 size = 16 * 1024;
3152 } else if (size <= 32 * 1024) {
3153 size = 32 * 1024;
3154 } else if (size <= 64 * 1024) {
3155 size = 64 * 1024;
3156 } else if (size <= 128 * 1024) {
3157 size = 128 * 1024;
3158 } else if (size <= 256 * 1024) {
3159 size = 256 * 1024;
3160 } else if (size <= 512 * 1024) {
3161 size = 512 * 1024;
3162 } else if (size <= 1024 * 1024) {
3163 size = 1024 * 1024;
3164 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
3165 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3166 (20 - bsbits)) << 20;
3167 size = 1024 * 1024;
3168 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
3169 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3170 (22 - bsbits)) << 22;
3171 size = 4 * 1024 * 1024;
3172 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3173 (8<<20)>>bsbits, max, bsbits)) {
3174 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3175 (23 - bsbits)) << 23;
3176 size = 8 * 1024 * 1024;
3177 } else {
3178 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
3179 size = ac->ac_o_ex.fe_len << bsbits;
3180 }
3181 orig_size = size = size >> bsbits;
3182 orig_start = start = start_off >> bsbits;
3183
3184 /* don't cover already allocated blocks in selected range */
3185 if (ar->pleft && start <= ar->lleft) {
3186 size -= ar->lleft + 1 - start;
3187 start = ar->lleft + 1;
3188 }
3189 if (ar->pright && start + size - 1 >= ar->lright)
3190 size -= start + size - ar->lright;
3191
3192 end = start + size;
3193
3194 /* check we don't cross already preallocated blocks */
3195 rcu_read_lock();
3196 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3197 struct ext4_prealloc_space *pa;
3198 unsigned long pa_end;
3199
3200 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3201
3202 if (pa->pa_deleted)
3203 continue;
3204 spin_lock(&pa->pa_lock);
3205 if (pa->pa_deleted) {
3206 spin_unlock(&pa->pa_lock);
3207 continue;
3208 }
3209
3210 pa_end = pa->pa_lstart + pa->pa_len;
3211
3212 /* PA must not overlap original request */
3213 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3214 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3215
3216 /* skip PA normalized request doesn't overlap with */
3217 if (pa->pa_lstart >= end) {
3218 spin_unlock(&pa->pa_lock);
3219 continue;
3220 }
3221 if (pa_end <= start) {
3222 spin_unlock(&pa->pa_lock);
3223 continue;
3224 }
3225 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3226
3227 if (pa_end <= ac->ac_o_ex.fe_logical) {
3228 BUG_ON(pa_end < start);
3229 start = pa_end;
3230 }
3231
3232 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3233 BUG_ON(pa->pa_lstart > end);
3234 end = pa->pa_lstart;
3235 }
3236 spin_unlock(&pa->pa_lock);
3237 }
3238 rcu_read_unlock();
3239 size = end - start;
3240
3241 /* XXX: extra loop to check we really don't overlap preallocations */
3242 rcu_read_lock();
3243 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3244 struct ext4_prealloc_space *pa;
3245 unsigned long pa_end;
3246 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3247 spin_lock(&pa->pa_lock);
3248 if (pa->pa_deleted == 0) {
3249 pa_end = pa->pa_lstart + pa->pa_len;
3250 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3251 }
3252 spin_unlock(&pa->pa_lock);
3253 }
3254 rcu_read_unlock();
3255
3256 if (start + size <= ac->ac_o_ex.fe_logical &&
3257 start > ac->ac_o_ex.fe_logical) {
3258 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
3259 (unsigned long) start, (unsigned long) size,
3260 (unsigned long) ac->ac_o_ex.fe_logical);
3261 }
3262 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3263 start > ac->ac_o_ex.fe_logical);
3264 BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3265
3266 /* now prepare goal request */
3267
3268 /* XXX: is it better to align blocks WRT to logical
3269 * placement or satisfy big request as is */
3270 ac->ac_g_ex.fe_logical = start;
3271 ac->ac_g_ex.fe_len = size;
3272
3273 /* define goal start in order to merge */
3274 if (ar->pright && (ar->lright == (start + size))) {
3275 /* merge to the right */
3276 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3277 &ac->ac_f_ex.fe_group,
3278 &ac->ac_f_ex.fe_start);
3279 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3280 }
3281 if (ar->pleft && (ar->lleft + 1 == start)) {
3282 /* merge to the left */
3283 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3284 &ac->ac_f_ex.fe_group,
3285 &ac->ac_f_ex.fe_start);
3286 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3287 }
3288
3289 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
3290 (unsigned) orig_size, (unsigned) start);
3291}
3292
3293static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3294{
3295 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3296
3297 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3298 atomic_inc(&sbi->s_bal_reqs);
3299 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3300 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
3301 atomic_inc(&sbi->s_bal_success);
3302 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3303 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3304 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3305 atomic_inc(&sbi->s_bal_goals);
3306 if (ac->ac_found > sbi->s_mb_max_to_scan)
3307 atomic_inc(&sbi->s_bal_breaks);
3308 }
3309
3310 ext4_mb_store_history(ac);
3311}
3312
3313/*
3314 * use blocks preallocated to inode
3315 */
3316static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3317 struct ext4_prealloc_space *pa)
3318{
3319 ext4_fsblk_t start;
3320 ext4_fsblk_t end;
3321 int len;
3322
3323 /* found preallocated blocks, use them */
3324 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3325 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
3326 len = end - start;
3327 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3328 &ac->ac_b_ex.fe_start);
3329 ac->ac_b_ex.fe_len = len;
3330 ac->ac_status = AC_STATUS_FOUND;
3331 ac->ac_pa = pa;
3332
3333 BUG_ON(start < pa->pa_pstart);
3334 BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
3335 BUG_ON(pa->pa_free < len);
3336 pa->pa_free -= len;
3337
3338 mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa);
3339}
3340
3341/*
3342 * use blocks preallocated to locality group
3343 */
3344static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3345 struct ext4_prealloc_space *pa)
3346{
3347 unsigned len = ac->ac_o_ex.fe_len;
3348
3349 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3350 &ac->ac_b_ex.fe_group,
3351 &ac->ac_b_ex.fe_start);
3352 ac->ac_b_ex.fe_len = len;
3353 ac->ac_status = AC_STATUS_FOUND;
3354 ac->ac_pa = pa;
3355
3356 /* we don't correct pa_pstart or pa_plen here to avoid
3357 * possible race when tte group is being loaded concurrently
3358 * instead we correct pa later, after blocks are marked
3359 * in on-disk bitmap -- see ext4_mb_release_context() */
3360 /*
3361 * FIXME!! but the other CPUs can look at this particular
3362 * pa and think that it have enought free blocks if we
3363 * don't update pa_free here right ?
3364 */
3365 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3366}
3367
3368/*
3369 * search goal blocks in preallocated space
3370 */
3371static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3372{
3373 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3374 struct ext4_locality_group *lg;
3375 struct ext4_prealloc_space *pa;
3376 struct list_head *cur;
3377
3378 /* only data can be preallocated */
3379 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3380 return 0;
3381
3382 /* first, try per-file preallocation */
3383 rcu_read_lock();
3384 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3385 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3386
3387 /* all fields in this condition don't change,
3388 * so we can skip locking for them */
3389 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3390 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3391 continue;
3392
3393 /* found preallocated blocks, use them */
3394 spin_lock(&pa->pa_lock);
3395 if (pa->pa_deleted == 0 && pa->pa_free) {
3396 atomic_inc(&pa->pa_count);
3397 ext4_mb_use_inode_pa(ac, pa);
3398 spin_unlock(&pa->pa_lock);
3399 ac->ac_criteria = 10;
3400 rcu_read_unlock();
3401 return 1;
3402 }
3403 spin_unlock(&pa->pa_lock);
3404 }
3405 rcu_read_unlock();
3406
3407 /* can we use group allocation? */
3408 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3409 return 0;
3410
3411 /* inode may have no locality group for some reason */
3412 lg = ac->ac_lg;
3413 if (lg == NULL)
3414 return 0;
3415
3416 rcu_read_lock();
3417 list_for_each_rcu(cur, &lg->lg_prealloc_list) {
3418 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3419 spin_lock(&pa->pa_lock);
3420 if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
3421 atomic_inc(&pa->pa_count);
3422 ext4_mb_use_group_pa(ac, pa);
3423 spin_unlock(&pa->pa_lock);
3424 ac->ac_criteria = 20;
3425 rcu_read_unlock();
3426 return 1;
3427 }
3428 spin_unlock(&pa->pa_lock);
3429 }
3430 rcu_read_unlock();
3431
3432 return 0;
3433}
3434
3435/*
3436 * the function goes through all preallocation in this group and marks them
3437 * used in in-core bitmap. buddy must be generated from this bitmap
3438 * Need to be called with ext4 group lock (ext4_lock_group)
3439 */
3440static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3441 ext4_group_t group)
3442{
3443 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3444 struct ext4_prealloc_space *pa;
3445 struct list_head *cur;
3446 ext4_group_t groupnr;
3447 ext4_grpblk_t start;
3448 int preallocated = 0;
3449 int count = 0;
3450 int len;
3451
3452 /* all form of preallocation discards first load group,
3453 * so the only competing code is preallocation use.
3454 * we don't need any locking here
3455 * notice we do NOT ignore preallocations with pa_deleted
3456 * otherwise we could leave used blocks available for
3457 * allocation in buddy when concurrent ext4_mb_put_pa()
3458 * is dropping preallocation
3459 */
3460 list_for_each(cur, &grp->bb_prealloc_list) {
3461 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3462 spin_lock(&pa->pa_lock);
3463 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3464 &groupnr, &start);
3465 len = pa->pa_len;
3466 spin_unlock(&pa->pa_lock);
3467 if (unlikely(len == 0))
3468 continue;
3469 BUG_ON(groupnr != group);
3470 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3471 bitmap, start, len);
3472 preallocated += len;
3473 count++;
3474 }
3475 mb_debug("prellocated %u for group %lu\n", preallocated, group);
3476}
3477
3478static void ext4_mb_pa_callback(struct rcu_head *head)
3479{
3480 struct ext4_prealloc_space *pa;
3481 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3482 kmem_cache_free(ext4_pspace_cachep, pa);
3483}
3484
3485/*
3486 * drops a reference to preallocated space descriptor
3487 * if this was the last reference and the space is consumed
3488 */
3489static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3490 struct super_block *sb, struct ext4_prealloc_space *pa)
3491{
3492 unsigned long grp;
3493
3494 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3495 return;
3496
3497 /* in this short window concurrent discard can set pa_deleted */
3498 spin_lock(&pa->pa_lock);
3499 if (pa->pa_deleted == 1) {
3500 spin_unlock(&pa->pa_lock);
3501 return;
3502 }
3503
3504 pa->pa_deleted = 1;
3505 spin_unlock(&pa->pa_lock);
3506
3507 /* -1 is to protect from crossing allocation group */
3508 ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
3509
3510 /*
3511 * possible race:
3512 *
3513 * P1 (buddy init) P2 (regular allocation)
3514 * find block B in PA
3515 * copy on-disk bitmap to buddy
3516 * mark B in on-disk bitmap
3517 * drop PA from group
3518 * mark all PAs in buddy
3519 *
3520 * thus, P1 initializes buddy with B available. to prevent this
3521 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3522 * against that pair
3523 */
3524 ext4_lock_group(sb, grp);
3525 list_del(&pa->pa_group_list);
3526 ext4_unlock_group(sb, grp);
3527
3528 spin_lock(pa->pa_obj_lock);
3529 list_del_rcu(&pa->pa_inode_list);
3530 spin_unlock(pa->pa_obj_lock);
3531
3532 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3533}
3534
3535/*
3536 * creates new preallocated space for given inode
3537 */
3538static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3539{
3540 struct super_block *sb = ac->ac_sb;
3541 struct ext4_prealloc_space *pa;
3542 struct ext4_group_info *grp;
3543 struct ext4_inode_info *ei;
3544
3545 /* preallocate only when found space is larger then requested */
3546 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3547 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3548 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3549
3550 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3551 if (pa == NULL)
3552 return -ENOMEM;
3553
3554 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3555 int winl;
3556 int wins;
3557 int win;
3558 int offs;
3559
3560 /* we can't allocate as much as normalizer wants.
3561 * so, found space must get proper lstart
3562 * to cover original request */
3563 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3564 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3565
3566 /* we're limited by original request in that
3567 * logical block must be covered any way
3568 * winl is window we can move our chunk within */
3569 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3570
3571 /* also, we should cover whole original request */
3572 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
3573
3574 /* the smallest one defines real window */
3575 win = min(winl, wins);
3576
3577 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
3578 if (offs && offs < win)
3579 win = offs;
3580
3581 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
3582 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3583 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3584 }
3585
3586 /* preallocation can change ac_b_ex, thus we store actually
3587 * allocated blocks for history */
3588 ac->ac_f_ex = ac->ac_b_ex;
3589
3590 pa->pa_lstart = ac->ac_b_ex.fe_logical;
3591 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3592 pa->pa_len = ac->ac_b_ex.fe_len;
3593 pa->pa_free = pa->pa_len;
3594 atomic_set(&pa->pa_count, 1);
3595 spin_lock_init(&pa->pa_lock);
3596 pa->pa_deleted = 0;
3597 pa->pa_linear = 0;
3598
3599 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3600 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3601
3602 ext4_mb_use_inode_pa(ac, pa);
3603 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3604
3605 ei = EXT4_I(ac->ac_inode);
3606 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3607
3608 pa->pa_obj_lock = &ei->i_prealloc_lock;
3609 pa->pa_inode = ac->ac_inode;
3610
3611 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3612 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3613 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3614
3615 spin_lock(pa->pa_obj_lock);
3616 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3617 spin_unlock(pa->pa_obj_lock);
3618
3619 return 0;
3620}
3621
3622/*
3623 * creates new preallocated space for locality group inodes belongs to
3624 */
3625static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3626{
3627 struct super_block *sb = ac->ac_sb;
3628 struct ext4_locality_group *lg;
3629 struct ext4_prealloc_space *pa;
3630 struct ext4_group_info *grp;
3631
3632 /* preallocate only when found space is larger then requested */
3633 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3634 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3635 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3636
3637 BUG_ON(ext4_pspace_cachep == NULL);
3638 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3639 if (pa == NULL)
3640 return -ENOMEM;
3641
3642 /* preallocation can change ac_b_ex, thus we store actually
3643 * allocated blocks for history */
3644 ac->ac_f_ex = ac->ac_b_ex;
3645
3646 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3647 pa->pa_lstart = pa->pa_pstart;
3648 pa->pa_len = ac->ac_b_ex.fe_len;
3649 pa->pa_free = pa->pa_len;
3650 atomic_set(&pa->pa_count, 1);
3651 spin_lock_init(&pa->pa_lock);
3652 pa->pa_deleted = 0;
3653 pa->pa_linear = 1;
3654
3655 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3656 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3657
3658 ext4_mb_use_group_pa(ac, pa);
3659 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3660
3661 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3662 lg = ac->ac_lg;
3663 BUG_ON(lg == NULL);
3664
3665 pa->pa_obj_lock = &lg->lg_prealloc_lock;
3666 pa->pa_inode = NULL;
3667
3668 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3669 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3670 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3671
3672 spin_lock(pa->pa_obj_lock);
3673 list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
3674 spin_unlock(pa->pa_obj_lock);
3675
3676 return 0;
3677}
3678
3679static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3680{
3681 int err;
3682
3683 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3684 err = ext4_mb_new_group_pa(ac);
3685 else
3686 err = ext4_mb_new_inode_pa(ac);
3687 return err;
3688}
3689
3690/*
3691 * finds all unused blocks in on-disk bitmap, frees them in
3692 * in-core bitmap and buddy.
3693 * @pa must be unlinked from inode and group lists, so that
3694 * nobody else can find/use it.
3695 * the caller MUST hold group/inode locks.
3696 * TODO: optimize the case when there are no in-core structures yet
3697 */
3698static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
3699 struct buffer_head *bitmap_bh,
3700 struct ext4_prealloc_space *pa)
3701{
3702 struct ext4_allocation_context ac;
3703 struct super_block *sb = e4b->bd_sb;
3704 struct ext4_sb_info *sbi = EXT4_SB(sb);
3705 unsigned long end;
3706 unsigned long next;
3707 ext4_group_t group;
3708 ext4_grpblk_t bit;
3709 sector_t start;
3710 int err = 0;
3711 int free = 0;
3712
3713 BUG_ON(pa->pa_deleted == 0);
3714 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3715 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3716 end = bit + pa->pa_len;
3717
3718 ac.ac_sb = sb;
3719 ac.ac_inode = pa->pa_inode;
3720 ac.ac_op = EXT4_MB_HISTORY_DISCARD;
3721
3722 while (bit < end) {
3723 bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3724 if (bit >= end)
3725 break;
3726 next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
3727 if (next > end)
3728 next = end;
3729 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3730 le32_to_cpu(sbi->s_es->s_first_data_block);
3731 mb_debug(" free preallocated %u/%u in group %u\n",
3732 (unsigned) start, (unsigned) next - bit,
3733 (unsigned) group);
3734 free += next - bit;
3735
3736 ac.ac_b_ex.fe_group = group;
3737 ac.ac_b_ex.fe_start = bit;
3738 ac.ac_b_ex.fe_len = next - bit;
3739 ac.ac_b_ex.fe_logical = 0;
3740 ext4_mb_store_history(&ac);
3741
3742 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3743 bit = next + 1;
3744 }
3745 if (free != pa->pa_free) {
3746 printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
3747 pa, (unsigned long) pa->pa_lstart,
3748 (unsigned long) pa->pa_pstart,
3749 (unsigned long) pa->pa_len);
3750 printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
3751 }
3752 BUG_ON(free != pa->pa_free);
3753 atomic_add(free, &sbi->s_mb_discarded);
3754
3755 return err;
3756}
3757
3758static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3759 struct ext4_prealloc_space *pa)
3760{
3761 struct ext4_allocation_context ac;
3762 struct super_block *sb = e4b->bd_sb;
3763 ext4_group_t group;
3764 ext4_grpblk_t bit;
3765
3766 ac.ac_op = EXT4_MB_HISTORY_DISCARD;
3767
3768 BUG_ON(pa->pa_deleted == 0);
3769 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3770 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3771 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3772 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3773
3774 ac.ac_sb = sb;
3775 ac.ac_inode = NULL;
3776 ac.ac_b_ex.fe_group = group;
3777 ac.ac_b_ex.fe_start = bit;
3778 ac.ac_b_ex.fe_len = pa->pa_len;
3779 ac.ac_b_ex.fe_logical = 0;
3780 ext4_mb_store_history(&ac);
3781
3782 return 0;
3783}
3784
3785/*
3786 * releases all preallocations in given group
3787 *
3788 * first, we need to decide discard policy:
3789 * - when do we discard
3790 * 1) ENOSPC
3791 * - how many do we discard
3792 * 1) how many requested
3793 */
3794static int ext4_mb_discard_group_preallocations(struct super_block *sb,
3795 ext4_group_t group, int needed)
3796{
3797 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3798 struct buffer_head *bitmap_bh = NULL;
3799 struct ext4_prealloc_space *pa, *tmp;
3800 struct list_head list;
3801 struct ext4_buddy e4b;
3802 int err;
3803 int busy = 0;
3804 int free = 0;
3805
3806 mb_debug("discard preallocation for group %lu\n", group);
3807
3808 if (list_empty(&grp->bb_prealloc_list))
3809 return 0;
3810
3811 bitmap_bh = read_block_bitmap(sb, group);
3812 if (bitmap_bh == NULL) {
3813 /* error handling here */
3814 ext4_mb_release_desc(&e4b);
3815 BUG_ON(bitmap_bh == NULL);
3816 }
3817
3818 err = ext4_mb_load_buddy(sb, group, &e4b);
3819 BUG_ON(err != 0); /* error handling here */
3820
3821 if (needed == 0)
3822 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3823
3824 grp = ext4_get_group_info(sb, group);
3825 INIT_LIST_HEAD(&list);
3826
3827repeat:
3828 ext4_lock_group(sb, group);
3829 list_for_each_entry_safe(pa, tmp,
3830 &grp->bb_prealloc_list, pa_group_list) {
3831 spin_lock(&pa->pa_lock);
3832 if (atomic_read(&pa->pa_count)) {
3833 spin_unlock(&pa->pa_lock);
3834 busy = 1;
3835 continue;
3836 }
3837 if (pa->pa_deleted) {
3838 spin_unlock(&pa->pa_lock);
3839 continue;
3840 }
3841
3842 /* seems this one can be freed ... */
3843 pa->pa_deleted = 1;
3844
3845 /* we can trust pa_free ... */
3846 free += pa->pa_free;
3847
3848 spin_unlock(&pa->pa_lock);
3849
3850 list_del(&pa->pa_group_list);
3851 list_add(&pa->u.pa_tmp_list, &list);
3852 }
3853
3854 /* if we still need more blocks and some PAs were used, try again */
3855 if (free < needed && busy) {
3856 busy = 0;
3857 ext4_unlock_group(sb, group);
3858 /*
3859 * Yield the CPU here so that we don't get soft lockup
3860 * in non preempt case.
3861 */
3862 yield();
3863 goto repeat;
3864 }
3865
3866 /* found anything to free? */
3867 if (list_empty(&list)) {
3868 BUG_ON(free != 0);
3869 goto out;
3870 }
3871
3872 /* now free all selected PAs */
3873 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3874
3875 /* remove from object (inode or locality group) */
3876 spin_lock(pa->pa_obj_lock);
3877 list_del_rcu(&pa->pa_inode_list);
3878 spin_unlock(pa->pa_obj_lock);
3879
3880 if (pa->pa_linear)
3881 ext4_mb_release_group_pa(&e4b, pa);
3882 else
3883 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3884
3885 list_del(&pa->u.pa_tmp_list);
3886 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3887 }
3888
3889out:
3890 ext4_unlock_group(sb, group);
3891 ext4_mb_release_desc(&e4b);
3892 put_bh(bitmap_bh);
3893 return free;
3894}
3895
3896/*
3897 * releases all non-used preallocated blocks for given inode
3898 *
3899 * It's important to discard preallocations under i_data_sem
3900 * We don't want another block to be served from the prealloc
3901 * space when we are discarding the inode prealloc space.
3902 *
3903 * FIXME!! Make sure it is valid at all the call sites
3904 */
3905void ext4_mb_discard_inode_preallocations(struct inode *inode)
3906{
3907 struct ext4_inode_info *ei = EXT4_I(inode);
3908 struct super_block *sb = inode->i_sb;
3909 struct buffer_head *bitmap_bh = NULL;
3910 struct ext4_prealloc_space *pa, *tmp;
3911 ext4_group_t group = 0;
3912 struct list_head list;
3913 struct ext4_buddy e4b;
3914 int err;
3915
3916 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
3917 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3918 return;
3919 }
3920
3921 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
3922
3923 INIT_LIST_HEAD(&list);
3924
3925repeat:
3926 /* first, collect all pa's in the inode */
3927 spin_lock(&ei->i_prealloc_lock);
3928 while (!list_empty(&ei->i_prealloc_list)) {
3929 pa = list_entry(ei->i_prealloc_list.next,
3930 struct ext4_prealloc_space, pa_inode_list);
3931 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3932 spin_lock(&pa->pa_lock);
3933 if (atomic_read(&pa->pa_count)) {
3934 /* this shouldn't happen often - nobody should
3935 * use preallocation while we're discarding it */
3936 spin_unlock(&pa->pa_lock);
3937 spin_unlock(&ei->i_prealloc_lock);
3938 printk(KERN_ERR "uh-oh! used pa while discarding\n");
3939 WARN_ON(1);
3940 schedule_timeout_uninterruptible(HZ);
3941 goto repeat;
3942
3943 }
3944 if (pa->pa_deleted == 0) {
3945 pa->pa_deleted = 1;
3946 spin_unlock(&pa->pa_lock);
3947 list_del_rcu(&pa->pa_inode_list);
3948 list_add(&pa->u.pa_tmp_list, &list);
3949 continue;
3950 }
3951
3952 /* someone is deleting pa right now */
3953 spin_unlock(&pa->pa_lock);
3954 spin_unlock(&ei->i_prealloc_lock);
3955
3956 /* we have to wait here because pa_deleted
3957 * doesn't mean pa is already unlinked from
3958 * the list. as we might be called from
3959 * ->clear_inode() the inode will get freed
3960 * and concurrent thread which is unlinking
3961 * pa from inode's list may access already
3962 * freed memory, bad-bad-bad */
3963
3964 /* XXX: if this happens too often, we can
3965 * add a flag to force wait only in case
3966 * of ->clear_inode(), but not in case of
3967 * regular truncate */
3968 schedule_timeout_uninterruptible(HZ);
3969 goto repeat;
3970 }
3971 spin_unlock(&ei->i_prealloc_lock);
3972
3973 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3974 BUG_ON(pa->pa_linear != 0);
3975 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
3976
3977 err = ext4_mb_load_buddy(sb, group, &e4b);
3978 BUG_ON(err != 0); /* error handling here */
3979
3980 bitmap_bh = read_block_bitmap(sb, group);
3981 if (bitmap_bh == NULL) {
3982 /* error handling here */
3983 ext4_mb_release_desc(&e4b);
3984 BUG_ON(bitmap_bh == NULL);
3985 }
3986
3987 ext4_lock_group(sb, group);
3988 list_del(&pa->pa_group_list);
3989 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3990 ext4_unlock_group(sb, group);
3991
3992 ext4_mb_release_desc(&e4b);
3993 put_bh(bitmap_bh);
3994
3995 list_del(&pa->u.pa_tmp_list);
3996 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3997 }
3998}
3999
4000/*
4001 * finds all preallocated spaces and return blocks being freed to them
4002 * if preallocated space becomes full (no block is used from the space)
4003 * then the function frees space in buddy
4004 * XXX: at the moment, truncate (which is the only way to free blocks)
4005 * discards all preallocations
4006 */
4007static void ext4_mb_return_to_preallocation(struct inode *inode,
4008 struct ext4_buddy *e4b,
4009 sector_t block, int count)
4010{
4011 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4012}
4013#ifdef MB_DEBUG
4014static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4015{
4016 struct super_block *sb = ac->ac_sb;
4017 ext4_group_t i;
4018
4019 printk(KERN_ERR "EXT4-fs: Can't allocate:"
4020 " Allocation context details:\n");
4021 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
4022 ac->ac_status, ac->ac_flags);
4023 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
4024 "best %lu/%lu/%lu@%lu cr %d\n",
4025 (unsigned long)ac->ac_o_ex.fe_group,
4026 (unsigned long)ac->ac_o_ex.fe_start,
4027 (unsigned long)ac->ac_o_ex.fe_len,
4028 (unsigned long)ac->ac_o_ex.fe_logical,
4029 (unsigned long)ac->ac_g_ex.fe_group,
4030 (unsigned long)ac->ac_g_ex.fe_start,
4031 (unsigned long)ac->ac_g_ex.fe_len,
4032 (unsigned long)ac->ac_g_ex.fe_logical,
4033 (unsigned long)ac->ac_b_ex.fe_group,
4034 (unsigned long)ac->ac_b_ex.fe_start,
4035 (unsigned long)ac->ac_b_ex.fe_len,
4036 (unsigned long)ac->ac_b_ex.fe_logical,
4037 (int)ac->ac_criteria);
4038 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
4039 ac->ac_found);
4040 printk(KERN_ERR "EXT4-fs: groups: \n");
4041 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
4042 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4043 struct ext4_prealloc_space *pa;
4044 ext4_grpblk_t start;
4045 struct list_head *cur;
4046 ext4_lock_group(sb, i);
4047 list_for_each(cur, &grp->bb_prealloc_list) {
4048 pa = list_entry(cur, struct ext4_prealloc_space,
4049 pa_group_list);
4050 spin_lock(&pa->pa_lock);
4051 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4052 NULL, &start);
4053 spin_unlock(&pa->pa_lock);
4054 printk(KERN_ERR "PA:%lu:%d:%u \n", i,
4055 start, pa->pa_len);
4056 }
4057 ext4_lock_group(sb, i);
4058
4059 if (grp->bb_free == 0)
4060 continue;
4061 printk(KERN_ERR "%lu: %d/%d \n",
4062 i, grp->bb_free, grp->bb_fragments);
4063 }
4064 printk(KERN_ERR "\n");
4065}
4066#else
4067static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4068{
4069 return;
4070}
4071#endif
4072
4073/*
4074 * We use locality group preallocation for small size file. The size of the
4075 * file is determined by the current size or the resulting size after
4076 * allocation which ever is larger
4077 *
4078 * One can tune this size via /proc/fs/ext4/<partition>/stream_req
4079 */
4080static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4081{
4082 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4083 int bsbits = ac->ac_sb->s_blocksize_bits;
4084 loff_t size, isize;
4085
4086 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4087 return;
4088
4089 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4090 isize = i_size_read(ac->ac_inode) >> bsbits;
4091 size = max(size, isize);
4092
4093 /* don't use group allocation for large files */
4094 if (size >= sbi->s_mb_stream_request)
4095 return;
4096
4097 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4098 return;
4099
4100 BUG_ON(ac->ac_lg != NULL);
4101 /*
4102 * locality group prealloc space are per cpu. The reason for having
4103 * per cpu locality group is to reduce the contention between block
4104 * request from multiple CPUs.
4105 */
4106 ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
4107 put_cpu();
4108
4109 /* we're going to use group allocation */
4110 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4111
4112 /* serialize all allocations in the group */
4113 mutex_lock(&ac->ac_lg->lg_mutex);
4114}
4115
4116static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4117 struct ext4_allocation_request *ar)
4118{
4119 struct super_block *sb = ar->inode->i_sb;
4120 struct ext4_sb_info *sbi = EXT4_SB(sb);
4121 struct ext4_super_block *es = sbi->s_es;
4122 ext4_group_t group;
4123 unsigned long len;
4124 unsigned long goal;
4125 ext4_grpblk_t block;
4126
4127 /* we can't allocate > group size */
4128 len = ar->len;
4129
4130 /* just a dirty hack to filter too big requests */
4131 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
4132 len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
4133
4134 /* start searching from the goal */
4135 goal = ar->goal;
4136 if (goal < le32_to_cpu(es->s_first_data_block) ||
4137 goal >= ext4_blocks_count(es))
4138 goal = le32_to_cpu(es->s_first_data_block);
4139 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4140
4141 /* set up allocation goals */
4142 ac->ac_b_ex.fe_logical = ar->logical;
4143 ac->ac_b_ex.fe_group = 0;
4144 ac->ac_b_ex.fe_start = 0;
4145 ac->ac_b_ex.fe_len = 0;
4146 ac->ac_status = AC_STATUS_CONTINUE;
4147 ac->ac_groups_scanned = 0;
4148 ac->ac_ex_scanned = 0;
4149 ac->ac_found = 0;
4150 ac->ac_sb = sb;
4151 ac->ac_inode = ar->inode;
4152 ac->ac_o_ex.fe_logical = ar->logical;
4153 ac->ac_o_ex.fe_group = group;
4154 ac->ac_o_ex.fe_start = block;
4155 ac->ac_o_ex.fe_len = len;
4156 ac->ac_g_ex.fe_logical = ar->logical;
4157 ac->ac_g_ex.fe_group = group;
4158 ac->ac_g_ex.fe_start = block;
4159 ac->ac_g_ex.fe_len = len;
4160 ac->ac_f_ex.fe_len = 0;
4161 ac->ac_flags = ar->flags;
4162 ac->ac_2order = 0;
4163 ac->ac_criteria = 0;
4164 ac->ac_pa = NULL;
4165 ac->ac_bitmap_page = NULL;
4166 ac->ac_buddy_page = NULL;
4167 ac->ac_lg = NULL;
4168
4169 /* we have to define context: we'll we work with a file or
4170 * locality group. this is a policy, actually */
4171 ext4_mb_group_or_file(ac);
4172
4173 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4174 "left: %u/%u, right %u/%u to %swritable\n",
4175 (unsigned) ar->len, (unsigned) ar->logical,
4176 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4177 (unsigned) ar->lleft, (unsigned) ar->pleft,
4178 (unsigned) ar->lright, (unsigned) ar->pright,
4179 atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4180 return 0;
4181
4182}
4183
4184/*
4185 * release all resource we used in allocation
4186 */
4187static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4188{
4189 if (ac->ac_pa) {
4190 if (ac->ac_pa->pa_linear) {
4191 /* see comment in ext4_mb_use_group_pa() */
4192 spin_lock(&ac->ac_pa->pa_lock);
4193 ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
4194 ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
4195 ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
4196 ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
4197 spin_unlock(&ac->ac_pa->pa_lock);
4198 }
4199 ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
4200 }
4201 if (ac->ac_bitmap_page)
4202 page_cache_release(ac->ac_bitmap_page);
4203 if (ac->ac_buddy_page)
4204 page_cache_release(ac->ac_buddy_page);
4205 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4206 mutex_unlock(&ac->ac_lg->lg_mutex);
4207 ext4_mb_collect_stats(ac);
4208 return 0;
4209}
4210
4211static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4212{
4213 ext4_group_t i;
4214 int ret;
4215 int freed = 0;
4216
4217 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4218 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4219 freed += ret;
4220 needed -= ret;
4221 }
4222
4223 return freed;
4224}
4225
4226/*
4227 * Main entry point into mballoc to allocate blocks
4228 * it tries to use preallocation first, then falls back
4229 * to usual allocation
4230 */
4231ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4232 struct ext4_allocation_request *ar, int *errp)
4233{
4234 struct ext4_allocation_context ac;
4235 struct ext4_sb_info *sbi;
4236 struct super_block *sb;
4237 ext4_fsblk_t block = 0;
4238 int freed;
4239 int inquota;
4240
4241 sb = ar->inode->i_sb;
4242 sbi = EXT4_SB(sb);
4243
4244 if (!test_opt(sb, MBALLOC)) {
4245 block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
4246 &(ar->len), errp);
4247 return block;
4248 }
4249
4250 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4251 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4252 ar->len--;
4253 }
4254 if (ar->len == 0) {
4255 *errp = -EDQUOT;
4256 return 0;
4257 }
4258 inquota = ar->len;
4259
4260 ext4_mb_poll_new_transaction(sb, handle);
4261
4262 *errp = ext4_mb_initialize_context(&ac, ar);
4263 if (*errp) {
4264 ar->len = 0;
4265 goto out;
4266 }
4267
4268 ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
4269 if (!ext4_mb_use_preallocated(&ac)) {
4270
4271 ac.ac_op = EXT4_MB_HISTORY_ALLOC;
4272 ext4_mb_normalize_request(&ac, ar);
4273
4274repeat:
4275 /* allocate space in core */
4276 ext4_mb_regular_allocator(&ac);
4277
4278 /* as we've just preallocated more space than
4279 * user requested orinally, we store allocated
4280 * space in a special descriptor */
4281 if (ac.ac_status == AC_STATUS_FOUND &&
4282 ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
4283 ext4_mb_new_preallocation(&ac);
4284 }
4285
4286 if (likely(ac.ac_status == AC_STATUS_FOUND)) {
4287 ext4_mb_mark_diskspace_used(&ac, handle);
4288 *errp = 0;
4289 block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
4290 ar->len = ac.ac_b_ex.fe_len;
4291 } else {
4292 freed = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
4293 if (freed)
4294 goto repeat;
4295 *errp = -ENOSPC;
4296 ac.ac_b_ex.fe_len = 0;
4297 ar->len = 0;
4298 ext4_mb_show_ac(&ac);
4299 }
4300
4301 ext4_mb_release_context(&ac);
4302
4303out:
4304 if (ar->len < inquota)
4305 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4306
4307 return block;
4308}
4309static void ext4_mb_poll_new_transaction(struct super_block *sb,
4310 handle_t *handle)
4311{
4312 struct ext4_sb_info *sbi = EXT4_SB(sb);
4313
4314 if (sbi->s_last_transaction == handle->h_transaction->t_tid)
4315 return;
4316
4317 /* new transaction! time to close last one and free blocks for
4318 * committed transaction. we know that only transaction can be
4319 * active, so previos transaction can be being logged and we
4320 * know that transaction before previous is known to be already
4321 * logged. this means that now we may free blocks freed in all
4322 * transactions before previous one. hope I'm clear enough ... */
4323
4324 spin_lock(&sbi->s_md_lock);
4325 if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
4326 mb_debug("new transaction %lu, old %lu\n",
4327 (unsigned long) handle->h_transaction->t_tid,
4328 (unsigned long) sbi->s_last_transaction);
4329 list_splice_init(&sbi->s_closed_transaction,
4330 &sbi->s_committed_transaction);
4331 list_splice_init(&sbi->s_active_transaction,
4332 &sbi->s_closed_transaction);
4333 sbi->s_last_transaction = handle->h_transaction->t_tid;
4334 }
4335 spin_unlock(&sbi->s_md_lock);
4336
4337 ext4_mb_free_committed_blocks(sb);
4338}
4339
4340static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4341 ext4_group_t group, ext4_grpblk_t block, int count)
4342{
4343 struct ext4_group_info *db = e4b->bd_info;
4344 struct super_block *sb = e4b->bd_sb;
4345 struct ext4_sb_info *sbi = EXT4_SB(sb);
4346 struct ext4_free_metadata *md;
4347 int i;
4348
4349 BUG_ON(e4b->bd_bitmap_page == NULL);
4350 BUG_ON(e4b->bd_buddy_page == NULL);
4351
4352 ext4_lock_group(sb, group);
4353 for (i = 0; i < count; i++) {
4354 md = db->bb_md_cur;
4355 if (md && db->bb_tid != handle->h_transaction->t_tid) {
4356 db->bb_md_cur = NULL;
4357 md = NULL;
4358 }
4359
4360 if (md == NULL) {
4361 ext4_unlock_group(sb, group);
4362 md = kmalloc(sizeof(*md), GFP_NOFS);
4363 if (md == NULL)
4364 return -ENOMEM;
4365 md->num = 0;
4366 md->group = group;
4367
4368 ext4_lock_group(sb, group);
4369 if (db->bb_md_cur == NULL) {
4370 spin_lock(&sbi->s_md_lock);
4371 list_add(&md->list, &sbi->s_active_transaction);
4372 spin_unlock(&sbi->s_md_lock);
4373 /* protect buddy cache from being freed,
4374 * otherwise we'll refresh it from
4375 * on-disk bitmap and lose not-yet-available
4376 * blocks */
4377 page_cache_get(e4b->bd_buddy_page);
4378 page_cache_get(e4b->bd_bitmap_page);
4379 db->bb_md_cur = md;
4380 db->bb_tid = handle->h_transaction->t_tid;
4381 mb_debug("new md 0x%p for group %lu\n",
4382 md, md->group);
4383 } else {
4384 kfree(md);
4385 md = db->bb_md_cur;
4386 }
4387 }
4388
4389 BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
4390 md->blocks[md->num] = block + i;
4391 md->num++;
4392 if (md->num == EXT4_BB_MAX_BLOCKS) {
4393 /* no more space, put full container on a sb's list */
4394 db->bb_md_cur = NULL;
4395 }
4396 }
4397 ext4_unlock_group(sb, group);
4398 return 0;
4399}
4400
4401/*
4402 * Main entry point into mballoc to free blocks
4403 */
4404void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4405 unsigned long block, unsigned long count,
4406 int metadata, unsigned long *freed)
4407{
4408 struct buffer_head *bitmap_bh = 0;
4409 struct super_block *sb = inode->i_sb;
4410 struct ext4_allocation_context ac;
4411 struct ext4_group_desc *gdp;
4412 struct ext4_super_block *es;
4413 unsigned long overflow;
4414 ext4_grpblk_t bit;
4415 struct buffer_head *gd_bh;
4416 ext4_group_t block_group;
4417 struct ext4_sb_info *sbi;
4418 struct ext4_buddy e4b;
4419 int err = 0;
4420 int ret;
4421
4422 *freed = 0;
4423
4424 ext4_mb_poll_new_transaction(sb, handle);
4425
4426 sbi = EXT4_SB(sb);
4427 es = EXT4_SB(sb)->s_es;
4428 if (block < le32_to_cpu(es->s_first_data_block) ||
4429 block + count < block ||
4430 block + count > ext4_blocks_count(es)) {
4431 ext4_error(sb, __FUNCTION__,
4432 "Freeing blocks not in datazone - "
4433 "block = %lu, count = %lu", block, count);
4434 goto error_return;
4435 }
4436
4437 ext4_debug("freeing block %lu\n", block);
4438
4439 ac.ac_op = EXT4_MB_HISTORY_FREE;
4440 ac.ac_inode = inode;
4441 ac.ac_sb = sb;
4442
4443do_more:
4444 overflow = 0;
4445 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4446
4447 /*
4448 * Check to see if we are freeing blocks across a group
4449 * boundary.
4450 */
4451 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4452 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4453 count -= overflow;
4454 }
4455 bitmap_bh = read_block_bitmap(sb, block_group);
4456 if (!bitmap_bh)
4457 goto error_return;
4458 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4459 if (!gdp)
4460 goto error_return;
4461
4462 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4463 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4464 in_range(block, ext4_inode_table(sb, gdp),
4465 EXT4_SB(sb)->s_itb_per_group) ||
4466 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4467 EXT4_SB(sb)->s_itb_per_group)) {
4468
4469 ext4_error(sb, __FUNCTION__,
4470 "Freeing blocks in system zone - "
4471 "Block = %lu, count = %lu", block, count);
4472 }
4473
4474 BUFFER_TRACE(bitmap_bh, "getting write access");
4475 err = ext4_journal_get_write_access(handle, bitmap_bh);
4476 if (err)
4477 goto error_return;
4478
4479 /*
4480 * We are about to modify some metadata. Call the journal APIs
4481 * to unshare ->b_data if a currently-committing transaction is
4482 * using it
4483 */
4484 BUFFER_TRACE(gd_bh, "get_write_access");
4485 err = ext4_journal_get_write_access(handle, gd_bh);
4486 if (err)
4487 goto error_return;
4488
4489 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4490 if (err)
4491 goto error_return;
4492
4493#ifdef AGGRESSIVE_CHECK
4494 {
4495 int i;
4496 for (i = 0; i < count; i++)
4497 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4498 }
4499#endif
4500 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4501 bit, count);
4502
4503 /* We dirtied the bitmap block */
4504 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4505 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4506
4507 ac.ac_b_ex.fe_group = block_group;
4508 ac.ac_b_ex.fe_start = bit;
4509 ac.ac_b_ex.fe_len = count;
4510 ext4_mb_store_history(&ac);
4511
4512 if (metadata) {
4513 /* blocks being freed are metadata. these blocks shouldn't
4514 * be used until this transaction is committed */
4515 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
4516 } else {
4517 ext4_lock_group(sb, block_group);
4518 err = mb_free_blocks(inode, &e4b, bit, count);
4519 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4520 ext4_unlock_group(sb, block_group);
4521 BUG_ON(err != 0);
4522 }
4523
4524 spin_lock(sb_bgl_lock(sbi, block_group));
4525 gdp->bg_free_blocks_count =
4526 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
4527 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4528 spin_unlock(sb_bgl_lock(sbi, block_group));
4529 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4530
4531 ext4_mb_release_desc(&e4b);
4532
4533 *freed += count;
4534
4535 /* And the group descriptor block */
4536 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4537 ret = ext4_journal_dirty_metadata(handle, gd_bh);
4538 if (!err)
4539 err = ret;
4540
4541 if (overflow && !err) {
4542 block += count;
4543 count = overflow;
4544 put_bh(bitmap_bh);
4545 goto do_more;
4546 }
4547 sb->s_dirt = 1;
4548error_return:
4549 brelse(bitmap_bh);
4550 ext4_std_error(sb, err);
4551 return;
4552}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
new file mode 100644
index 000000000000..3ebc2332f52e
--- /dev/null
+++ b/fs/ext4/migrate.c
@@ -0,0 +1,560 @@
1/*
2 * Copyright IBM Corporation, 2007
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/ext4_jbd2.h>
17#include <linux/ext4_fs_extents.h>
18
19/*
20 * The contiguous blocks details which can be
21 * represented by a single extent
22 */
23struct list_blocks_struct {
24 ext4_lblk_t first_block, last_block;
25 ext4_fsblk_t first_pblock, last_pblock;
26};
27
28static int finish_range(handle_t *handle, struct inode *inode,
29 struct list_blocks_struct *lb)
30
31{
32 int retval = 0, needed;
33 struct ext4_extent newext;
34 struct ext4_ext_path *path;
35 if (lb->first_pblock == 0)
36 return 0;
37
38 /* Add the extent to temp inode*/
39 newext.ee_block = cpu_to_le32(lb->first_block);
40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
41 ext4_ext_store_pblock(&newext, lb->first_pblock);
42 path = ext4_ext_find_extent(inode, lb->first_block, NULL);
43
44 if (IS_ERR(path)) {
45 retval = PTR_ERR(path);
46 goto err_out;
47 }
48
49 /*
50 * Calculate the credit needed to inserting this extent
51 * Since we are doing this in loop we may accumalate extra
52 * credit. But below we try to not accumalate too much
53 * of them by restarting the journal.
54 */
55 needed = ext4_ext_calc_credits_for_insert(inode, path);
56
57 /*
58 * Make sure the credit we accumalated is not really high
59 */
60 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
61 retval = ext4_journal_restart(handle, needed);
62 if (retval)
63 goto err_out;
64 }
65 if (needed) {
66 retval = ext4_journal_extend(handle, needed);
67 if (retval != 0) {
68 /*
69 * IF not able to extend the journal restart the journal
70 */
71 retval = ext4_journal_restart(handle, needed);
72 if (retval)
73 goto err_out;
74 }
75 }
76 retval = ext4_ext_insert_extent(handle, inode, path, &newext);
77err_out:
78 lb->first_pblock = 0;
79 return retval;
80}
81
82static int update_extent_range(handle_t *handle, struct inode *inode,
83 ext4_fsblk_t pblock, ext4_lblk_t blk_num,
84 struct list_blocks_struct *lb)
85{
86 int retval;
87 /*
88 * See if we can add on to the existing range (if it exists)
89 */
90 if (lb->first_pblock &&
91 (lb->last_pblock+1 == pblock) &&
92 (lb->last_block+1 == blk_num)) {
93 lb->last_pblock = pblock;
94 lb->last_block = blk_num;
95 return 0;
96 }
97 /*
98 * Start a new range.
99 */
100 retval = finish_range(handle, inode, lb);
101 lb->first_pblock = lb->last_pblock = pblock;
102 lb->first_block = lb->last_block = blk_num;
103
104 return retval;
105}
106
107static int update_ind_extent_range(handle_t *handle, struct inode *inode,
108 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
109 struct list_blocks_struct *lb)
110{
111 struct buffer_head *bh;
112 __le32 *i_data;
113 int i, retval = 0;
114 ext4_lblk_t blk_count = *blk_nump;
115 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
116
117 if (!pblock) {
118 /* Only update the file block number */
119 *blk_nump += max_entries;
120 return 0;
121 }
122
123 bh = sb_bread(inode->i_sb, pblock);
124 if (!bh)
125 return -EIO;
126
127 i_data = (__le32 *)bh->b_data;
128 for (i = 0; i < max_entries; i++, blk_count++) {
129 if (i_data[i]) {
130 retval = update_extent_range(handle, inode,
131 le32_to_cpu(i_data[i]),
132 blk_count, lb);
133 if (retval)
134 break;
135 }
136 }
137
138 /* Update the file block number */
139 *blk_nump = blk_count;
140 put_bh(bh);
141 return retval;
142
143}
144
145static int update_dind_extent_range(handle_t *handle, struct inode *inode,
146 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
147 struct list_blocks_struct *lb)
148{
149 struct buffer_head *bh;
150 __le32 *i_data;
151 int i, retval = 0;
152 ext4_lblk_t blk_count = *blk_nump;
153 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
154
155 if (!pblock) {
156 /* Only update the file block number */
157 *blk_nump += max_entries * max_entries;
158 return 0;
159 }
160 bh = sb_bread(inode->i_sb, pblock);
161 if (!bh)
162 return -EIO;
163
164 i_data = (__le32 *)bh->b_data;
165 for (i = 0; i < max_entries; i++) {
166 if (i_data[i]) {
167 retval = update_ind_extent_range(handle, inode,
168 le32_to_cpu(i_data[i]),
169 &blk_count, lb);
170 if (retval)
171 break;
172 } else {
173 /* Only update the file block number */
174 blk_count += max_entries;
175 }
176 }
177
178 /* Update the file block number */
179 *blk_nump = blk_count;
180 put_bh(bh);
181 return retval;
182
183}
184
185static int update_tind_extent_range(handle_t *handle, struct inode *inode,
186 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
187 struct list_blocks_struct *lb)
188{
189 struct buffer_head *bh;
190 __le32 *i_data;
191 int i, retval = 0;
192 ext4_lblk_t blk_count = *blk_nump;
193 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
194
195 if (!pblock) {
196 /* Only update the file block number */
197 *blk_nump += max_entries * max_entries * max_entries;
198 return 0;
199 }
200 bh = sb_bread(inode->i_sb, pblock);
201 if (!bh)
202 return -EIO;
203
204 i_data = (__le32 *)bh->b_data;
205 for (i = 0; i < max_entries; i++) {
206 if (i_data[i]) {
207 retval = update_dind_extent_range(handle, inode,
208 le32_to_cpu(i_data[i]),
209 &blk_count, lb);
210 if (retval)
211 break;
212 } else
213 /* Only update the file block number */
214 blk_count += max_entries * max_entries;
215 }
216 /* Update the file block number */
217 *blk_nump = blk_count;
218 put_bh(bh);
219 return retval;
220
221}
222
223static int free_dind_blocks(handle_t *handle,
224 struct inode *inode, __le32 i_data)
225{
226 int i;
227 __le32 *tmp_idata;
228 struct buffer_head *bh;
229 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
230
231 bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
232 if (!bh)
233 return -EIO;
234
235 tmp_idata = (__le32 *)bh->b_data;
236 for (i = 0; i < max_entries; i++) {
237 if (tmp_idata[i])
238 ext4_free_blocks(handle, inode,
239 le32_to_cpu(tmp_idata[i]), 1, 1);
240 }
241 put_bh(bh);
242 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
243 return 0;
244}
245
246static int free_tind_blocks(handle_t *handle,
247 struct inode *inode, __le32 i_data)
248{
249 int i, retval = 0;
250 __le32 *tmp_idata;
251 struct buffer_head *bh;
252 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
253
254 bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
255 if (!bh)
256 return -EIO;
257
258 tmp_idata = (__le32 *)bh->b_data;
259 for (i = 0; i < max_entries; i++) {
260 if (tmp_idata[i]) {
261 retval = free_dind_blocks(handle,
262 inode, tmp_idata[i]);
263 if (retval) {
264 put_bh(bh);
265 return retval;
266 }
267 }
268 }
269 put_bh(bh);
270 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
271 return 0;
272}
273
274static int free_ind_block(handle_t *handle, struct inode *inode)
275{
276 int retval;
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (ei->i_data[EXT4_IND_BLOCK])
280 ext4_free_blocks(handle, inode,
281 le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
282
283 if (ei->i_data[EXT4_DIND_BLOCK]) {
284 retval = free_dind_blocks(handle, inode,
285 ei->i_data[EXT4_DIND_BLOCK]);
286 if (retval)
287 return retval;
288 }
289
290 if (ei->i_data[EXT4_TIND_BLOCK]) {
291 retval = free_tind_blocks(handle, inode,
292 ei->i_data[EXT4_TIND_BLOCK]);
293 if (retval)
294 return retval;
295 }
296 return 0;
297}
298
299static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
300 struct inode *tmp_inode, int retval)
301{
302 struct ext4_inode_info *ei = EXT4_I(inode);
303 struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
304
305 retval = free_ind_block(handle, inode);
306 if (retval)
307 goto err_out;
308
309 /*
310 * One credit accounted for writing the
311 * i_data field of the original inode
312 */
313 retval = ext4_journal_extend(handle, 1);
314 if (retval != 0) {
315 retval = ext4_journal_restart(handle, 1);
316 if (retval)
317 goto err_out;
318 }
319
320 /*
321 * We have the extent map build with the tmp inode.
322 * Now copy the i_data across
323 */
324 ei->i_flags |= EXT4_EXTENTS_FL;
325 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
326
327 /*
328 * Update i_blocks with the new blocks that got
329 * allocated while adding extents for extent index
330 * blocks.
331 *
332 * While converting to extents we need not
333 * update the orignal inode i_blocks for extent blocks
334 * via quota APIs. The quota update happened via tmp_inode already.
335 */
336 spin_lock(&inode->i_lock);
337 inode->i_blocks += tmp_inode->i_blocks;
338 spin_unlock(&inode->i_lock);
339
340 ext4_mark_inode_dirty(handle, inode);
341err_out:
342 return retval;
343}
344
345static int free_ext_idx(handle_t *handle, struct inode *inode,
346 struct ext4_extent_idx *ix)
347{
348 int i, retval = 0;
349 ext4_fsblk_t block;
350 struct buffer_head *bh;
351 struct ext4_extent_header *eh;
352
353 block = idx_pblock(ix);
354 bh = sb_bread(inode->i_sb, block);
355 if (!bh)
356 return -EIO;
357
358 eh = (struct ext4_extent_header *)bh->b_data;
359 if (eh->eh_depth != 0) {
360 ix = EXT_FIRST_INDEX(eh);
361 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
362 retval = free_ext_idx(handle, inode, ix);
363 if (retval)
364 break;
365 }
366 }
367 put_bh(bh);
368 ext4_free_blocks(handle, inode, block, 1, 1);
369 return retval;
370}
371
372/*
373 * Free the extent meta data blocks only
374 */
375static int free_ext_block(handle_t *handle, struct inode *inode)
376{
377 int i, retval = 0;
378 struct ext4_inode_info *ei = EXT4_I(inode);
379 struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data;
380 struct ext4_extent_idx *ix;
381 if (eh->eh_depth == 0)
382 /*
383 * No extra blocks allocated for extent meta data
384 */
385 return 0;
386 ix = EXT_FIRST_INDEX(eh);
387 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
388 retval = free_ext_idx(handle, inode, ix);
389 if (retval)
390 return retval;
391 }
392 return retval;
393
394}
395
396int ext4_ext_migrate(struct inode *inode, struct file *filp,
397 unsigned int cmd, unsigned long arg)
398{
399 handle_t *handle;
400 int retval = 0, i;
401 __le32 *i_data;
402 ext4_lblk_t blk_count = 0;
403 struct ext4_inode_info *ei;
404 struct inode *tmp_inode = NULL;
405 struct list_blocks_struct lb;
406 unsigned long max_entries;
407
408 if (!test_opt(inode->i_sb, EXTENTS))
409 /*
410 * if mounted with noextents we don't allow the migrate
411 */
412 return -EINVAL;
413
414 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
415 return -EINVAL;
416
417 down_write(&EXT4_I(inode)->i_data_sem);
418 handle = ext4_journal_start(inode,
419 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
420 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
421 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
422 + 1);
423 if (IS_ERR(handle)) {
424 retval = PTR_ERR(handle);
425 goto err_out;
426 }
427 tmp_inode = ext4_new_inode(handle,
428 inode->i_sb->s_root->d_inode,
429 S_IFREG);
430 if (IS_ERR(tmp_inode)) {
431 retval = -ENOMEM;
432 ext4_journal_stop(handle);
433 tmp_inode = NULL;
434 goto err_out;
435 }
436 i_size_write(tmp_inode, i_size_read(inode));
437 /*
438 * We don't want the inode to be reclaimed
439 * if we got interrupted in between. We have
440 * this tmp inode carrying reference to the
441 * data blocks of the original file. We set
442 * the i_nlink to zero at the last stage after
443 * switching the original file to extent format
444 */
445 tmp_inode->i_nlink = 1;
446
447 ext4_ext_tree_init(handle, tmp_inode);
448 ext4_orphan_add(handle, tmp_inode);
449 ext4_journal_stop(handle);
450
451 ei = EXT4_I(inode);
452 i_data = ei->i_data;
453 memset(&lb, 0, sizeof(lb));
454
455 /* 32 bit block address 4 bytes */
456 max_entries = inode->i_sb->s_blocksize >> 2;
457
458 /*
459 * start with one credit accounted for
460 * superblock modification.
461 *
462 * For the tmp_inode we already have commited the
463 * trascation that created the inode. Later as and
464 * when we add extents we extent the journal
465 */
466 handle = ext4_journal_start(inode, 1);
467 for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
468 if (i_data[i]) {
469 retval = update_extent_range(handle, tmp_inode,
470 le32_to_cpu(i_data[i]),
471 blk_count, &lb);
472 if (retval)
473 goto err_out;
474 }
475 }
476 if (i_data[EXT4_IND_BLOCK]) {
477 retval = update_ind_extent_range(handle, tmp_inode,
478 le32_to_cpu(i_data[EXT4_IND_BLOCK]),
479 &blk_count, &lb);
480 if (retval)
481 goto err_out;
482 } else
483 blk_count += max_entries;
484 if (i_data[EXT4_DIND_BLOCK]) {
485 retval = update_dind_extent_range(handle, tmp_inode,
486 le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
487 &blk_count, &lb);
488 if (retval)
489 goto err_out;
490 } else
491 blk_count += max_entries * max_entries;
492 if (i_data[EXT4_TIND_BLOCK]) {
493 retval = update_tind_extent_range(handle, tmp_inode,
494 le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
495 &blk_count, &lb);
496 if (retval)
497 goto err_out;
498 }
499 /*
500 * Build the last extent
501 */
502 retval = finish_range(handle, tmp_inode, &lb);
503err_out:
504 /*
505 * We are either freeing extent information or indirect
506 * blocks. During this we touch superblock, group descriptor
507 * and block bitmap. Later we mark the tmp_inode dirty
508 * via ext4_ext_tree_init. So allocate a credit of 4
509 * We may update quota (user and group).
510 *
511 * FIXME!! we may be touching bitmaps in different block groups.
512 */
513 if (ext4_journal_extend(handle,
514 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0)
515 ext4_journal_restart(handle,
516 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
517 if (retval)
518 /*
519 * Failure case delete the extent information with the
520 * tmp_inode
521 */
522 free_ext_block(handle, tmp_inode);
523 else
524 retval = ext4_ext_swap_inode_data(handle, inode,
525 tmp_inode, retval);
526
527 /*
528 * Mark the tmp_inode as of size zero
529 */
530 i_size_write(tmp_inode, 0);
531
532 /*
533 * set the i_blocks count to zero
534 * so that the ext4_delete_inode does the
535 * right job
536 *
537 * We don't need to take the i_lock because
538 * the inode is not visible to user space.
539 */
540 tmp_inode->i_blocks = 0;
541
542 /* Reset the extent details */
543 ext4_ext_tree_init(handle, tmp_inode);
544
545 /*
546 * Set the i_nlink to zero so that
547 * generic_drop_inode really deletes the
548 * inode
549 */
550 tmp_inode->i_nlink = 0;
551
552 ext4_journal_stop(handle);
553
554 up_write(&EXT4_I(inode)->i_data_sem);
555
556 if (tmp_inode)
557 iput(tmp_inode);
558
559 return retval;
560}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 94ee6f315dc1..67b6d8a1ceff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -51,7 +51,7 @@
51 51
52static struct buffer_head *ext4_append(handle_t *handle, 52static struct buffer_head *ext4_append(handle_t *handle,
53 struct inode *inode, 53 struct inode *inode,
54 u32 *block, int *err) 54 ext4_lblk_t *block, int *err)
55{ 55{
56 struct buffer_head *bh; 56 struct buffer_head *bh;
57 57
@@ -144,8 +144,8 @@ struct dx_map_entry
144 u16 size; 144 u16 size;
145}; 145};
146 146
147static inline unsigned dx_get_block (struct dx_entry *entry); 147static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
148static void dx_set_block (struct dx_entry *entry, unsigned value); 148static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
149static inline unsigned dx_get_hash (struct dx_entry *entry); 149static inline unsigned dx_get_hash (struct dx_entry *entry);
150static void dx_set_hash (struct dx_entry *entry, unsigned value); 150static void dx_set_hash (struct dx_entry *entry, unsigned value);
151static unsigned dx_get_count (struct dx_entry *entries); 151static unsigned dx_get_count (struct dx_entry *entries);
@@ -166,7 +166,8 @@ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
166static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 166static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
167 struct dx_map_entry *offsets, int count); 167 struct dx_map_entry *offsets, int count);
168static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 168static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
169static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); 169static void dx_insert_block(struct dx_frame *frame,
170 u32 hash, ext4_lblk_t block);
170static int ext4_htree_next_block(struct inode *dir, __u32 hash, 171static int ext4_htree_next_block(struct inode *dir, __u32 hash,
171 struct dx_frame *frame, 172 struct dx_frame *frame,
172 struct dx_frame *frames, 173 struct dx_frame *frames,
@@ -181,12 +182,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 * Mask them off for now. 182 * Mask them off for now.
182 */ 183 */
183 184
184static inline unsigned dx_get_block (struct dx_entry *entry) 185static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
185{ 186{
186 return le32_to_cpu(entry->block) & 0x00ffffff; 187 return le32_to_cpu(entry->block) & 0x00ffffff;
187} 188}
188 189
189static inline void dx_set_block (struct dx_entry *entry, unsigned value) 190static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
190{ 191{
191 entry->block = cpu_to_le32(value); 192 entry->block = cpu_to_le32(value);
192} 193}
@@ -243,8 +244,8 @@ static void dx_show_index (char * label, struct dx_entry *entries)
243 int i, n = dx_get_count (entries); 244 int i, n = dx_get_count (entries);
244 printk("%s index ", label); 245 printk("%s index ", label);
245 for (i = 0; i < n; i++) { 246 for (i = 0; i < n; i++) {
246 printk("%x->%u ", i? dx_get_hash(entries + i) : 247 printk("%x->%lu ", i? dx_get_hash(entries + i) :
247 0, dx_get_block(entries + i)); 248 0, (unsigned long)dx_get_block(entries + i));
248 } 249 }
249 printk("\n"); 250 printk("\n");
250} 251}
@@ -280,7 +281,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
280 space += EXT4_DIR_REC_LEN(de->name_len); 281 space += EXT4_DIR_REC_LEN(de->name_len);
281 names++; 282 names++;
282 } 283 }
283 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); 284 de = ext4_next_entry(de);
284 } 285 }
285 printk("(%i)\n", names); 286 printk("(%i)\n", names);
286 return (struct stats) { names, space, 1 }; 287 return (struct stats) { names, space, 1 };
@@ -297,7 +298,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
297 printk("%i indexed blocks...\n", count); 298 printk("%i indexed blocks...\n", count);
298 for (i = 0; i < count; i++, entries++) 299 for (i = 0; i < count; i++, entries++)
299 { 300 {
300 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; 301 ext4_lblk_t block = dx_get_block(entries);
302 ext4_lblk_t hash = i ? dx_get_hash(entries): 0;
301 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; 303 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
302 struct stats stats; 304 struct stats stats;
303 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); 305 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
@@ -551,7 +553,8 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
551 */ 553 */
552static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) 554static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
553{ 555{
554 return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); 556 return (struct ext4_dir_entry_2 *)((char *)p +
557 ext4_rec_len_from_disk(p->rec_len));
555} 558}
556 559
557/* 560/*
@@ -560,7 +563,7 @@ static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *
560 * into the tree. If there is an error it is returned in err. 563 * into the tree. If there is an error it is returned in err.
561 */ 564 */
562static int htree_dirblock_to_tree(struct file *dir_file, 565static int htree_dirblock_to_tree(struct file *dir_file,
563 struct inode *dir, int block, 566 struct inode *dir, ext4_lblk_t block,
564 struct dx_hash_info *hinfo, 567 struct dx_hash_info *hinfo,
565 __u32 start_hash, __u32 start_minor_hash) 568 __u32 start_hash, __u32 start_minor_hash)
566{ 569{
@@ -568,7 +571,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
568 struct ext4_dir_entry_2 *de, *top; 571 struct ext4_dir_entry_2 *de, *top;
569 int err, count = 0; 572 int err, count = 0;
570 573
571 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); 574 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
575 (unsigned long)block));
572 if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) 576 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
573 return err; 577 return err;
574 578
@@ -620,9 +624,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
620 struct ext4_dir_entry_2 *de; 624 struct ext4_dir_entry_2 *de;
621 struct dx_frame frames[2], *frame; 625 struct dx_frame frames[2], *frame;
622 struct inode *dir; 626 struct inode *dir;
623 int block, err; 627 ext4_lblk_t block;
624 int count = 0; 628 int count = 0;
625 int ret; 629 int ret, err;
626 __u32 hashval; 630 __u32 hashval;
627 631
628 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 632 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
@@ -720,7 +724,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
720 cond_resched(); 724 cond_resched();
721 } 725 }
722 /* XXX: do we need to check rec_len == 0 case? -Chris */ 726 /* XXX: do we need to check rec_len == 0 case? -Chris */
723 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); 727 de = ext4_next_entry(de);
724 } 728 }
725 return count; 729 return count;
726} 730}
@@ -752,7 +756,7 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count)
752 } while(more); 756 } while(more);
753} 757}
754 758
755static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) 759static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
756{ 760{
757 struct dx_entry *entries = frame->entries; 761 struct dx_entry *entries = frame->entries;
758 struct dx_entry *old = frame->at, *new = old + 1; 762 struct dx_entry *old = frame->at, *new = old + 1;
@@ -820,7 +824,7 @@ static inline int search_dirblock(struct buffer_head * bh,
820 return 1; 824 return 1;
821 } 825 }
822 /* prevent looping on a bad block */ 826 /* prevent looping on a bad block */
823 de_len = le16_to_cpu(de->rec_len); 827 de_len = ext4_rec_len_from_disk(de->rec_len);
824 if (de_len <= 0) 828 if (de_len <= 0)
825 return -1; 829 return -1;
826 offset += de_len; 830 offset += de_len;
@@ -847,23 +851,20 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
847 struct super_block * sb; 851 struct super_block * sb;
848 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 852 struct buffer_head * bh_use[NAMEI_RA_SIZE];
849 struct buffer_head * bh, *ret = NULL; 853 struct buffer_head * bh, *ret = NULL;
850 unsigned long start, block, b; 854 ext4_lblk_t start, block, b;
851 int ra_max = 0; /* Number of bh's in the readahead 855 int ra_max = 0; /* Number of bh's in the readahead
852 buffer, bh_use[] */ 856 buffer, bh_use[] */
853 int ra_ptr = 0; /* Current index into readahead 857 int ra_ptr = 0; /* Current index into readahead
854 buffer */ 858 buffer */
855 int num = 0; 859 int num = 0;
856 int nblocks, i, err; 860 ext4_lblk_t nblocks;
861 int i, err;
857 struct inode *dir = dentry->d_parent->d_inode; 862 struct inode *dir = dentry->d_parent->d_inode;
858 int namelen; 863 int namelen;
859 const u8 *name;
860 unsigned blocksize;
861 864
862 *res_dir = NULL; 865 *res_dir = NULL;
863 sb = dir->i_sb; 866 sb = dir->i_sb;
864 blocksize = sb->s_blocksize;
865 namelen = dentry->d_name.len; 867 namelen = dentry->d_name.len;
866 name = dentry->d_name.name;
867 if (namelen > EXT4_NAME_LEN) 868 if (namelen > EXT4_NAME_LEN)
868 return NULL; 869 return NULL;
869 if (is_dx(dir)) { 870 if (is_dx(dir)) {
@@ -914,7 +915,8 @@ restart:
914 if (!buffer_uptodate(bh)) { 915 if (!buffer_uptodate(bh)) {
915 /* read error, skip block & hope for the best */ 916 /* read error, skip block & hope for the best */
916 ext4_error(sb, __FUNCTION__, "reading directory #%lu " 917 ext4_error(sb, __FUNCTION__, "reading directory #%lu "
917 "offset %lu", dir->i_ino, block); 918 "offset %lu", dir->i_ino,
919 (unsigned long)block);
918 brelse(bh); 920 brelse(bh);
919 goto next; 921 goto next;
920 } 922 }
@@ -961,7 +963,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
961 struct dx_frame frames[2], *frame; 963 struct dx_frame frames[2], *frame;
962 struct ext4_dir_entry_2 *de, *top; 964 struct ext4_dir_entry_2 *de, *top;
963 struct buffer_head *bh; 965 struct buffer_head *bh;
964 unsigned long block; 966 ext4_lblk_t block;
965 int retval; 967 int retval;
966 int namelen = dentry->d_name.len; 968 int namelen = dentry->d_name.len;
967 const u8 *name = dentry->d_name.name; 969 const u8 *name = dentry->d_name.name;
@@ -1128,7 +1130,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1128 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1130 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1129 memcpy (to, de, rec_len); 1131 memcpy (to, de, rec_len);
1130 ((struct ext4_dir_entry_2 *) to)->rec_len = 1132 ((struct ext4_dir_entry_2 *) to)->rec_len =
1131 cpu_to_le16(rec_len); 1133 ext4_rec_len_to_disk(rec_len);
1132 de->inode = 0; 1134 de->inode = 0;
1133 map++; 1135 map++;
1134 to += rec_len; 1136 to += rec_len;
@@ -1147,13 +1149,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
1147 1149
1148 prev = to = de; 1150 prev = to = de;
1149 while ((char*)de < base + size) { 1151 while ((char*)de < base + size) {
1150 next = (struct ext4_dir_entry_2 *) ((char *) de + 1152 next = ext4_next_entry(de);
1151 le16_to_cpu(de->rec_len));
1152 if (de->inode && de->name_len) { 1153 if (de->inode && de->name_len) {
1153 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1154 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1154 if (de > to) 1155 if (de > to)
1155 memmove(to, de, rec_len); 1156 memmove(to, de, rec_len);
1156 to->rec_len = cpu_to_le16(rec_len); 1157 to->rec_len = ext4_rec_len_to_disk(rec_len);
1157 prev = to; 1158 prev = to;
1158 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1159 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1159 } 1160 }
@@ -1174,7 +1175,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1174 unsigned blocksize = dir->i_sb->s_blocksize; 1175 unsigned blocksize = dir->i_sb->s_blocksize;
1175 unsigned count, continued; 1176 unsigned count, continued;
1176 struct buffer_head *bh2; 1177 struct buffer_head *bh2;
1177 u32 newblock; 1178 ext4_lblk_t newblock;
1178 u32 hash2; 1179 u32 hash2;
1179 struct dx_map_entry *map; 1180 struct dx_map_entry *map;
1180 char *data1 = (*bh)->b_data, *data2; 1181 char *data1 = (*bh)->b_data, *data2;
@@ -1221,14 +1222,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1221 split = count - move; 1222 split = count - move;
1222 hash2 = map[split].hash; 1223 hash2 = map[split].hash;
1223 continued = hash2 == map[split - 1].hash; 1224 continued = hash2 == map[split - 1].hash;
1224 dxtrace(printk("Split block %i at %x, %i/%i\n", 1225 dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
1225 dx_get_block(frame->at), hash2, split, count-split)); 1226 (unsigned long)dx_get_block(frame->at),
1227 hash2, split, count-split));
1226 1228
1227 /* Fancy dance to stay within two buffers */ 1229 /* Fancy dance to stay within two buffers */
1228 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1230 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1229 de = dx_pack_dirents(data1,blocksize); 1231 de = dx_pack_dirents(data1,blocksize);
1230 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); 1232 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1231 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); 1233 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1232 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1234 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1233 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1235 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1234 1236
@@ -1297,7 +1299,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1297 return -EEXIST; 1299 return -EEXIST;
1298 } 1300 }
1299 nlen = EXT4_DIR_REC_LEN(de->name_len); 1301 nlen = EXT4_DIR_REC_LEN(de->name_len);
1300 rlen = le16_to_cpu(de->rec_len); 1302 rlen = ext4_rec_len_from_disk(de->rec_len);
1301 if ((de->inode? rlen - nlen: rlen) >= reclen) 1303 if ((de->inode? rlen - nlen: rlen) >= reclen)
1302 break; 1304 break;
1303 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1305 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1316,11 +1318,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1316 1318
1317 /* By now the buffer is marked for journaling */ 1319 /* By now the buffer is marked for journaling */
1318 nlen = EXT4_DIR_REC_LEN(de->name_len); 1320 nlen = EXT4_DIR_REC_LEN(de->name_len);
1319 rlen = le16_to_cpu(de->rec_len); 1321 rlen = ext4_rec_len_from_disk(de->rec_len);
1320 if (de->inode) { 1322 if (de->inode) {
1321 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1323 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1322 de1->rec_len = cpu_to_le16(rlen - nlen); 1324 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
1323 de->rec_len = cpu_to_le16(nlen); 1325 de->rec_len = ext4_rec_len_to_disk(nlen);
1324 de = de1; 1326 de = de1;
1325 } 1327 }
1326 de->file_type = EXT4_FT_UNKNOWN; 1328 de->file_type = EXT4_FT_UNKNOWN;
@@ -1374,7 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1374 int retval; 1376 int retval;
1375 unsigned blocksize; 1377 unsigned blocksize;
1376 struct dx_hash_info hinfo; 1378 struct dx_hash_info hinfo;
1377 u32 block; 1379 ext4_lblk_t block;
1378 struct fake_dirent *fde; 1380 struct fake_dirent *fde;
1379 1381
1380 blocksize = dir->i_sb->s_blocksize; 1382 blocksize = dir->i_sb->s_blocksize;
@@ -1397,17 +1399,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1397 1399
1398 /* The 0th block becomes the root, move the dirents out */ 1400 /* The 0th block becomes the root, move the dirents out */
1399 fde = &root->dotdot; 1401 fde = &root->dotdot;
1400 de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); 1402 de = (struct ext4_dir_entry_2 *)((char *)fde +
1403 ext4_rec_len_from_disk(fde->rec_len));
1401 len = ((char *) root) + blocksize - (char *) de; 1404 len = ((char *) root) + blocksize - (char *) de;
1402 memcpy (data1, de, len); 1405 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1406 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1407 top = data1 + len;
1405 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top) 1408 while ((char *)(de2 = ext4_next_entry(de)) < top)
1406 de = de2; 1409 de = de2;
1407 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); 1410 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1408 /* Initialize the root; the dot dirents already exist */ 1411 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1412 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2)); 1413 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
1411 memset (&root->info, 0, sizeof(root->info)); 1414 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1415 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1416 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1454,7 +1457,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1454 int retval; 1457 int retval;
1455 int dx_fallback=0; 1458 int dx_fallback=0;
1456 unsigned blocksize; 1459 unsigned blocksize;
1457 u32 block, blocks; 1460 ext4_lblk_t block, blocks;
1458 1461
1459 sb = dir->i_sb; 1462 sb = dir->i_sb;
1460 blocksize = sb->s_blocksize; 1463 blocksize = sb->s_blocksize;
@@ -1487,7 +1490,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1487 return retval; 1490 return retval;
1488 de = (struct ext4_dir_entry_2 *) bh->b_data; 1491 de = (struct ext4_dir_entry_2 *) bh->b_data;
1489 de->inode = 0; 1492 de->inode = 0;
1490 de->rec_len = cpu_to_le16(blocksize); 1493 de->rec_len = ext4_rec_len_to_disk(blocksize);
1491 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1494 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1492} 1495}
1493 1496
@@ -1531,7 +1534,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1531 dx_get_count(entries), dx_get_limit(entries))); 1534 dx_get_count(entries), dx_get_limit(entries)));
1532 /* Need to split index? */ 1535 /* Need to split index? */
1533 if (dx_get_count(entries) == dx_get_limit(entries)) { 1536 if (dx_get_count(entries) == dx_get_limit(entries)) {
1534 u32 newblock; 1537 ext4_lblk_t newblock;
1535 unsigned icount = dx_get_count(entries); 1538 unsigned icount = dx_get_count(entries);
1536 int levels = frame - frames; 1539 int levels = frame - frames;
1537 struct dx_entry *entries2; 1540 struct dx_entry *entries2;
@@ -1550,7 +1553,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1550 goto cleanup; 1553 goto cleanup;
1551 node2 = (struct dx_node *)(bh2->b_data); 1554 node2 = (struct dx_node *)(bh2->b_data);
1552 entries2 = node2->entries; 1555 entries2 = node2->entries;
1553 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); 1556 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
1554 node2->fake.inode = 0; 1557 node2->fake.inode = 0;
1555 BUFFER_TRACE(frame->bh, "get_write_access"); 1558 BUFFER_TRACE(frame->bh, "get_write_access");
1556 err = ext4_journal_get_write_access(handle, frame->bh); 1559 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1648,9 +1651,9 @@ static int ext4_delete_entry (handle_t *handle,
1648 BUFFER_TRACE(bh, "get_write_access"); 1651 BUFFER_TRACE(bh, "get_write_access");
1649 ext4_journal_get_write_access(handle, bh); 1652 ext4_journal_get_write_access(handle, bh);
1650 if (pde) 1653 if (pde)
1651 pde->rec_len = 1654 pde->rec_len = ext4_rec_len_to_disk(
1652 cpu_to_le16(le16_to_cpu(pde->rec_len) + 1655 ext4_rec_len_from_disk(pde->rec_len) +
1653 le16_to_cpu(de->rec_len)); 1656 ext4_rec_len_from_disk(de->rec_len));
1654 else 1657 else
1655 de->inode = 0; 1658 de->inode = 0;
1656 dir->i_version++; 1659 dir->i_version++;
@@ -1658,10 +1661,9 @@ static int ext4_delete_entry (handle_t *handle,
1658 ext4_journal_dirty_metadata(handle, bh); 1661 ext4_journal_dirty_metadata(handle, bh);
1659 return 0; 1662 return 0;
1660 } 1663 }
1661 i += le16_to_cpu(de->rec_len); 1664 i += ext4_rec_len_from_disk(de->rec_len);
1662 pde = de; 1665 pde = de;
1663 de = (struct ext4_dir_entry_2 *) 1666 de = ext4_next_entry(de);
1664 ((char *) de + le16_to_cpu(de->rec_len));
1665 } 1667 }
1666 return -ENOENT; 1668 return -ENOENT;
1667} 1669}
@@ -1824,13 +1826,13 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1826 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1827 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1828 de->name_len = 1;
1827 de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len)); 1829 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
1828 strcpy (de->name, "."); 1830 strcpy (de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1831 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = (struct ext4_dir_entry_2 *) 1832 de = ext4_next_entry(de);
1831 ((char *) de + le16_to_cpu(de->rec_len));
1832 de->inode = cpu_to_le32(dir->i_ino); 1833 de->inode = cpu_to_le32(dir->i_ino);
1833 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1)); 1834 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
1835 EXT4_DIR_REC_LEN(1));
1834 de->name_len = 2; 1836 de->name_len = 2;
1835 strcpy (de->name, ".."); 1837 strcpy (de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1838 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1882,8 +1884,7 @@ static int empty_dir (struct inode * inode)
1882 return 1; 1884 return 1;
1883 } 1885 }
1884 de = (struct ext4_dir_entry_2 *) bh->b_data; 1886 de = (struct ext4_dir_entry_2 *) bh->b_data;
1885 de1 = (struct ext4_dir_entry_2 *) 1887 de1 = ext4_next_entry(de);
1886 ((char *) de + le16_to_cpu(de->rec_len));
1887 if (le32_to_cpu(de->inode) != inode->i_ino || 1888 if (le32_to_cpu(de->inode) != inode->i_ino ||
1888 !le32_to_cpu(de1->inode) || 1889 !le32_to_cpu(de1->inode) ||
1889 strcmp (".", de->name) || 1890 strcmp (".", de->name) ||
@@ -1894,9 +1895,9 @@ static int empty_dir (struct inode * inode)
1894 brelse (bh); 1895 brelse (bh);
1895 return 1; 1896 return 1;
1896 } 1897 }
1897 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); 1898 offset = ext4_rec_len_from_disk(de->rec_len) +
1898 de = (struct ext4_dir_entry_2 *) 1899 ext4_rec_len_from_disk(de1->rec_len);
1899 ((char *) de1 + le16_to_cpu(de1->rec_len)); 1900 de = ext4_next_entry(de1);
1900 while (offset < inode->i_size ) { 1901 while (offset < inode->i_size ) {
1901 if (!bh || 1902 if (!bh ||
1902 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1903 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1925,9 +1926,8 @@ static int empty_dir (struct inode * inode)
1925 brelse (bh); 1926 brelse (bh);
1926 return 0; 1927 return 0;
1927 } 1928 }
1928 offset += le16_to_cpu(de->rec_len); 1929 offset += ext4_rec_len_from_disk(de->rec_len);
1929 de = (struct ext4_dir_entry_2 *) 1930 de = ext4_next_entry(de);
1930 ((char *) de + le16_to_cpu(de->rec_len));
1931 } 1931 }
1932 brelse (bh); 1932 brelse (bh);
1933 return 1; 1933 return 1;
@@ -2282,8 +2282,7 @@ retry:
2282} 2282}
2283 2283
2284#define PARENT_INO(buffer) \ 2284#define PARENT_INO(buffer) \
2285 ((struct ext4_dir_entry_2 *) ((char *) buffer + \ 2285 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
2286 le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
2287 2286
2288/* 2287/*
2289 * Anybody can rename anything with this: the permission checks are left to the 2288 * Anybody can rename anything with this: the permission checks are left to the
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bd8a52bb3999..4fbba60816f4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -28,7 +28,7 @@ static int verify_group_input(struct super_block *sb,
28 struct ext4_super_block *es = sbi->s_es; 28 struct ext4_super_block *es = sbi->s_es;
29 ext4_fsblk_t start = ext4_blocks_count(es); 29 ext4_fsblk_t start = ext4_blocks_count(es);
30 ext4_fsblk_t end = start + input->blocks_count; 30 ext4_fsblk_t end = start + input->blocks_count;
31 unsigned group = input->group; 31 ext4_group_t group = input->group;
32 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 32 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
33 unsigned overhead = ext4_bg_has_super(sb, group) ? 33 unsigned overhead = ext4_bg_has_super(sb, group) ?
34 (1 + ext4_bg_num_gdb(sb, group) + 34 (1 + ext4_bg_num_gdb(sb, group) +
@@ -206,7 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
206 } 206 }
207 207
208 if (ext4_bg_has_super(sb, input->group)) { 208 if (ext4_bg_has_super(sb, input->group)) {
209 ext4_debug("mark backup superblock %#04lx (+0)\n", start); 209 ext4_debug("mark backup superblock %#04llx (+0)\n", start);
210 ext4_set_bit(0, bh->b_data); 210 ext4_set_bit(0, bh->b_data);
211 } 211 }
212 212
@@ -215,7 +215,7 @@ static int setup_new_group_blocks(struct super_block *sb,
215 i < gdblocks; i++, block++, bit++) { 215 i < gdblocks; i++, block++, bit++) {
216 struct buffer_head *gdb; 216 struct buffer_head *gdb;
217 217
218 ext4_debug("update backup group %#04lx (+%d)\n", block, bit); 218 ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
219 219
220 if ((err = extend_or_restart_transaction(handle, 1, bh))) 220 if ((err = extend_or_restart_transaction(handle, 1, bh)))
221 goto exit_bh; 221 goto exit_bh;
@@ -243,7 +243,7 @@ static int setup_new_group_blocks(struct super_block *sb,
243 i < reserved_gdb; i++, block++, bit++) { 243 i < reserved_gdb; i++, block++, bit++) {
244 struct buffer_head *gdb; 244 struct buffer_head *gdb;
245 245
246 ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit); 246 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
247 247
248 if ((err = extend_or_restart_transaction(handle, 1, bh))) 248 if ((err = extend_or_restart_transaction(handle, 1, bh)))
249 goto exit_bh; 249 goto exit_bh;
@@ -256,10 +256,10 @@ static int setup_new_group_blocks(struct super_block *sb,
256 ext4_set_bit(bit, bh->b_data); 256 ext4_set_bit(bit, bh->b_data);
257 brelse(gdb); 257 brelse(gdb);
258 } 258 }
259 ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, 259 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
260 input->block_bitmap - start); 260 input->block_bitmap - start);
261 ext4_set_bit(input->block_bitmap - start, bh->b_data); 261 ext4_set_bit(input->block_bitmap - start, bh->b_data);
262 ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, 262 ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
263 input->inode_bitmap - start); 263 input->inode_bitmap - start);
264 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 264 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
265 265
@@ -268,7 +268,7 @@ static int setup_new_group_blocks(struct super_block *sb,
268 i < sbi->s_itb_per_group; i++, bit++, block++) { 268 i < sbi->s_itb_per_group; i++, bit++, block++) {
269 struct buffer_head *it; 269 struct buffer_head *it;
270 270
271 ext4_debug("clear inode block %#04lx (+%d)\n", block, bit); 271 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
272 272
273 if ((err = extend_or_restart_transaction(handle, 1, bh))) 273 if ((err = extend_or_restart_transaction(handle, 1, bh)))
274 goto exit_bh; 274 goto exit_bh;
@@ -291,7 +291,7 @@ static int setup_new_group_blocks(struct super_block *sb,
291 brelse(bh); 291 brelse(bh);
292 292
293 /* Mark unused entries in inode bitmap used */ 293 /* Mark unused entries in inode bitmap used */
294 ext4_debug("clear inode bitmap %#04x (+%ld)\n", 294 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
295 input->inode_bitmap, input->inode_bitmap - start); 295 input->inode_bitmap, input->inode_bitmap - start);
296 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { 296 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
297 err = PTR_ERR(bh); 297 err = PTR_ERR(bh);
@@ -357,7 +357,7 @@ static int verify_reserved_gdb(struct super_block *sb,
357 struct buffer_head *primary) 357 struct buffer_head *primary)
358{ 358{
359 const ext4_fsblk_t blk = primary->b_blocknr; 359 const ext4_fsblk_t blk = primary->b_blocknr;
360 const unsigned long end = EXT4_SB(sb)->s_groups_count; 360 const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
361 unsigned three = 1; 361 unsigned three = 1;
362 unsigned five = 5; 362 unsigned five = 5;
363 unsigned seven = 7; 363 unsigned seven = 7;
@@ -656,12 +656,12 @@ static void update_backups(struct super_block *sb,
656 int blk_off, char *data, int size) 656 int blk_off, char *data, int size)
657{ 657{
658 struct ext4_sb_info *sbi = EXT4_SB(sb); 658 struct ext4_sb_info *sbi = EXT4_SB(sb);
659 const unsigned long last = sbi->s_groups_count; 659 const ext4_group_t last = sbi->s_groups_count;
660 const int bpg = EXT4_BLOCKS_PER_GROUP(sb); 660 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
661 unsigned three = 1; 661 unsigned three = 1;
662 unsigned five = 5; 662 unsigned five = 5;
663 unsigned seven = 7; 663 unsigned seven = 7;
664 unsigned group; 664 ext4_group_t group;
665 int rest = sb->s_blocksize - size; 665 int rest = sb->s_blocksize - size;
666 handle_t *handle; 666 handle_t *handle;
667 int err = 0, err2; 667 int err = 0, err2;
@@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb,
716exit_err: 716exit_err:
717 if (err) { 717 if (err) {
718 ext4_warning(sb, __FUNCTION__, 718 ext4_warning(sb, __FUNCTION__,
719 "can't update backup for group %d (err %d), " 719 "can't update backup for group %lu (err %d), "
720 "forcing fsck on next reboot", group, err); 720 "forcing fsck on next reboot", group, err);
721 sbi->s_mount_state &= ~EXT4_VALID_FS; 721 sbi->s_mount_state &= ~EXT4_VALID_FS;
722 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 722 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -952,7 +952,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
952 ext4_fsblk_t n_blocks_count) 952 ext4_fsblk_t n_blocks_count)
953{ 953{
954 ext4_fsblk_t o_blocks_count; 954 ext4_fsblk_t o_blocks_count;
955 unsigned long o_groups_count; 955 ext4_group_t o_groups_count;
956 ext4_grpblk_t last; 956 ext4_grpblk_t last;
957 ext4_grpblk_t add; 957 ext4_grpblk_t add;
958 struct buffer_head * bh; 958 struct buffer_head * bh;
@@ -1054,7 +1054,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1054 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1054 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1055 sb->s_dirt = 1; 1055 sb->s_dirt = 1;
1056 unlock_super(sb); 1056 unlock_super(sb);
1057 ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count, 1057 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1058 o_blocks_count + add); 1058 o_blocks_count + add);
1059 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1059 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1060 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1060 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8031dc0e24e5..055a0cd0168e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -373,6 +373,66 @@ void ext4_update_dynamic_rev(struct super_block *sb)
373 */ 373 */
374} 374}
375 375
376int ext4_update_compat_feature(handle_t *handle,
377 struct super_block *sb, __u32 compat)
378{
379 int err = 0;
380 if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
381 err = ext4_journal_get_write_access(handle,
382 EXT4_SB(sb)->s_sbh);
383 if (err)
384 return err;
385 EXT4_SET_COMPAT_FEATURE(sb, compat);
386 sb->s_dirt = 1;
387 handle->h_sync = 1;
388 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
389 "call ext4_journal_dirty_met adata");
390 err = ext4_journal_dirty_metadata(handle,
391 EXT4_SB(sb)->s_sbh);
392 }
393 return err;
394}
395
396int ext4_update_rocompat_feature(handle_t *handle,
397 struct super_block *sb, __u32 rocompat)
398{
399 int err = 0;
400 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
401 err = ext4_journal_get_write_access(handle,
402 EXT4_SB(sb)->s_sbh);
403 if (err)
404 return err;
405 EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
406 sb->s_dirt = 1;
407 handle->h_sync = 1;
408 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
409 "call ext4_journal_dirty_met adata");
410 err = ext4_journal_dirty_metadata(handle,
411 EXT4_SB(sb)->s_sbh);
412 }
413 return err;
414}
415
416int ext4_update_incompat_feature(handle_t *handle,
417 struct super_block *sb, __u32 incompat)
418{
419 int err = 0;
420 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
421 err = ext4_journal_get_write_access(handle,
422 EXT4_SB(sb)->s_sbh);
423 if (err)
424 return err;
425 EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
426 sb->s_dirt = 1;
427 handle->h_sync = 1;
428 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
429 "call ext4_journal_dirty_met adata");
430 err = ext4_journal_dirty_metadata(handle,
431 EXT4_SB(sb)->s_sbh);
432 }
433 return err;
434}
435
376/* 436/*
377 * Open the external journal device 437 * Open the external journal device
378 */ 438 */
@@ -443,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
443 struct ext4_super_block *es = sbi->s_es; 503 struct ext4_super_block *es = sbi->s_es;
444 int i; 504 int i;
445 505
506 ext4_mb_release(sb);
446 ext4_ext_release(sb); 507 ext4_ext_release(sb);
447 ext4_xattr_put_super(sb); 508 ext4_xattr_put_super(sb);
448 jbd2_journal_destroy(sbi->s_journal); 509 jbd2_journal_destroy(sbi->s_journal);
@@ -509,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
509 ei->i_block_alloc_info = NULL; 570 ei->i_block_alloc_info = NULL;
510 ei->vfs_inode.i_version = 1; 571 ei->vfs_inode.i_version = 1;
511 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 572 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
573 INIT_LIST_HEAD(&ei->i_prealloc_list);
574 spin_lock_init(&ei->i_prealloc_lock);
512 return &ei->vfs_inode; 575 return &ei->vfs_inode;
513} 576}
514 577
@@ -533,7 +596,7 @@ static void init_once(struct kmem_cache *cachep, void *foo)
533#ifdef CONFIG_EXT4DEV_FS_XATTR 596#ifdef CONFIG_EXT4DEV_FS_XATTR
534 init_rwsem(&ei->xattr_sem); 597 init_rwsem(&ei->xattr_sem);
535#endif 598#endif
536 mutex_init(&ei->truncate_mutex); 599 init_rwsem(&ei->i_data_sem);
537 inode_init_once(&ei->vfs_inode); 600 inode_init_once(&ei->vfs_inode);
538} 601}
539 602
@@ -605,18 +668,20 @@ static inline void ext4_show_quota_options(struct seq_file *seq, struct super_bl
605 */ 668 */
606static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) 669static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
607{ 670{
671 int def_errors;
672 unsigned long def_mount_opts;
608 struct super_block *sb = vfs->mnt_sb; 673 struct super_block *sb = vfs->mnt_sb;
609 struct ext4_sb_info *sbi = EXT4_SB(sb); 674 struct ext4_sb_info *sbi = EXT4_SB(sb);
610 struct ext4_super_block *es = sbi->s_es; 675 struct ext4_super_block *es = sbi->s_es;
611 unsigned long def_mount_opts;
612 676
613 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 677 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
678 def_errors = le16_to_cpu(es->s_errors);
614 679
615 if (sbi->s_sb_block != 1) 680 if (sbi->s_sb_block != 1)
616 seq_printf(seq, ",sb=%llu", sbi->s_sb_block); 681 seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
617 if (test_opt(sb, MINIX_DF)) 682 if (test_opt(sb, MINIX_DF))
618 seq_puts(seq, ",minixdf"); 683 seq_puts(seq, ",minixdf");
619 if (test_opt(sb, GRPID)) 684 if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
620 seq_puts(seq, ",grpid"); 685 seq_puts(seq, ",grpid");
621 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) 686 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
622 seq_puts(seq, ",nogrpid"); 687 seq_puts(seq, ",nogrpid");
@@ -628,34 +693,33 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
628 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { 693 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
629 seq_printf(seq, ",resgid=%u", sbi->s_resgid); 694 seq_printf(seq, ",resgid=%u", sbi->s_resgid);
630 } 695 }
631 if (test_opt(sb, ERRORS_CONT)) { 696 if (test_opt(sb, ERRORS_RO)) {
632 int def_errors = le16_to_cpu(es->s_errors);
633
634 if (def_errors == EXT4_ERRORS_PANIC || 697 if (def_errors == EXT4_ERRORS_PANIC ||
635 def_errors == EXT4_ERRORS_RO) { 698 def_errors == EXT4_ERRORS_CONTINUE) {
636 seq_puts(seq, ",errors=continue"); 699 seq_puts(seq, ",errors=remount-ro");
637 } 700 }
638 } 701 }
639 if (test_opt(sb, ERRORS_RO)) 702 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
640 seq_puts(seq, ",errors=remount-ro"); 703 seq_puts(seq, ",errors=continue");
641 if (test_opt(sb, ERRORS_PANIC)) 704 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
642 seq_puts(seq, ",errors=panic"); 705 seq_puts(seq, ",errors=panic");
643 if (test_opt(sb, NO_UID32)) 706 if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
644 seq_puts(seq, ",nouid32"); 707 seq_puts(seq, ",nouid32");
645 if (test_opt(sb, DEBUG)) 708 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
646 seq_puts(seq, ",debug"); 709 seq_puts(seq, ",debug");
647 if (test_opt(sb, OLDALLOC)) 710 if (test_opt(sb, OLDALLOC))
648 seq_puts(seq, ",oldalloc"); 711 seq_puts(seq, ",oldalloc");
649#ifdef CONFIG_EXT4_FS_XATTR 712#ifdef CONFIG_EXT4DEV_FS_XATTR
650 if (test_opt(sb, XATTR_USER)) 713 if (test_opt(sb, XATTR_USER) &&
714 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
651 seq_puts(seq, ",user_xattr"); 715 seq_puts(seq, ",user_xattr");
652 if (!test_opt(sb, XATTR_USER) && 716 if (!test_opt(sb, XATTR_USER) &&
653 (def_mount_opts & EXT4_DEFM_XATTR_USER)) { 717 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
654 seq_puts(seq, ",nouser_xattr"); 718 seq_puts(seq, ",nouser_xattr");
655 } 719 }
656#endif 720#endif
657#ifdef CONFIG_EXT4_FS_POSIX_ACL 721#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
658 if (test_opt(sb, POSIX_ACL)) 722 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
659 seq_puts(seq, ",acl"); 723 seq_puts(seq, ",acl");
660 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 724 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
661 seq_puts(seq, ",noacl"); 725 seq_puts(seq, ",noacl");
@@ -672,7 +736,17 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
672 seq_puts(seq, ",nobh"); 736 seq_puts(seq, ",nobh");
673 if (!test_opt(sb, EXTENTS)) 737 if (!test_opt(sb, EXTENTS))
674 seq_puts(seq, ",noextents"); 738 seq_puts(seq, ",noextents");
739 if (!test_opt(sb, MBALLOC))
740 seq_puts(seq, ",nomballoc");
741 if (test_opt(sb, I_VERSION))
742 seq_puts(seq, ",i_version");
675 743
744 if (sbi->s_stripe)
745 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
746 /*
747 * journal mode get enabled in different ways
748 * So just print the value even if we didn't specify it
749 */
676 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 750 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
677 seq_puts(seq, ",data=journal"); 751 seq_puts(seq, ",data=journal");
678 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 752 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -681,7 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681 seq_puts(seq, ",data=writeback"); 755 seq_puts(seq, ",data=writeback");
682 756
683 ext4_show_quota_options(seq, sb); 757 ext4_show_quota_options(seq, sb);
684
685 return 0; 758 return 0;
686} 759}
687 760
@@ -809,11 +882,13 @@ enum {
809 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 882 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
810 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 883 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
811 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 884 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
885 Opt_journal_checksum, Opt_journal_async_commit,
812 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 886 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
813 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 887 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
814 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 888 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
815 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 889 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
816 Opt_grpquota, Opt_extents, Opt_noextents, 890 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
891 Opt_mballoc, Opt_nomballoc, Opt_stripe,
817}; 892};
818 893
819static match_table_t tokens = { 894static match_table_t tokens = {
@@ -848,6 +923,8 @@ static match_table_t tokens = {
848 {Opt_journal_update, "journal=update"}, 923 {Opt_journal_update, "journal=update"},
849 {Opt_journal_inum, "journal=%u"}, 924 {Opt_journal_inum, "journal=%u"},
850 {Opt_journal_dev, "journal_dev=%u"}, 925 {Opt_journal_dev, "journal_dev=%u"},
926 {Opt_journal_checksum, "journal_checksum"},
927 {Opt_journal_async_commit, "journal_async_commit"},
851 {Opt_abort, "abort"}, 928 {Opt_abort, "abort"},
852 {Opt_data_journal, "data=journal"}, 929 {Opt_data_journal, "data=journal"},
853 {Opt_data_ordered, "data=ordered"}, 930 {Opt_data_ordered, "data=ordered"},
@@ -865,6 +942,10 @@ static match_table_t tokens = {
865 {Opt_barrier, "barrier=%u"}, 942 {Opt_barrier, "barrier=%u"},
866 {Opt_extents, "extents"}, 943 {Opt_extents, "extents"},
867 {Opt_noextents, "noextents"}, 944 {Opt_noextents, "noextents"},
945 {Opt_i_version, "i_version"},
946 {Opt_mballoc, "mballoc"},
947 {Opt_nomballoc, "nomballoc"},
948 {Opt_stripe, "stripe=%u"},
868 {Opt_err, NULL}, 949 {Opt_err, NULL},
869 {Opt_resize, "resize"}, 950 {Opt_resize, "resize"},
870}; 951};
@@ -1035,6 +1116,13 @@ static int parse_options (char *options, struct super_block *sb,
1035 return 0; 1116 return 0;
1036 *journal_devnum = option; 1117 *journal_devnum = option;
1037 break; 1118 break;
1119 case Opt_journal_checksum:
1120 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1121 break;
1122 case Opt_journal_async_commit:
1123 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1124 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1125 break;
1038 case Opt_noload: 1126 case Opt_noload:
1039 set_opt (sbi->s_mount_opt, NOLOAD); 1127 set_opt (sbi->s_mount_opt, NOLOAD);
1040 break; 1128 break;
@@ -1203,6 +1291,23 @@ clear_qf_name:
1203 case Opt_noextents: 1291 case Opt_noextents:
1204 clear_opt (sbi->s_mount_opt, EXTENTS); 1292 clear_opt (sbi->s_mount_opt, EXTENTS);
1205 break; 1293 break;
1294 case Opt_i_version:
1295 set_opt(sbi->s_mount_opt, I_VERSION);
1296 sb->s_flags |= MS_I_VERSION;
1297 break;
1298 case Opt_mballoc:
1299 set_opt(sbi->s_mount_opt, MBALLOC);
1300 break;
1301 case Opt_nomballoc:
1302 clear_opt(sbi->s_mount_opt, MBALLOC);
1303 break;
1304 case Opt_stripe:
1305 if (match_int(&args[0], &option))
1306 return 0;
1307 if (option < 0)
1308 return 0;
1309 sbi->s_stripe = option;
1310 break;
1206 default: 1311 default:
1207 printk (KERN_ERR 1312 printk (KERN_ERR
1208 "EXT4-fs: Unrecognized mount option \"%s\" " 1313 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1364,7 +1469,7 @@ static int ext4_check_descriptors (struct super_block * sb)
1364 struct ext4_group_desc * gdp = NULL; 1469 struct ext4_group_desc * gdp = NULL;
1365 int desc_block = 0; 1470 int desc_block = 0;
1366 int flexbg_flag = 0; 1471 int flexbg_flag = 0;
1367 int i; 1472 ext4_group_t i;
1368 1473
1369 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 1474 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1370 flexbg_flag = 1; 1475 flexbg_flag = 1;
@@ -1386,7 +1491,7 @@ static int ext4_check_descriptors (struct super_block * sb)
1386 if (block_bitmap < first_block || block_bitmap > last_block) 1491 if (block_bitmap < first_block || block_bitmap > last_block)
1387 { 1492 {
1388 ext4_error (sb, "ext4_check_descriptors", 1493 ext4_error (sb, "ext4_check_descriptors",
1389 "Block bitmap for group %d" 1494 "Block bitmap for group %lu"
1390 " not in group (block %llu)!", 1495 " not in group (block %llu)!",
1391 i, block_bitmap); 1496 i, block_bitmap);
1392 return 0; 1497 return 0;
@@ -1395,7 +1500,7 @@ static int ext4_check_descriptors (struct super_block * sb)
1395 if (inode_bitmap < first_block || inode_bitmap > last_block) 1500 if (inode_bitmap < first_block || inode_bitmap > last_block)
1396 { 1501 {
1397 ext4_error (sb, "ext4_check_descriptors", 1502 ext4_error (sb, "ext4_check_descriptors",
1398 "Inode bitmap for group %d" 1503 "Inode bitmap for group %lu"
1399 " not in group (block %llu)!", 1504 " not in group (block %llu)!",
1400 i, inode_bitmap); 1505 i, inode_bitmap);
1401 return 0; 1506 return 0;
@@ -1405,17 +1510,16 @@ static int ext4_check_descriptors (struct super_block * sb)
1405 inode_table + sbi->s_itb_per_group - 1 > last_block) 1510 inode_table + sbi->s_itb_per_group - 1 > last_block)
1406 { 1511 {
1407 ext4_error (sb, "ext4_check_descriptors", 1512 ext4_error (sb, "ext4_check_descriptors",
1408 "Inode table for group %d" 1513 "Inode table for group %lu"
1409 " not in group (block %llu)!", 1514 " not in group (block %llu)!",
1410 i, inode_table); 1515 i, inode_table);
1411 return 0; 1516 return 0;
1412 } 1517 }
1413 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1518 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1414 ext4_error(sb, __FUNCTION__, 1519 ext4_error(sb, __FUNCTION__,
1415 "Checksum for group %d failed (%u!=%u)\n", i, 1520 "Checksum for group %lu failed (%u!=%u)\n",
1416 le16_to_cpu(ext4_group_desc_csum(sbi, i, 1521 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1417 gdp)), 1522 gdp)), le16_to_cpu(gdp->bg_checksum));
1418 le16_to_cpu(gdp->bg_checksum));
1419 return 0; 1523 return 0;
1420 } 1524 }
1421 if (!flexbg_flag) 1525 if (!flexbg_flag)
@@ -1429,7 +1533,6 @@ static int ext4_check_descriptors (struct super_block * sb)
1429 return 1; 1533 return 1;
1430} 1534}
1431 1535
1432
1433/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at 1536/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1434 * the superblock) which were deleted from all directories, but held open by 1537 * the superblock) which were deleted from all directories, but held open by
1435 * a process at the time of a crash. We walk the list and try to delete these 1538 * a process at the time of a crash. We walk the list and try to delete these
@@ -1542,20 +1645,95 @@ static void ext4_orphan_cleanup (struct super_block * sb,
1542#endif 1645#endif
1543 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1646 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1544} 1647}
1648/*
1649 * Maximal extent format file size.
1650 * Resulting logical blkno at s_maxbytes must fit in our on-disk
1651 * extent format containers, within a sector_t, and within i_blocks
1652 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
1653 * so that won't be a limiting factor.
1654 *
1655 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
1656 */
1657static loff_t ext4_max_size(int blkbits)
1658{
1659 loff_t res;
1660 loff_t upper_limit = MAX_LFS_FILESIZE;
1661
1662 /* small i_blocks in vfs inode? */
1663 if (sizeof(blkcnt_t) < sizeof(u64)) {
1664 /*
1665 * CONFIG_LSF is not enabled implies the inode
1666 * i_block represent total blocks in 512 bytes
1667 * 32 == size of vfs inode i_blocks * 8
1668 */
1669 upper_limit = (1LL << 32) - 1;
1670
1671 /* total blocks in file system block size */
1672 upper_limit >>= (blkbits - 9);
1673 upper_limit <<= blkbits;
1674 }
1675
1676 /* 32-bit extent-start container, ee_block */
1677 res = 1LL << 32;
1678 res <<= blkbits;
1679 res -= 1;
1680
1681 /* Sanity check against vm- & vfs- imposed limits */
1682 if (res > upper_limit)
1683 res = upper_limit;
1684
1685 return res;
1686}
1545 1687
1546/* 1688/*
1547 * Maximal file size. There is a direct, and {,double-,triple-}indirect 1689 * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect
1548 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. 1690 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
1549 * We need to be 1 filesystem block less than the 2^32 sector limit. 1691 * We need to be 1 filesystem block less than the 2^48 sector limit.
1550 */ 1692 */
1551static loff_t ext4_max_size(int bits) 1693static loff_t ext4_max_bitmap_size(int bits)
1552{ 1694{
1553 loff_t res = EXT4_NDIR_BLOCKS; 1695 loff_t res = EXT4_NDIR_BLOCKS;
1554 /* This constant is calculated to be the largest file size for a 1696 int meta_blocks;
1555 * dense, 4k-blocksize file such that the total number of 1697 loff_t upper_limit;
1698 /* This is calculated to be the largest file size for a
1699 * dense, bitmapped file such that the total number of
1556 * sectors in the file, including data and all indirect blocks, 1700 * sectors in the file, including data and all indirect blocks,
1557 * does not exceed 2^32. */ 1701 * does not exceed 2^48 -1
1558 const loff_t upper_limit = 0x1ff7fffd000LL; 1702 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
1703 * total number of 512 bytes blocks of the file
1704 */
1705
1706 if (sizeof(blkcnt_t) < sizeof(u64)) {
1707 /*
1708 * CONFIG_LSF is not enabled implies the inode
1709 * i_block represent total blocks in 512 bytes
1710 * 32 == size of vfs inode i_blocks * 8
1711 */
1712 upper_limit = (1LL << 32) - 1;
1713
1714 /* total blocks in file system block size */
1715 upper_limit >>= (bits - 9);
1716
1717 } else {
1718 /*
1719 * We use 48 bit ext4_inode i_blocks
1720 * With EXT4_HUGE_FILE_FL set the i_blocks
1721 * represent total number of blocks in
1722 * file system block size
1723 */
1724 upper_limit = (1LL << 48) - 1;
1725
1726 }
1727
1728 /* indirect blocks */
1729 meta_blocks = 1;
1730 /* double indirect blocks */
1731 meta_blocks += 1 + (1LL << (bits-2));
1732 /* tripple indirect blocks */
1733 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1734
1735 upper_limit -= meta_blocks;
1736 upper_limit <<= bits;
1559 1737
1560 res += 1LL << (bits-2); 1738 res += 1LL << (bits-2);
1561 res += 1LL << (2*(bits-2)); 1739 res += 1LL << (2*(bits-2));
@@ -1563,6 +1741,10 @@ static loff_t ext4_max_size(int bits)
1563 res <<= bits; 1741 res <<= bits;
1564 if (res > upper_limit) 1742 if (res > upper_limit)
1565 res = upper_limit; 1743 res = upper_limit;
1744
1745 if (res > MAX_LFS_FILESIZE)
1746 res = MAX_LFS_FILESIZE;
1747
1566 return res; 1748 return res;
1567} 1749}
1568 1750
@@ -1570,7 +1752,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1570 ext4_fsblk_t logical_sb_block, int nr) 1752 ext4_fsblk_t logical_sb_block, int nr)
1571{ 1753{
1572 struct ext4_sb_info *sbi = EXT4_SB(sb); 1754 struct ext4_sb_info *sbi = EXT4_SB(sb);
1573 unsigned long bg, first_meta_bg; 1755 ext4_group_t bg, first_meta_bg;
1574 int has_super = 0; 1756 int has_super = 0;
1575 1757
1576 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); 1758 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
@@ -1584,8 +1766,39 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1584 return (has_super + ext4_group_first_block_no(sb, bg)); 1766 return (has_super + ext4_group_first_block_no(sb, bg));
1585} 1767}
1586 1768
1769/**
1770 * ext4_get_stripe_size: Get the stripe size.
1771 * @sbi: In memory super block info
1772 *
1773 * If we have specified it via mount option, then
1774 * use the mount option value. If the value specified at mount time is
1775 * greater than the blocks per group use the super block value.
1776 * If the super block value is greater than blocks per group return 0.
1777 * Allocator needs it be less than blocks per group.
1778 *
1779 */
1780static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1781{
1782 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
1783 unsigned long stripe_width =
1784 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
1785
1786 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
1787 return sbi->s_stripe;
1788
1789 if (stripe_width <= sbi->s_blocks_per_group)
1790 return stripe_width;
1791
1792 if (stride <= sbi->s_blocks_per_group)
1793 return stride;
1794
1795 return 0;
1796}
1587 1797
1588static int ext4_fill_super (struct super_block *sb, void *data, int silent) 1798static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1799 __releases(kernel_sem)
1800 __acquires(kernel_sem)
1801
1589{ 1802{
1590 struct buffer_head * bh; 1803 struct buffer_head * bh;
1591 struct ext4_super_block *es = NULL; 1804 struct ext4_super_block *es = NULL;
@@ -1599,7 +1812,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1599 unsigned long def_mount_opts; 1812 unsigned long def_mount_opts;
1600 struct inode *root; 1813 struct inode *root;
1601 int blocksize; 1814 int blocksize;
1602 int hblock;
1603 int db_count; 1815 int db_count;
1604 int i; 1816 int i;
1605 int needs_recovery; 1817 int needs_recovery;
@@ -1624,6 +1836,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1624 goto out_fail; 1836 goto out_fail;
1625 } 1837 }
1626 1838
1839 if (!sb_set_blocksize(sb, blocksize)) {
1840 printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
1841 goto out_fail;
1842 }
1843
1627 /* 1844 /*
1628 * The ext4 superblock will not be buffer aligned for other than 1kB 1845 * The ext4 superblock will not be buffer aligned for other than 1kB
1629 * block sizes. We need to calculate the offset from buffer start. 1846 * block sizes. We need to calculate the offset from buffer start.
@@ -1674,10 +1891,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1674 1891
1675 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 1892 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
1676 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1893 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1677 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO) 1894 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
1678 set_opt(sbi->s_mount_opt, ERRORS_RO);
1679 else
1680 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1895 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1896 else
1897 set_opt(sbi->s_mount_opt, ERRORS_RO);
1681 1898
1682 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 1899 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1683 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 1900 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -1689,6 +1906,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1689 * User -o noextents to turn it off 1906 * User -o noextents to turn it off
1690 */ 1907 */
1691 set_opt(sbi->s_mount_opt, EXTENTS); 1908 set_opt(sbi->s_mount_opt, EXTENTS);
1909 /*
1910 * turn on mballoc feature by default in ext4 filesystem
1911 * User -o nomballoc to turn it off
1912 */
1913 set_opt(sbi->s_mount_opt, MBALLOC);
1692 1914
1693 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, 1915 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1694 NULL, 0)) 1916 NULL, 0))
@@ -1723,6 +1945,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1723 sb->s_id, le32_to_cpu(features)); 1945 sb->s_id, le32_to_cpu(features));
1724 goto failed_mount; 1946 goto failed_mount;
1725 } 1947 }
1948 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
1949 /*
1950 * Large file size enabled file system can only be
1951 * mount if kernel is build with CONFIG_LSF
1952 */
1953 if (sizeof(root->i_blocks) < sizeof(u64) &&
1954 !(sb->s_flags & MS_RDONLY)) {
1955 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
1956 "files cannot be mounted read-write "
1957 "without CONFIG_LSF.\n", sb->s_id);
1958 goto failed_mount;
1959 }
1960 }
1726 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 1961 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1727 1962
1728 if (blocksize < EXT4_MIN_BLOCK_SIZE || 1963 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -1733,20 +1968,16 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1733 goto failed_mount; 1968 goto failed_mount;
1734 } 1969 }
1735 1970
1736 hblock = bdev_hardsect_size(sb->s_bdev);
1737 if (sb->s_blocksize != blocksize) { 1971 if (sb->s_blocksize != blocksize) {
1738 /* 1972
1739 * Make sure the blocksize for the filesystem is larger 1973 /* Validate the filesystem blocksize */
1740 * than the hardware sectorsize for the machine. 1974 if (!sb_set_blocksize(sb, blocksize)) {
1741 */ 1975 printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
1742 if (blocksize < hblock) { 1976 blocksize);
1743 printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
1744 "device blocksize %d.\n", blocksize, hblock);
1745 goto failed_mount; 1977 goto failed_mount;
1746 } 1978 }
1747 1979
1748 brelse (bh); 1980 brelse (bh);
1749 sb_set_blocksize(sb, blocksize);
1750 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 1981 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1751 offset = do_div(logical_sb_block, blocksize); 1982 offset = do_div(logical_sb_block, blocksize);
1752 bh = sb_bread(sb, logical_sb_block); 1983 bh = sb_bread(sb, logical_sb_block);
@@ -1764,6 +1995,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1764 } 1995 }
1765 } 1996 }
1766 1997
1998 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
1767 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); 1999 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
1768 2000
1769 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 2001 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
@@ -1797,7 +2029,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1797 sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 2029 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
1798 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 2030 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1799 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 2031 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1800 if (EXT4_INODE_SIZE(sb) == 0) 2032 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
1801 goto cantfind_ext4; 2033 goto cantfind_ext4;
1802 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); 2034 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
1803 if (sbi->s_inodes_per_block == 0) 2035 if (sbi->s_inodes_per_block == 0)
@@ -1838,6 +2070,17 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1838 2070
1839 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2071 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
1840 goto cantfind_ext4; 2072 goto cantfind_ext4;
2073
2074 /* ensure blocks_count calculation below doesn't sign-extend */
2075 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2076 le32_to_cpu(es->s_first_data_block) + 1) {
2077 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2078 "first data block %u, blocks per group %lu\n",
2079 ext4_blocks_count(es),
2080 le32_to_cpu(es->s_first_data_block),
2081 EXT4_BLOCKS_PER_GROUP(sb));
2082 goto failed_mount;
2083 }
1841 blocks_count = (ext4_blocks_count(es) - 2084 blocks_count = (ext4_blocks_count(es) -
1842 le32_to_cpu(es->s_first_data_block) + 2085 le32_to_cpu(es->s_first_data_block) +
1843 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2086 EXT4_BLOCKS_PER_GROUP(sb) - 1);
@@ -1900,6 +2143,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1900 sbi->s_rsv_window_head.rsv_goal_size = 0; 2143 sbi->s_rsv_window_head.rsv_goal_size = 0;
1901 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); 2144 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
1902 2145
2146 sbi->s_stripe = ext4_get_stripe_size(sbi);
2147
1903 /* 2148 /*
1904 * set up enough so that it can read an inode 2149 * set up enough so that it can read an inode
1905 */ 2150 */
@@ -1944,6 +2189,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1944 goto failed_mount4; 2189 goto failed_mount4;
1945 } 2190 }
1946 2191
2192 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2193 jbd2_journal_set_features(sbi->s_journal,
2194 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2195 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2196 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2197 jbd2_journal_set_features(sbi->s_journal,
2198 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2199 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2200 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2201 } else {
2202 jbd2_journal_clear_features(sbi->s_journal,
2203 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2204 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2205 }
2206
1947 /* We have now updated the journal if required, so we can 2207 /* We have now updated the journal if required, so we can
1948 * validate the data journaling mode. */ 2208 * validate the data journaling mode. */
1949 switch (test_opt(sb, DATA_FLAGS)) { 2209 switch (test_opt(sb, DATA_FLAGS)) {
@@ -2044,6 +2304,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2044 "writeback"); 2304 "writeback");
2045 2305
2046 ext4_ext_init(sb); 2306 ext4_ext_init(sb);
2307 ext4_mb_init(sb, needs_recovery);
2047 2308
2048 lock_kernel(); 2309 lock_kernel();
2049 return 0; 2310 return 0;
@@ -2673,7 +2934,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2673 if (test_opt(sb, MINIX_DF)) { 2934 if (test_opt(sb, MINIX_DF)) {
2674 sbi->s_overhead_last = 0; 2935 sbi->s_overhead_last = 0;
2675 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 2936 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
2676 unsigned long ngroups = sbi->s_groups_count, i; 2937 ext4_group_t ngroups = sbi->s_groups_count, i;
2677 ext4_fsblk_t overhead = 0; 2938 ext4_fsblk_t overhead = 0;
2678 smp_rmb(); 2939 smp_rmb();
2679 2940
@@ -2909,7 +3170,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
2909 size_t len, loff_t off) 3170 size_t len, loff_t off)
2910{ 3171{
2911 struct inode *inode = sb_dqopt(sb)->files[type]; 3172 struct inode *inode = sb_dqopt(sb)->files[type];
2912 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 3173 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2913 int err = 0; 3174 int err = 0;
2914 int offset = off & (sb->s_blocksize - 1); 3175 int offset = off & (sb->s_blocksize - 1);
2915 int tocopy; 3176 int tocopy;
@@ -2947,7 +3208,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
2947 const char *data, size_t len, loff_t off) 3208 const char *data, size_t len, loff_t off)
2948{ 3209{
2949 struct inode *inode = sb_dqopt(sb)->files[type]; 3210 struct inode *inode = sb_dqopt(sb)->files[type];
2950 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 3211 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2951 int err = 0; 3212 int err = 0;
2952 int offset = off & (sb->s_blocksize - 1); 3213 int offset = off & (sb->s_blocksize - 1);
2953 int tocopy; 3214 int tocopy;
@@ -3002,7 +3263,6 @@ out:
3002 i_size_write(inode, off+len-towrite); 3263 i_size_write(inode, off+len-towrite);
3003 EXT4_I(inode)->i_disksize = inode->i_size; 3264 EXT4_I(inode)->i_disksize = inode->i_size;
3004 } 3265 }
3005 inode->i_version++;
3006 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3266 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3007 ext4_mark_inode_dirty(handle, inode); 3267 ext4_mark_inode_dirty(handle, inode);
3008 mutex_unlock(&inode->i_mutex); 3268 mutex_unlock(&inode->i_mutex);
@@ -3027,9 +3287,15 @@ static struct file_system_type ext4dev_fs_type = {
3027 3287
3028static int __init init_ext4_fs(void) 3288static int __init init_ext4_fs(void)
3029{ 3289{
3030 int err = init_ext4_xattr(); 3290 int err;
3291
3292 err = init_ext4_mballoc();
3031 if (err) 3293 if (err)
3032 return err; 3294 return err;
3295
3296 err = init_ext4_xattr();
3297 if (err)
3298 goto out2;
3033 err = init_inodecache(); 3299 err = init_inodecache();
3034 if (err) 3300 if (err)
3035 goto out1; 3301 goto out1;
@@ -3041,6 +3307,8 @@ out:
3041 destroy_inodecache(); 3307 destroy_inodecache();
3042out1: 3308out1:
3043 exit_ext4_xattr(); 3309 exit_ext4_xattr();
3310out2:
3311 exit_ext4_mballoc();
3044 return err; 3312 return err;
3045} 3313}
3046 3314
@@ -3049,6 +3317,7 @@ static void __exit exit_ext4_fs(void)
3049 unregister_filesystem(&ext4dev_fs_type); 3317 unregister_filesystem(&ext4dev_fs_type);
3050 destroy_inodecache(); 3318 destroy_inodecache();
3051 exit_ext4_xattr(); 3319 exit_ext4_xattr();
3320 exit_ext4_mballoc();
3052} 3321}
3053 3322
3054MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3323MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86387302c2a9..d7962139c010 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
480 ea_bdebug(bh, "refcount now=0; freeing"); 480 ea_bdebug(bh, "refcount now=0; freeing");
481 if (ce) 481 if (ce)
482 mb_cache_entry_free(ce); 482 mb_cache_entry_free(ce);
483 ext4_free_blocks(handle, inode, bh->b_blocknr, 1); 483 ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
484 get_bh(bh); 484 get_bh(bh);
485 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 485 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
486 } else { 486 } else {
@@ -821,7 +821,7 @@ inserted:
821 new_bh = sb_getblk(sb, block); 821 new_bh = sb_getblk(sb, block);
822 if (!new_bh) { 822 if (!new_bh) {
823getblk_failed: 823getblk_failed:
824 ext4_free_blocks(handle, inode, block, 1); 824 ext4_free_blocks(handle, inode, block, 1, 1);
825 error = -EIO; 825 error = -EIO;
826 goto cleanup; 826 goto cleanup;
827 } 827 }
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 2c1b73fb82ae..5fb366992b73 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -590,21 +590,49 @@ error:
590 590
591EXPORT_SYMBOL_GPL(fat_free_clusters); 591EXPORT_SYMBOL_GPL(fat_free_clusters);
592 592
593/* 128kb is the whole sectors for FAT12 and FAT16 */
594#define FAT_READA_SIZE (128 * 1024)
595
596static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
597 unsigned long reada_blocks)
598{
599 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
600 sector_t blocknr;
601 int i, offset;
602
603 ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
604
605 for (i = 0; i < reada_blocks; i++)
606 sb_breadahead(sb, blocknr + i);
607}
608
593int fat_count_free_clusters(struct super_block *sb) 609int fat_count_free_clusters(struct super_block *sb)
594{ 610{
595 struct msdos_sb_info *sbi = MSDOS_SB(sb); 611 struct msdos_sb_info *sbi = MSDOS_SB(sb);
596 struct fatent_operations *ops = sbi->fatent_ops; 612 struct fatent_operations *ops = sbi->fatent_ops;
597 struct fat_entry fatent; 613 struct fat_entry fatent;
614 unsigned long reada_blocks, reada_mask, cur_block;
598 int err = 0, free; 615 int err = 0, free;
599 616
600 lock_fat(sbi); 617 lock_fat(sbi);
601 if (sbi->free_clusters != -1) 618 if (sbi->free_clusters != -1)
602 goto out; 619 goto out;
603 620
621 reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
622 reada_mask = reada_blocks - 1;
623 cur_block = 0;
624
604 free = 0; 625 free = 0;
605 fatent_init(&fatent); 626 fatent_init(&fatent);
606 fatent_set_entry(&fatent, FAT_START_ENT); 627 fatent_set_entry(&fatent, FAT_START_ENT);
607 while (fatent.entry < sbi->max_cluster) { 628 while (fatent.entry < sbi->max_cluster) {
629 /* readahead of fat blocks */
630 if ((cur_block & reada_mask) == 0) {
631 unsigned long rest = sbi->fat_length - cur_block;
632 fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
633 }
634 cur_block++;
635
608 err = fat_ent_read_block(sb, &fatent); 636 err = fat_ent_read_block(sb, &fatent);
609 if (err) 637 if (err)
610 goto out; 638 goto out;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0fca82021d76..300324bd563c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -482,8 +482,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
482 if (wbc->nr_to_write <= 0) 482 if (wbc->nr_to_write <= 0)
483 break; 483 break;
484 } 484 }
485 if (!list_empty(&sb->s_more_io))
486 wbc->more_io = 1;
487 return; /* Leave any unwritten inodes on s_io */ 485 return; /* Leave any unwritten inodes on s_io */
488} 486}
489 487
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 84f9f7dfdf5b..e5e80d1a4687 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -744,9 +744,6 @@ static inline void unregister_fuseblk(void)
744} 744}
745#endif 745#endif
746 746
747static decl_subsys(fuse, NULL, NULL);
748static decl_subsys(connections, NULL, NULL);
749
750static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) 747static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
751{ 748{
752 struct inode * inode = foo; 749 struct inode * inode = foo;
@@ -791,32 +788,37 @@ static void fuse_fs_cleanup(void)
791 kmem_cache_destroy(fuse_inode_cachep); 788 kmem_cache_destroy(fuse_inode_cachep);
792} 789}
793 790
791static struct kobject *fuse_kobj;
792static struct kobject *connections_kobj;
793
794static int fuse_sysfs_init(void) 794static int fuse_sysfs_init(void)
795{ 795{
796 int err; 796 int err;
797 797
798 kobj_set_kset_s(&fuse_subsys, fs_subsys); 798 fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
799 err = subsystem_register(&fuse_subsys); 799 if (!fuse_kobj) {
800 if (err) 800 err = -ENOMEM;
801 goto out_err; 801 goto out_err;
802 }
802 803
803 kobj_set_kset_s(&connections_subsys, fuse_subsys); 804 connections_kobj = kobject_create_and_add("connections", fuse_kobj);
804 err = subsystem_register(&connections_subsys); 805 if (!connections_kobj) {
805 if (err) 806 err = -ENOMEM;
806 goto out_fuse_unregister; 807 goto out_fuse_unregister;
808 }
807 809
808 return 0; 810 return 0;
809 811
810 out_fuse_unregister: 812 out_fuse_unregister:
811 subsystem_unregister(&fuse_subsys); 813 kobject_put(fuse_kobj);
812 out_err: 814 out_err:
813 return err; 815 return err;
814} 816}
815 817
816static void fuse_sysfs_cleanup(void) 818static void fuse_sysfs_cleanup(void)
817{ 819{
818 subsystem_unregister(&connections_subsys); 820 kobject_put(connections_kobj);
819 subsystem_unregister(&fuse_subsys); 821 kobject_put(fuse_kobj);
820} 822}
821 823
822static int __init fuse_init(void) 824static int __init fuse_init(void)
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 04ad0caebedb..8fff11058cee 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
7 7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ 8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93fa427bb5f5..e4effc47abfc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -59,7 +59,6 @@ struct strip_mine {
59static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, 59static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
60 u64 block, struct page *page) 60 u64 block, struct page *page)
61{ 61{
62 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
63 struct inode *inode = &ip->i_inode; 62 struct inode *inode = &ip->i_inode;
64 struct buffer_head *bh; 63 struct buffer_head *bh;
65 int release = 0; 64 int release = 0;
@@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
95 set_buffer_uptodate(bh); 94 set_buffer_uptodate(bh);
96 if (!gfs2_is_jdata(ip)) 95 if (!gfs2_is_jdata(ip))
97 mark_buffer_dirty(bh); 96 mark_buffer_dirty(bh);
98 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 97 if (!gfs2_is_writeback(ip))
99 gfs2_trans_add_bh(ip->i_gl, bh, 0); 98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
100 99
101 if (release) { 100 if (release) {
@@ -453,8 +452,8 @@ static inline void bmap_unlock(struct inode *inode, int create)
453 * Returns: errno 452 * Returns: errno
454 */ 453 */
455 454
456int gfs2_block_map(struct inode *inode, u64 lblock, int create, 455int gfs2_block_map(struct inode *inode, sector_t lblock,
457 struct buffer_head *bh_map) 456 struct buffer_head *bh_map, int create)
458{ 457{
459 struct gfs2_inode *ip = GFS2_I(inode); 458 struct gfs2_inode *ip = GFS2_I(inode);
460 struct gfs2_sbd *sdp = GFS2_SB(inode); 459 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -470,6 +469,7 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
470 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 469 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
471 struct metapath mp; 470 struct metapath mp;
472 u64 size; 471 u64 size;
472 struct buffer_head *dibh = NULL;
473 473
474 BUG_ON(maxlen == 0); 474 BUG_ON(maxlen == 0);
475 475
@@ -500,6 +500,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
500 error = gfs2_meta_inode_buffer(ip, &bh); 500 error = gfs2_meta_inode_buffer(ip, &bh);
501 if (error) 501 if (error)
502 goto out_fail; 502 goto out_fail;
503 dibh = bh;
504 get_bh(dibh);
503 505
504 for (x = 0; x < end_of_metadata; x++) { 506 for (x = 0; x < end_of_metadata; x++) {
505 lookup_block(ip, bh, x, &mp, create, &new, &dblock); 507 lookup_block(ip, bh, x, &mp, create, &new, &dblock);
@@ -518,13 +520,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
518 if (boundary) 520 if (boundary)
519 set_buffer_boundary(bh_map); 521 set_buffer_boundary(bh_map);
520 if (new) { 522 if (new) {
521 struct buffer_head *dibh; 523 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
522 error = gfs2_meta_inode_buffer(ip, &dibh); 524 gfs2_dinode_out(ip, dibh->b_data);
523 if (!error) {
524 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
525 gfs2_dinode_out(ip, dibh->b_data);
526 brelse(dibh);
527 }
528 set_buffer_new(bh_map); 525 set_buffer_new(bh_map);
529 goto out_brelse; 526 goto out_brelse;
530 } 527 }
@@ -545,6 +542,8 @@ out_brelse:
545out_ok: 542out_ok:
546 error = 0; 543 error = 0;
547out_fail: 544out_fail:
545 if (dibh)
546 brelse(dibh);
548 bmap_unlock(inode, create); 547 bmap_unlock(inode, create);
549 return error; 548 return error;
550} 549}
@@ -560,7 +559,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
560 BUG_ON(!new); 559 BUG_ON(!new);
561 560
562 bh.b_size = 1 << (inode->i_blkbits + 5); 561 bh.b_size = 1 << (inode->i_blkbits + 5);
563 ret = gfs2_block_map(inode, lblock, create, &bh); 562 ret = gfs2_block_map(inode, lblock, &bh, create);
564 *extlen = bh.b_size >> inode->i_blkbits; 563 *extlen = bh.b_size >> inode->i_blkbits;
565 *dblock = bh.b_blocknr; 564 *dblock = bh.b_blocknr;
566 if (buffer_new(&bh)) 565 if (buffer_new(&bh))
@@ -684,7 +683,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
684 if (metadata) 683 if (metadata)
685 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 684 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
686 685
687 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh); 686 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
688 if (error) 687 if (error)
689 return error; 688 return error;
690 689
@@ -786,7 +785,7 @@ out_rg_gunlock:
786out_rlist: 785out_rlist:
787 gfs2_rlist_free(&rlist); 786 gfs2_rlist_free(&rlist);
788out: 787out:
789 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh); 788 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
790 return error; 789 return error;
791} 790}
792 791
@@ -879,7 +878,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
879{ 878{
880 struct inode *inode = mapping->host; 879 struct inode *inode = mapping->host;
881 struct gfs2_inode *ip = GFS2_I(inode); 880 struct gfs2_inode *ip = GFS2_I(inode);
882 struct gfs2_sbd *sdp = GFS2_SB(inode);
883 loff_t from = inode->i_size; 881 loff_t from = inode->i_size;
884 unsigned long index = from >> PAGE_CACHE_SHIFT; 882 unsigned long index = from >> PAGE_CACHE_SHIFT;
885 unsigned offset = from & (PAGE_CACHE_SIZE-1); 883 unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -911,7 +909,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
911 err = 0; 909 err = 0;
912 910
913 if (!buffer_mapped(bh)) { 911 if (!buffer_mapped(bh)) {
914 gfs2_get_block(inode, iblock, bh, 0); 912 gfs2_block_map(inode, iblock, bh, 0);
915 /* unmapped? It's a hole - nothing to do */ 913 /* unmapped? It's a hole - nothing to do */
916 if (!buffer_mapped(bh)) 914 if (!buffer_mapped(bh))
917 goto unlock; 915 goto unlock;
@@ -931,7 +929,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
931 err = 0; 929 err = 0;
932 } 930 }
933 931
934 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 932 if (!gfs2_is_writeback(ip))
935 gfs2_trans_add_bh(ip->i_gl, bh, 0); 933 gfs2_trans_add_bh(ip->i_gl, bh, 0);
936 934
937 zero_user_page(page, offset, length, KM_USER0); 935 zero_user_page(page, offset, length, KM_USER0);
@@ -1224,8 +1222,13 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1224 do_div(lblock_stop, bsize); 1222 do_div(lblock_stop, bsize);
1225 } else { 1223 } else {
1226 unsigned int shift = sdp->sd_sb.sb_bsize_shift; 1224 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1225 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1227 lblock = offset >> shift; 1226 lblock = offset >> shift;
1228 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1227 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1228 if (lblock_stop > end_of_file) {
1229 *alloc_required = 1;
1230 return 0;
1231 }
1229 } 1232 }
1230 1233
1231 for (; lblock < lblock_stop; lblock += extlen) { 1234 for (; lblock < lblock_stop; lblock += extlen) {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index ac2fd04370dc..4e6cde2943bd 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -15,7 +15,7 @@ struct gfs2_inode;
15struct page; 15struct page;
16 16
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh); 18int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
20 20
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 21int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3731ab0771d5..e51991947d2c 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -83,56 +83,6 @@ int gfs2_recoverd(void *data)
83} 83}
84 84
85/** 85/**
86 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
87 * @sdp: Pointer to GFS2 superblock
88 *
89 * Also, periodically check to make sure that we're using the most recent
90 * journal index.
91 */
92
93int gfs2_logd(void *data)
94{
95 struct gfs2_sbd *sdp = data;
96 struct gfs2_holder ji_gh;
97 unsigned long t;
98 int need_flush;
99
100 while (!kthread_should_stop()) {
101 /* Advance the log tail */
102
103 t = sdp->sd_log_flush_time +
104 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
105
106 gfs2_ail1_empty(sdp, DIO_ALL);
107 gfs2_log_lock(sdp);
108 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
109 gfs2_log_unlock(sdp);
110 if (need_flush || time_after_eq(jiffies, t)) {
111 gfs2_log_flush(sdp, NULL);
112 sdp->sd_log_flush_time = jiffies;
113 }
114
115 /* Check for latest journal index */
116
117 t = sdp->sd_jindex_refresh_time +
118 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
119
120 if (time_after_eq(jiffies, t)) {
121 if (!gfs2_jindex_hold(sdp, &ji_gh))
122 gfs2_glock_dq_uninit(&ji_gh);
123 sdp->sd_jindex_refresh_time = jiffies;
124 }
125
126 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
127 if (freezing(current))
128 refrigerator();
129 schedule_timeout_interruptible(t);
130 }
131
132 return 0;
133}
134
135/**
136 * gfs2_quotad - Write cached quota changes into the quota file 86 * gfs2_quotad - Write cached quota changes into the quota file
137 * @sdp: Pointer to GFS2 superblock 87 * @sdp: Pointer to GFS2 superblock
138 * 88 *
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 0de9b3557955..4be084fb6a62 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -12,7 +12,6 @@
12 12
13int gfs2_glockd(void *data); 13int gfs2_glockd(void *data);
14int gfs2_recoverd(void *data); 14int gfs2_recoverd(void *data);
15int gfs2_logd(void *data);
16int gfs2_quotad(void *data); 15int gfs2_quotad(void *data);
17 16
18#endif /* __DAEMON_DOT_H__ */ 17#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9949bb746a52..57e2ed932adc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1876,7 +1876,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1876 if (error) 1876 if (error)
1877 goto out; 1877 goto out;
1878 1878
1879 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh); 1879 error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
1880 if (error) 1880 if (error)
1881 goto out_qs; 1881 goto out_qs;
1882 1882
@@ -1949,7 +1949,7 @@ out_rg_gunlock:
1949 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); 1949 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1950out_rlist: 1950out_rlist:
1951 gfs2_rlist_free(&rlist); 1951 gfs2_rlist_free(&rlist);
1952 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh); 1952 gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
1953out_qs: 1953out_qs:
1954 gfs2_quota_unhold(dip); 1954 gfs2_quota_unhold(dip);
1955out: 1955out:
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index aa8dbf303f6d..f114ba2b3557 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -56,46 +56,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
56 return type; 56 return type;
57} 57}
58 58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) 59static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{ 60{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && 61 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
@@ -108,8 +68,6 @@ static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) 68 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP; 69 return -EOPNOTSUPP;
110 70
111
112
113 return gfs2_ea_get_i(ip, er); 71 return gfs2_ea_get_i(ip, er);
114} 72}
115 73
@@ -170,40 +128,10 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
170 return gfs2_ea_remove_i(ip, er); 128 return gfs2_ea_remove_i(ip, er);
171} 129}
172 130
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static const struct gfs2_eattr_operations gfs2_user_eaops = { 131static const struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get, 132 .eo_get = gfs2_ea_get_i,
205 .eo_set = user_eo_set, 133 .eo_set = gfs2_ea_set_i,
206 .eo_remove = user_eo_remove, 134 .eo_remove = gfs2_ea_remove_i,
207 .eo_name = "user", 135 .eo_name = "user",
208}; 136};
209 137
@@ -215,9 +143,9 @@ const struct gfs2_eattr_operations gfs2_system_eaops = {
215}; 143};
216 144
217static const struct gfs2_eattr_operations gfs2_security_eaops = { 145static const struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get, 146 .eo_get = gfs2_ea_get_i,
219 .eo_set = security_eo_set, 147 .eo_set = gfs2_ea_set_i,
220 .eo_remove = security_eo_remove, 148 .eo_remove = gfs2_ea_remove_i,
221 .eo_name = "security", 149 .eo_name = "security",
222}; 150};
223 151
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 2a7435b5c4dc..bee99704ea10 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -1418,7 +1418,7 @@ out:
1418static int ea_dealloc_block(struct gfs2_inode *ip) 1418static int ea_dealloc_block(struct gfs2_inode *ip)
1419{ 1419{
1420 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1420 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1421 struct gfs2_alloc *al = &ip->i_alloc; 1421 struct gfs2_alloc *al = ip->i_alloc;
1422 struct gfs2_rgrpd *rgd; 1422 struct gfs2_rgrpd *rgd;
1423 struct buffer_head *dibh; 1423 struct buffer_head *dibh;
1424 int error; 1424 int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a37efe4aae6f..80e09c50590a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -217,7 +217,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
217 if (atomic_dec_and_test(&gl->gl_ref)) { 217 if (atomic_dec_and_test(&gl->gl_ref)) {
218 hlist_del(&gl->gl_list); 218 hlist_del(&gl->gl_list);
219 write_unlock(gl_lock_addr(gl->gl_hash)); 219 write_unlock(gl_lock_addr(gl->gl_hash));
220 BUG_ON(spin_is_locked(&gl->gl_spin));
221 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); 220 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
222 gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); 221 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
223 gfs2_assert(sdp, list_empty(&gl->gl_holders)); 222 gfs2_assert(sdp, list_empty(&gl->gl_holders));
@@ -346,7 +345,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
346 gl->gl_object = NULL; 345 gl->gl_object = NULL;
347 gl->gl_sbd = sdp; 346 gl->gl_sbd = sdp;
348 gl->gl_aspace = NULL; 347 gl->gl_aspace = NULL;
349 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
350 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 348 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
351 349
352 /* If this glock protects actual on-disk data or metadata blocks, 350 /* If this glock protects actual on-disk data or metadata blocks,
@@ -461,7 +459,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
461 459
462static void gfs2_demote_wake(struct gfs2_glock *gl) 460static void gfs2_demote_wake(struct gfs2_glock *gl)
463{ 461{
464 BUG_ON(!spin_is_locked(&gl->gl_spin));
465 gl->gl_demote_state = LM_ST_EXCLUSIVE; 462 gl->gl_demote_state = LM_ST_EXCLUSIVE;
466 clear_bit(GLF_DEMOTE, &gl->gl_flags); 463 clear_bit(GLF_DEMOTE, &gl->gl_flags);
467 smp_mb__after_clear_bit(); 464 smp_mb__after_clear_bit();
@@ -507,21 +504,12 @@ static int rq_mutex(struct gfs2_holder *gh)
507static int rq_promote(struct gfs2_holder *gh) 504static int rq_promote(struct gfs2_holder *gh)
508{ 505{
509 struct gfs2_glock *gl = gh->gh_gl; 506 struct gfs2_glock *gl = gh->gh_gl;
510 struct gfs2_sbd *sdp = gl->gl_sbd;
511 507
512 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { 508 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
513 if (list_empty(&gl->gl_holders)) { 509 if (list_empty(&gl->gl_holders)) {
514 gl->gl_req_gh = gh; 510 gl->gl_req_gh = gh;
515 set_bit(GLF_LOCK, &gl->gl_flags); 511 set_bit(GLF_LOCK, &gl->gl_flags);
516 spin_unlock(&gl->gl_spin); 512 spin_unlock(&gl->gl_spin);
517
518 if (atomic_read(&sdp->sd_reclaim_count) >
519 gfs2_tune_get(sdp, gt_reclaim_limit) &&
520 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
521 gfs2_reclaim_glock(sdp);
522 gfs2_reclaim_glock(sdp);
523 }
524
525 gfs2_glock_xmote_th(gh->gh_gl, gh); 513 gfs2_glock_xmote_th(gh->gh_gl, gh);
526 spin_lock(&gl->gl_spin); 514 spin_lock(&gl->gl_spin);
527 } 515 }
@@ -567,7 +555,10 @@ static int rq_demote(struct gfs2_glock *gl)
567 gfs2_demote_wake(gl); 555 gfs2_demote_wake(gl);
568 return 0; 556 return 0;
569 } 557 }
558
570 set_bit(GLF_LOCK, &gl->gl_flags); 559 set_bit(GLF_LOCK, &gl->gl_flags);
560 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
561
571 if (gl->gl_demote_state == LM_ST_UNLOCKED || 562 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
572 gl->gl_state != LM_ST_EXCLUSIVE) { 563 gl->gl_state != LM_ST_EXCLUSIVE) {
573 spin_unlock(&gl->gl_spin); 564 spin_unlock(&gl->gl_spin);
@@ -576,7 +567,9 @@ static int rq_demote(struct gfs2_glock *gl)
576 spin_unlock(&gl->gl_spin); 567 spin_unlock(&gl->gl_spin);
577 gfs2_glock_xmote_th(gl, NULL); 568 gfs2_glock_xmote_th(gl, NULL);
578 } 569 }
570
579 spin_lock(&gl->gl_spin); 571 spin_lock(&gl->gl_spin);
572 clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
580 573
581 return 0; 574 return 0;
582} 575}
@@ -598,23 +591,18 @@ static void run_queue(struct gfs2_glock *gl)
598 if (!list_empty(&gl->gl_waiters1)) { 591 if (!list_empty(&gl->gl_waiters1)) {
599 gh = list_entry(gl->gl_waiters1.next, 592 gh = list_entry(gl->gl_waiters1.next,
600 struct gfs2_holder, gh_list); 593 struct gfs2_holder, gh_list);
601 594 blocked = rq_mutex(gh);
602 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
603 blocked = rq_mutex(gh);
604 else
605 gfs2_assert_warn(gl->gl_sbd, 0);
606
607 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { 595 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
608 blocked = rq_demote(gl); 596 blocked = rq_demote(gl);
597 if (gl->gl_waiters2 && !blocked) {
598 set_bit(GLF_DEMOTE, &gl->gl_flags);
599 gl->gl_demote_state = LM_ST_UNLOCKED;
600 }
601 gl->gl_waiters2 = 0;
609 } else if (!list_empty(&gl->gl_waiters3)) { 602 } else if (!list_empty(&gl->gl_waiters3)) {
610 gh = list_entry(gl->gl_waiters3.next, 603 gh = list_entry(gl->gl_waiters3.next,
611 struct gfs2_holder, gh_list); 604 struct gfs2_holder, gh_list);
612 605 blocked = rq_promote(gh);
613 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
614 blocked = rq_promote(gh);
615 else
616 gfs2_assert_warn(gl->gl_sbd, 0);
617
618 } else 606 } else
619 break; 607 break;
620 608
@@ -632,27 +620,21 @@ static void run_queue(struct gfs2_glock *gl)
632 620
633static void gfs2_glmutex_lock(struct gfs2_glock *gl) 621static void gfs2_glmutex_lock(struct gfs2_glock *gl)
634{ 622{
635 struct gfs2_holder gh;
636
637 gfs2_holder_init(gl, 0, 0, &gh);
638 set_bit(HIF_MUTEX, &gh.gh_iflags);
639 if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
640 BUG();
641
642 spin_lock(&gl->gl_spin); 623 spin_lock(&gl->gl_spin);
643 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 624 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
625 struct gfs2_holder gh;
626
627 gfs2_holder_init(gl, 0, 0, &gh);
628 set_bit(HIF_WAIT, &gh.gh_iflags);
644 list_add_tail(&gh.gh_list, &gl->gl_waiters1); 629 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
630 spin_unlock(&gl->gl_spin);
631 wait_on_holder(&gh);
632 gfs2_holder_uninit(&gh);
645 } else { 633 } else {
646 gl->gl_owner_pid = current->pid; 634 gl->gl_owner_pid = current->pid;
647 gl->gl_ip = (unsigned long)__builtin_return_address(0); 635 gl->gl_ip = (unsigned long)__builtin_return_address(0);
648 clear_bit(HIF_WAIT, &gh.gh_iflags); 636 spin_unlock(&gl->gl_spin);
649 smp_mb();
650 wake_up_bit(&gh.gh_iflags, HIF_WAIT);
651 } 637 }
652 spin_unlock(&gl->gl_spin);
653
654 wait_on_holder(&gh);
655 gfs2_holder_uninit(&gh);
656} 638}
657 639
658/** 640/**
@@ -691,7 +673,6 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
691 gl->gl_owner_pid = 0; 673 gl->gl_owner_pid = 0;
692 gl->gl_ip = 0; 674 gl->gl_ip = 0;
693 run_queue(gl); 675 run_queue(gl);
694 BUG_ON(!spin_is_locked(&gl->gl_spin));
695 spin_unlock(&gl->gl_spin); 676 spin_unlock(&gl->gl_spin);
696} 677}
697 678
@@ -722,7 +703,10 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
722 } 703 }
723 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 704 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
724 gl->gl_demote_state != state) { 705 gl->gl_demote_state != state) {
725 gl->gl_demote_state = LM_ST_UNLOCKED; 706 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
707 gl->gl_waiters2 = 1;
708 else
709 gl->gl_demote_state = LM_ST_UNLOCKED;
726 } 710 }
727 spin_unlock(&gl->gl_spin); 711 spin_unlock(&gl->gl_spin);
728} 712}
@@ -943,8 +927,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
943 const struct gfs2_glock_operations *glops = gl->gl_ops; 927 const struct gfs2_glock_operations *glops = gl->gl_ops;
944 unsigned int ret; 928 unsigned int ret;
945 929
946 if (glops->go_drop_th) 930 if (glops->go_xmote_th)
947 glops->go_drop_th(gl); 931 glops->go_xmote_th(gl);
948 932
949 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 933 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
950 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); 934 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -1156,8 +1140,6 @@ restart:
1156 return -EIO; 1140 return -EIO;
1157 } 1141 }
1158 1142
1159 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1160
1161 spin_lock(&gl->gl_spin); 1143 spin_lock(&gl->gl_spin);
1162 add_to_queue(gh); 1144 add_to_queue(gh);
1163 run_queue(gl); 1145 run_queue(gl);
@@ -1248,12 +1230,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1248 list_del_init(&gh->gh_list); 1230 list_del_init(&gh->gh_list);
1249 1231
1250 if (list_empty(&gl->gl_holders)) { 1232 if (list_empty(&gl->gl_holders)) {
1251 spin_unlock(&gl->gl_spin); 1233 if (glops->go_unlock) {
1252 1234 spin_unlock(&gl->gl_spin);
1253 if (glops->go_unlock)
1254 glops->go_unlock(gh); 1235 glops->go_unlock(gh);
1255 1236 spin_lock(&gl->gl_spin);
1256 spin_lock(&gl->gl_spin); 1237 }
1257 gl->gl_stamp = jiffies; 1238 gl->gl_stamp = jiffies;
1258 } 1239 }
1259 1240
@@ -1910,8 +1891,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1910 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); 1891 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
1911 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); 1892 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1912 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); 1893 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1913 print_dbg(gi, " le = %s\n",
1914 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
1915 print_dbg(gi, " reclaim = %s\n", 1894 print_dbg(gi, " reclaim = %s\n",
1916 (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); 1895 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
1917 if (gl->gl_aspace) 1896 if (gl->gl_aspace)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4670dcb2a877..c663b7a0f410 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,7 +56,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 bd = list_entry(head->next, struct gfs2_bufdata, 56 bd = list_entry(head->next, struct gfs2_bufdata,
57 bd_ail_gl_list); 57 bd_ail_gl_list);
58 bh = bd->bd_bh; 58 bh = bd->bd_bh;
59 gfs2_remove_from_ail(NULL, bd); 59 gfs2_remove_from_ail(bd);
60 bd->bd_bh = NULL; 60 bd->bd_bh = NULL;
61 bh->b_private = NULL; 61 bh->b_private = NULL;
62 bd->bd_blkno = bh->b_blocknr; 62 bd->bd_blkno = bh->b_blocknr;
@@ -86,15 +86,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
86 if (!ip || !S_ISREG(inode->i_mode)) 86 if (!ip || !S_ISREG(inode->i_mode))
87 return; 87 return;
88 88
89 if (!test_bit(GIF_PAGED, &ip->i_flags))
90 return;
91
92 unmap_shared_mapping_range(inode->i_mapping, 0, 0); 89 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
93
94 if (test_bit(GIF_SW_PAGED, &ip->i_flags)) 90 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
95 set_bit(GLF_DIRTY, &gl->gl_flags); 91 set_bit(GLF_DIRTY, &gl->gl_flags);
96 92
97 clear_bit(GIF_SW_PAGED, &ip->i_flags);
98} 93}
99 94
100/** 95/**
@@ -143,44 +138,34 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
143static void inode_go_sync(struct gfs2_glock *gl) 138static void inode_go_sync(struct gfs2_glock *gl)
144{ 139{
145 struct gfs2_inode *ip = gl->gl_object; 140 struct gfs2_inode *ip = gl->gl_object;
141 struct address_space *metamapping = gl->gl_aspace->i_mapping;
142 int error;
143
144 if (gl->gl_state != LM_ST_UNLOCKED)
145 gfs2_pte_inval(gl);
146 if (gl->gl_state != LM_ST_EXCLUSIVE)
147 return;
146 148
147 if (ip && !S_ISREG(ip->i_inode.i_mode)) 149 if (ip && !S_ISREG(ip->i_inode.i_mode))
148 ip = NULL; 150 ip = NULL;
149 151
150 if (test_bit(GLF_DIRTY, &gl->gl_flags)) { 152 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
151 if (ip && !gfs2_is_jdata(ip))
152 filemap_fdatawrite(ip->i_inode.i_mapping);
153 gfs2_log_flush(gl->gl_sbd, gl); 153 gfs2_log_flush(gl->gl_sbd, gl);
154 if (ip && gfs2_is_jdata(ip)) 154 filemap_fdatawrite(metamapping);
155 filemap_fdatawrite(ip->i_inode.i_mapping);
156 gfs2_meta_sync(gl);
157 if (ip) { 155 if (ip) {
158 struct address_space *mapping = ip->i_inode.i_mapping; 156 struct address_space *mapping = ip->i_inode.i_mapping;
159 int error = filemap_fdatawait(mapping); 157 filemap_fdatawrite(mapping);
158 error = filemap_fdatawait(mapping);
160 mapping_set_error(mapping, error); 159 mapping_set_error(mapping, error);
161 } 160 }
161 error = filemap_fdatawait(metamapping);
162 mapping_set_error(metamapping, error);
162 clear_bit(GLF_DIRTY, &gl->gl_flags); 163 clear_bit(GLF_DIRTY, &gl->gl_flags);
163 gfs2_ail_empty_gl(gl); 164 gfs2_ail_empty_gl(gl);
164 } 165 }
165} 166}
166 167
167/** 168/**
168 * inode_go_xmote_th - promote/demote a glock
169 * @gl: the glock
170 * @state: the requested state
171 * @flags:
172 *
173 */
174
175static void inode_go_xmote_th(struct gfs2_glock *gl)
176{
177 if (gl->gl_state != LM_ST_UNLOCKED)
178 gfs2_pte_inval(gl);
179 if (gl->gl_state == LM_ST_EXCLUSIVE)
180 inode_go_sync(gl);
181}
182
183/**
184 * inode_go_xmote_bh - After promoting/demoting a glock 169 * inode_go_xmote_bh - After promoting/demoting a glock
185 * @gl: the glock 170 * @gl: the glock
186 * 171 *
@@ -201,22 +186,6 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
201} 186}
202 187
203/** 188/**
204 * inode_go_drop_th - unlock a glock
205 * @gl: the glock
206 *
207 * Invoked from rq_demote().
208 * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
209 * is being purged from our node's glock cache; we're dropping lock.
210 */
211
212static void inode_go_drop_th(struct gfs2_glock *gl)
213{
214 gfs2_pte_inval(gl);
215 if (gl->gl_state == LM_ST_EXCLUSIVE)
216 inode_go_sync(gl);
217}
218
219/**
220 * inode_go_inval - prepare a inode glock to be released 189 * inode_go_inval - prepare a inode glock to be released
221 * @gl: the glock 190 * @gl: the glock
222 * @flags: 191 * @flags:
@@ -234,10 +203,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
234 set_bit(GIF_INVALID, &ip->i_flags); 203 set_bit(GIF_INVALID, &ip->i_flags);
235 } 204 }
236 205
237 if (ip && S_ISREG(ip->i_inode.i_mode)) { 206 if (ip && S_ISREG(ip->i_inode.i_mode))
238 truncate_inode_pages(ip->i_inode.i_mapping, 0); 207 truncate_inode_pages(ip->i_inode.i_mapping, 0);
239 clear_bit(GIF_PAGED, &ip->i_flags);
240 }
241} 208}
242 209
243/** 210/**
@@ -294,23 +261,6 @@ static int inode_go_lock(struct gfs2_holder *gh)
294} 261}
295 262
296/** 263/**
297 * inode_go_unlock - operation done before an inode lock is unlocked by a
298 * process
299 * @gl: the glock
300 * @flags:
301 *
302 */
303
304static void inode_go_unlock(struct gfs2_holder *gh)
305{
306 struct gfs2_glock *gl = gh->gh_gl;
307 struct gfs2_inode *ip = gl->gl_object;
308
309 if (ip)
310 gfs2_meta_cache_flush(ip);
311}
312
313/**
314 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock 264 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
315 * @gl: the glock 265 * @gl: the glock
316 * 266 *
@@ -350,14 +300,14 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
350} 300}
351 301
352/** 302/**
353 * trans_go_xmote_th - promote/demote the transaction glock 303 * trans_go_sync - promote/demote the transaction glock
354 * @gl: the glock 304 * @gl: the glock
355 * @state: the requested state 305 * @state: the requested state
356 * @flags: 306 * @flags:
357 * 307 *
358 */ 308 */
359 309
360static void trans_go_xmote_th(struct gfs2_glock *gl) 310static void trans_go_sync(struct gfs2_glock *gl)
361{ 311{
362 struct gfs2_sbd *sdp = gl->gl_sbd; 312 struct gfs2_sbd *sdp = gl->gl_sbd;
363 313
@@ -384,7 +334,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
384 334
385 if (gl->gl_state != LM_ST_UNLOCKED && 335 if (gl->gl_state != LM_ST_UNLOCKED &&
386 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 336 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
387 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
388 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 337 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
389 338
390 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 339 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -402,24 +351,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
402} 351}
403 352
404/** 353/**
405 * trans_go_drop_th - unlock the transaction glock
406 * @gl: the glock
407 *
408 * We want to sync the device even with localcaching. Remember
409 * that localcaching journal replay only marks buffers dirty.
410 */
411
412static void trans_go_drop_th(struct gfs2_glock *gl)
413{
414 struct gfs2_sbd *sdp = gl->gl_sbd;
415
416 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
417 gfs2_meta_syncfs(sdp);
418 gfs2_log_shutdown(sdp);
419 }
420}
421
422/**
423 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock 354 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
424 * @gl: the glock 355 * @gl: the glock
425 * 356 *
@@ -433,25 +364,21 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
433 364
434const struct gfs2_glock_operations gfs2_meta_glops = { 365const struct gfs2_glock_operations gfs2_meta_glops = {
435 .go_xmote_th = meta_go_sync, 366 .go_xmote_th = meta_go_sync,
436 .go_drop_th = meta_go_sync,
437 .go_type = LM_TYPE_META, 367 .go_type = LM_TYPE_META,
438}; 368};
439 369
440const struct gfs2_glock_operations gfs2_inode_glops = { 370const struct gfs2_glock_operations gfs2_inode_glops = {
441 .go_xmote_th = inode_go_xmote_th, 371 .go_xmote_th = inode_go_sync,
442 .go_xmote_bh = inode_go_xmote_bh, 372 .go_xmote_bh = inode_go_xmote_bh,
443 .go_drop_th = inode_go_drop_th,
444 .go_inval = inode_go_inval, 373 .go_inval = inode_go_inval,
445 .go_demote_ok = inode_go_demote_ok, 374 .go_demote_ok = inode_go_demote_ok,
446 .go_lock = inode_go_lock, 375 .go_lock = inode_go_lock,
447 .go_unlock = inode_go_unlock,
448 .go_type = LM_TYPE_INODE, 376 .go_type = LM_TYPE_INODE,
449 .go_min_hold_time = HZ / 10, 377 .go_min_hold_time = HZ / 10,
450}; 378};
451 379
452const struct gfs2_glock_operations gfs2_rgrp_glops = { 380const struct gfs2_glock_operations gfs2_rgrp_glops = {
453 .go_xmote_th = meta_go_sync, 381 .go_xmote_th = meta_go_sync,
454 .go_drop_th = meta_go_sync,
455 .go_inval = meta_go_inval, 382 .go_inval = meta_go_inval,
456 .go_demote_ok = rgrp_go_demote_ok, 383 .go_demote_ok = rgrp_go_demote_ok,
457 .go_lock = rgrp_go_lock, 384 .go_lock = rgrp_go_lock,
@@ -461,9 +388,8 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
461}; 388};
462 389
463const struct gfs2_glock_operations gfs2_trans_glops = { 390const struct gfs2_glock_operations gfs2_trans_glops = {
464 .go_xmote_th = trans_go_xmote_th, 391 .go_xmote_th = trans_go_sync,
465 .go_xmote_bh = trans_go_xmote_bh, 392 .go_xmote_bh = trans_go_xmote_bh,
466 .go_drop_th = trans_go_drop_th,
467 .go_type = LM_TYPE_NONDISK, 393 .go_type = LM_TYPE_NONDISK,
468}; 394};
469 395
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eaddfb5a8e6f..513aaf0dc0ab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -131,7 +131,6 @@ struct gfs2_bufdata {
131struct gfs2_glock_operations { 131struct gfs2_glock_operations {
132 void (*go_xmote_th) (struct gfs2_glock *gl); 132 void (*go_xmote_th) (struct gfs2_glock *gl);
133 void (*go_xmote_bh) (struct gfs2_glock *gl); 133 void (*go_xmote_bh) (struct gfs2_glock *gl);
134 void (*go_drop_th) (struct gfs2_glock *gl);
135 void (*go_inval) (struct gfs2_glock *gl, int flags); 134 void (*go_inval) (struct gfs2_glock *gl, int flags);
136 int (*go_demote_ok) (struct gfs2_glock *gl); 135 int (*go_demote_ok) (struct gfs2_glock *gl);
137 int (*go_lock) (struct gfs2_holder *gh); 136 int (*go_lock) (struct gfs2_holder *gh);
@@ -141,10 +140,6 @@ struct gfs2_glock_operations {
141}; 140};
142 141
143enum { 142enum {
144 /* Actions */
145 HIF_MUTEX = 0,
146 HIF_PROMOTE = 1,
147
148 /* States */ 143 /* States */
149 HIF_HOLDER = 6, 144 HIF_HOLDER = 6,
150 HIF_FIRST = 7, 145 HIF_FIRST = 7,
@@ -171,6 +166,8 @@ enum {
171 GLF_DEMOTE = 3, 166 GLF_DEMOTE = 3,
172 GLF_PENDING_DEMOTE = 4, 167 GLF_PENDING_DEMOTE = 4,
173 GLF_DIRTY = 5, 168 GLF_DIRTY = 5,
169 GLF_DEMOTE_IN_PROGRESS = 6,
170 GLF_LFLUSH = 7,
174}; 171};
175 172
176struct gfs2_glock { 173struct gfs2_glock {
@@ -190,6 +187,7 @@ struct gfs2_glock {
190 struct list_head gl_holders; 187 struct list_head gl_holders;
191 struct list_head gl_waiters1; /* HIF_MUTEX */ 188 struct list_head gl_waiters1; /* HIF_MUTEX */
192 struct list_head gl_waiters3; /* HIF_PROMOTE */ 189 struct list_head gl_waiters3; /* HIF_PROMOTE */
190 int gl_waiters2; /* GIF_DEMOTE */
193 191
194 const struct gfs2_glock_operations *gl_ops; 192 const struct gfs2_glock_operations *gl_ops;
195 193
@@ -210,7 +208,6 @@ struct gfs2_glock {
210 struct gfs2_sbd *gl_sbd; 208 struct gfs2_sbd *gl_sbd;
211 209
212 struct inode *gl_aspace; 210 struct inode *gl_aspace;
213 struct gfs2_log_element gl_le;
214 struct list_head gl_ail_list; 211 struct list_head gl_ail_list;
215 atomic_t gl_ail_count; 212 atomic_t gl_ail_count;
216 struct delayed_work gl_work; 213 struct delayed_work gl_work;
@@ -239,7 +236,6 @@ struct gfs2_alloc {
239enum { 236enum {
240 GIF_INVALID = 0, 237 GIF_INVALID = 0,
241 GIF_QD_LOCKED = 1, 238 GIF_QD_LOCKED = 1,
242 GIF_PAGED = 2,
243 GIF_SW_PAGED = 3, 239 GIF_SW_PAGED = 3,
244}; 240};
245 241
@@ -268,14 +264,10 @@ struct gfs2_inode {
268 struct gfs2_glock *i_gl; /* Move into i_gh? */ 264 struct gfs2_glock *i_gl; /* Move into i_gh? */
269 struct gfs2_holder i_iopen_gh; 265 struct gfs2_holder i_iopen_gh;
270 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 266 struct gfs2_holder i_gh; /* for prepare/commit_write only */
271 struct gfs2_alloc i_alloc; 267 struct gfs2_alloc *i_alloc;
272 u64 i_last_rg_alloc; 268 u64 i_last_rg_alloc;
273 269
274 spinlock_t i_spin;
275 struct rw_semaphore i_rw_mutex; 270 struct rw_semaphore i_rw_mutex;
276 unsigned long i_last_pfault;
277
278 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
279}; 271};
280 272
281/* 273/*
@@ -287,19 +279,12 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode)
287 return container_of(inode, struct gfs2_inode, i_inode); 279 return container_of(inode, struct gfs2_inode, i_inode);
288} 280}
289 281
290/* To be removed? */ 282static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode)
291static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
292{ 283{
293 return inode->i_sb->s_fs_info; 284 return inode->i_sb->s_fs_info;
294} 285}
295 286
296enum {
297 GFF_DID_DIRECT_ALLOC = 0,
298 GFF_EXLOCK = 1,
299};
300
301struct gfs2_file { 287struct gfs2_file {
302 unsigned long f_flags; /* GFF_... */
303 struct mutex f_fl_mutex; 288 struct mutex f_fl_mutex;
304 struct gfs2_holder f_fl_gh; 289 struct gfs2_holder f_fl_gh;
305}; 290};
@@ -373,8 +358,17 @@ struct gfs2_ail {
373 u64 ai_sync_gen; 358 u64 ai_sync_gen;
374}; 359};
375 360
361struct gfs2_journal_extent {
362 struct list_head extent_list;
363
364 unsigned int lblock; /* First logical block */
365 u64 dblock; /* First disk block */
366 u64 blocks;
367};
368
376struct gfs2_jdesc { 369struct gfs2_jdesc {
377 struct list_head jd_list; 370 struct list_head jd_list;
371 struct list_head extent_list;
378 372
379 struct inode *jd_inode; 373 struct inode *jd_inode;
380 unsigned int jd_jid; 374 unsigned int jd_jid;
@@ -421,13 +415,9 @@ struct gfs2_args {
421struct gfs2_tune { 415struct gfs2_tune {
422 spinlock_t gt_spin; 416 spinlock_t gt_spin;
423 417
424 unsigned int gt_ilimit;
425 unsigned int gt_ilimit_tries;
426 unsigned int gt_ilimit_min;
427 unsigned int gt_demote_secs; /* Cache retention for unheld glock */ 418 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
428 unsigned int gt_incore_log_blocks; 419 unsigned int gt_incore_log_blocks;
429 unsigned int gt_log_flush_secs; 420 unsigned int gt_log_flush_secs;
430 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
431 421
432 unsigned int gt_recoverd_secs; 422 unsigned int gt_recoverd_secs;
433 unsigned int gt_logd_secs; 423 unsigned int gt_logd_secs;
@@ -443,10 +433,8 @@ struct gfs2_tune {
443 unsigned int gt_new_files_jdata; 433 unsigned int gt_new_files_jdata;
444 unsigned int gt_new_files_directio; 434 unsigned int gt_new_files_directio;
445 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 435 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
446 unsigned int gt_lockdump_size;
447 unsigned int gt_stall_secs; /* Detects trouble! */ 436 unsigned int gt_stall_secs; /* Detects trouble! */
448 unsigned int gt_complain_secs; 437 unsigned int gt_complain_secs;
449 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
450 unsigned int gt_statfs_quantum; 438 unsigned int gt_statfs_quantum;
451 unsigned int gt_statfs_slow; 439 unsigned int gt_statfs_slow;
452}; 440};
@@ -539,7 +527,6 @@ struct gfs2_sbd {
539 /* StatFS stuff */ 527 /* StatFS stuff */
540 528
541 spinlock_t sd_statfs_spin; 529 spinlock_t sd_statfs_spin;
542 struct mutex sd_statfs_mutex;
543 struct gfs2_statfs_change_host sd_statfs_master; 530 struct gfs2_statfs_change_host sd_statfs_master;
544 struct gfs2_statfs_change_host sd_statfs_local; 531 struct gfs2_statfs_change_host sd_statfs_local;
545 unsigned long sd_statfs_sync_time; 532 unsigned long sd_statfs_sync_time;
@@ -602,20 +589,18 @@ struct gfs2_sbd {
602 unsigned int sd_log_commited_databuf; 589 unsigned int sd_log_commited_databuf;
603 unsigned int sd_log_commited_revoke; 590 unsigned int sd_log_commited_revoke;
604 591
605 unsigned int sd_log_num_gl;
606 unsigned int sd_log_num_buf; 592 unsigned int sd_log_num_buf;
607 unsigned int sd_log_num_revoke; 593 unsigned int sd_log_num_revoke;
608 unsigned int sd_log_num_rg; 594 unsigned int sd_log_num_rg;
609 unsigned int sd_log_num_databuf; 595 unsigned int sd_log_num_databuf;
610 596
611 struct list_head sd_log_le_gl;
612 struct list_head sd_log_le_buf; 597 struct list_head sd_log_le_buf;
613 struct list_head sd_log_le_revoke; 598 struct list_head sd_log_le_revoke;
614 struct list_head sd_log_le_rg; 599 struct list_head sd_log_le_rg;
615 struct list_head sd_log_le_databuf; 600 struct list_head sd_log_le_databuf;
616 struct list_head sd_log_le_ordered; 601 struct list_head sd_log_le_ordered;
617 602
618 unsigned int sd_log_blks_free; 603 atomic_t sd_log_blks_free;
619 struct mutex sd_log_reserve_mutex; 604 struct mutex sd_log_reserve_mutex;
620 605
621 u64 sd_log_sequence; 606 u64 sd_log_sequence;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5f6dc32946cd..728d3169e7bd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -31,7 +31,6 @@
31#include "log.h" 31#include "log.h"
32#include "meta_io.h" 32#include "meta_io.h"
33#include "ops_address.h" 33#include "ops_address.h"
34#include "ops_file.h"
35#include "ops_inode.h" 34#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
@@ -132,15 +131,21 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
132 131
133void gfs2_set_iop(struct inode *inode) 132void gfs2_set_iop(struct inode *inode)
134{ 133{
134 struct gfs2_sbd *sdp = GFS2_SB(inode);
135 umode_t mode = inode->i_mode; 135 umode_t mode = inode->i_mode;
136 136
137 if (S_ISREG(mode)) { 137 if (S_ISREG(mode)) {
138 inode->i_op = &gfs2_file_iops; 138 inode->i_op = &gfs2_file_iops;
139 inode->i_fop = &gfs2_file_fops; 139 if (sdp->sd_args.ar_localflocks)
140 inode->i_mapping->a_ops = &gfs2_file_aops; 140 inode->i_fop = &gfs2_file_fops_nolock;
141 else
142 inode->i_fop = &gfs2_file_fops;
141 } else if (S_ISDIR(mode)) { 143 } else if (S_ISDIR(mode)) {
142 inode->i_op = &gfs2_dir_iops; 144 inode->i_op = &gfs2_dir_iops;
143 inode->i_fop = &gfs2_dir_fops; 145 if (sdp->sd_args.ar_localflocks)
146 inode->i_fop = &gfs2_dir_fops_nolock;
147 else
148 inode->i_fop = &gfs2_dir_fops;
144 } else if (S_ISLNK(mode)) { 149 } else if (S_ISLNK(mode)) {
145 inode->i_op = &gfs2_symlink_iops; 150 inode->i_op = &gfs2_symlink_iops;
146 } else { 151 } else {
@@ -291,12 +296,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
291 di->di_entries = be32_to_cpu(str->di_entries); 296 di->di_entries = be32_to_cpu(str->di_entries);
292 297
293 di->di_eattr = be64_to_cpu(str->di_eattr); 298 di->di_eattr = be64_to_cpu(str->di_eattr);
294 return 0; 299 if (S_ISREG(ip->i_inode.i_mode))
295} 300 gfs2_set_aops(&ip->i_inode);
296 301
297static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh) 302 return 0;
298{
299 ip->i_cache[0] = bh;
300} 303}
301 304
302/** 305/**
@@ -366,7 +369,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
366 if (error) 369 if (error)
367 goto out_rg_gunlock; 370 goto out_rg_gunlock;
368 371
369 gfs2_trans_add_gl(ip->i_gl); 372 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
373 set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
370 374
371 gfs2_free_di(rgd, ip); 375 gfs2_free_di(rgd, ip);
372 376
@@ -707,9 +711,10 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
707 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 711 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
708 int error; 712 int error;
709 713
710 gfs2_alloc_get(dip); 714 if (gfs2_alloc_get(dip) == NULL)
715 return -ENOMEM;
711 716
712 dip->i_alloc.al_requested = RES_DINODE; 717 dip->i_alloc->al_requested = RES_DINODE;
713 error = gfs2_inplace_reserve(dip); 718 error = gfs2_inplace_reserve(dip);
714 if (error) 719 if (error)
715 goto out; 720 goto out;
@@ -855,7 +860,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
855 860
856 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); 861 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
857 if (alloc_required < 0) 862 if (alloc_required < 0)
858 goto fail; 863 goto fail_quota_locks;
859 if (alloc_required) { 864 if (alloc_required) {
860 error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); 865 error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
861 if (error) 866 if (error)
@@ -896,7 +901,7 @@ fail_end_trans:
896 gfs2_trans_end(sdp); 901 gfs2_trans_end(sdp);
897 902
898fail_ipreserv: 903fail_ipreserv:
899 if (dip->i_alloc.al_rgd) 904 if (dip->i_alloc->al_rgd)
900 gfs2_inplace_release(dip); 905 gfs2_inplace_release(dip);
901 906
902fail_quota_locks: 907fail_quota_locks:
@@ -966,7 +971,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
966 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 971 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
967 int error; 972 int error;
968 u64 generation; 973 u64 generation;
969 struct buffer_head *bh=NULL; 974 struct buffer_head *bh = NULL;
970 975
971 if (!name->len || name->len > GFS2_FNAMESIZE) 976 if (!name->len || name->len > GFS2_FNAMESIZE)
972 return ERR_PTR(-ENAMETOOLONG); 977 return ERR_PTR(-ENAMETOOLONG);
@@ -1003,8 +1008,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1003 if (IS_ERR(inode)) 1008 if (IS_ERR(inode))
1004 goto fail_gunlock2; 1009 goto fail_gunlock2;
1005 1010
1006 gfs2_inode_bh(GFS2_I(inode), bh);
1007
1008 error = gfs2_inode_refresh(GFS2_I(inode)); 1011 error = gfs2_inode_refresh(GFS2_I(inode));
1009 if (error) 1012 if (error)
1010 goto fail_gunlock2; 1013 goto fail_gunlock2;
@@ -1021,6 +1024,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1021 if (error) 1024 if (error)
1022 goto fail_gunlock2; 1025 goto fail_gunlock2;
1023 1026
1027 if (bh)
1028 brelse(bh);
1024 if (!inode) 1029 if (!inode)
1025 return ERR_PTR(-ENOMEM); 1030 return ERR_PTR(-ENOMEM);
1026 return inode; 1031 return inode;
@@ -1032,6 +1037,8 @@ fail_gunlock2:
1032fail_gunlock: 1037fail_gunlock:
1033 gfs2_glock_dq(ghs); 1038 gfs2_glock_dq(ghs);
1034fail: 1039fail:
1040 if (bh)
1041 brelse(bh);
1035 return ERR_PTR(error); 1042 return ERR_PTR(error);
1036} 1043}
1037 1044
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 351ac87ab384..d44650662615 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -20,6 +20,18 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
20 return ip->i_di.di_flags & GFS2_DIF_JDATA; 20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21} 21}
22 22
23static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
24{
25 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
26 return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
27}
28
29static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
30{
31 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
32 return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
33}
34
23static inline int gfs2_is_dir(const struct gfs2_inode *ip) 35static inline int gfs2_is_dir(const struct gfs2_inode *ip)
24{ 36{
25 return S_ISDIR(ip->i_inode.i_mode); 37 return S_ISDIR(ip->i_inode.i_mode);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 41c5b04caaba..f2efff424224 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -67,6 +67,11 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
67 memset(data, 0, 256); 67 memset(data, 0, 256);
68 strncpy(data, data_arg, 255); 68 strncpy(data, data_arg, 255);
69 69
70 if (!strlen(data)) {
71 log_error("no mount options, (u)mount helpers not installed");
72 return -EINVAL;
73 }
74
70 for (options = data; (x = strsep(&options, ":")); ) { 75 for (options = data; (x = strsep(&options, ":")); ) {
71 if (!*x) 76 if (!*x)
72 continue; 77 continue;
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1f7b038530b4..2ebd374b3143 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -89,15 +89,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
89 op->info.number = name->ln_number; 89 op->info.number = name->ln_number;
90 op->info.start = fl->fl_start; 90 op->info.start = fl->fl_start;
91 op->info.end = fl->fl_end; 91 op->info.end = fl->fl_end;
92 op->info.owner = (__u64)(long) fl->fl_owner;
93 if (fl->fl_lmops && fl->fl_lmops->fl_grant) { 92 if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
93 /* fl_owner is lockd which doesn't distinguish
94 processes on the nfs client */
95 op->info.owner = (__u64) fl->fl_pid;
94 xop->callback = fl->fl_lmops->fl_grant; 96 xop->callback = fl->fl_lmops->fl_grant;
95 locks_init_lock(&xop->flc); 97 locks_init_lock(&xop->flc);
96 locks_copy_lock(&xop->flc, fl); 98 locks_copy_lock(&xop->flc, fl);
97 xop->fl = fl; 99 xop->fl = fl;
98 xop->file = file; 100 xop->file = file;
99 } else 101 } else {
102 op->info.owner = (__u64)(long) fl->fl_owner;
100 xop->callback = NULL; 103 xop->callback = NULL;
104 }
101 105
102 send_op(op); 106 send_op(op);
103 107
@@ -203,7 +207,10 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
203 op->info.number = name->ln_number; 207 op->info.number = name->ln_number;
204 op->info.start = fl->fl_start; 208 op->info.start = fl->fl_start;
205 op->info.end = fl->fl_end; 209 op->info.end = fl->fl_end;
206 op->info.owner = (__u64)(long) fl->fl_owner; 210 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
211 op->info.owner = (__u64) fl->fl_pid;
212 else
213 op->info.owner = (__u64)(long) fl->fl_owner;
207 214
208 send_op(op); 215 send_op(op);
209 wait_event(recv_wq, (op->done != 0)); 216 wait_event(recv_wq, (op->done != 0));
@@ -242,7 +249,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
242 op->info.number = name->ln_number; 249 op->info.number = name->ln_number;
243 op->info.start = fl->fl_start; 250 op->info.start = fl->fl_start;
244 op->info.end = fl->fl_end; 251 op->info.end = fl->fl_end;
245 op->info.owner = (__u64)(long) fl->fl_owner; 252 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
253 op->info.owner = (__u64) fl->fl_pid;
254 else
255 op->info.owner = (__u64)(long) fl->fl_owner;
246 256
247 send_op(op); 257 send_op(op);
248 wait_event(recv_wq, (op->done != 0)); 258 wait_event(recv_wq, (op->done != 0));
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index ae9e6a25fe2b..a87b09839761 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -189,51 +189,39 @@ static struct kobj_type gdlm_ktype = {
189 .sysfs_ops = &gdlm_attr_ops, 189 .sysfs_ops = &gdlm_attr_ops,
190}; 190};
191 191
192static struct kset gdlm_kset = { 192static struct kset *gdlm_kset;
193 .ktype = &gdlm_ktype,
194};
195 193
196int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) 194int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
197{ 195{
198 int error; 196 int error;
199 197
200 error = kobject_set_name(&ls->kobj, "%s", "lock_module"); 198 ls->kobj.kset = gdlm_kset;
201 if (error) { 199 error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
202 log_error("can't set kobj name %d", error); 200 "lock_module");
203 return error;
204 }
205
206 ls->kobj.kset = &gdlm_kset;
207 ls->kobj.ktype = &gdlm_ktype;
208 ls->kobj.parent = fskobj;
209
210 error = kobject_register(&ls->kobj);
211 if (error) 201 if (error)
212 log_error("can't register kobj %d", error); 202 log_error("can't register kobj %d", error);
203 kobject_uevent(&ls->kobj, KOBJ_ADD);
213 204
214 return error; 205 return error;
215} 206}
216 207
217void gdlm_kobject_release(struct gdlm_ls *ls) 208void gdlm_kobject_release(struct gdlm_ls *ls)
218{ 209{
219 kobject_unregister(&ls->kobj); 210 kobject_put(&ls->kobj);
220} 211}
221 212
222int gdlm_sysfs_init(void) 213int gdlm_sysfs_init(void)
223{ 214{
224 int error; 215 gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
225 216 if (!gdlm_kset) {
226 kobject_set_name(&gdlm_kset.kobj, "lock_dlm"); 217 printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
227 kobj_set_kset_s(&gdlm_kset, kernel_subsys); 218 return -ENOMEM;
228 error = kset_register(&gdlm_kset); 219 }
229 if (error) 220 return 0;
230 printk("lock_dlm: cannot register kset %d\n", error);
231
232 return error;
233} 221}
234 222
235void gdlm_sysfs_exit(void) 223void gdlm_sysfs_exit(void)
236{ 224{
237 kset_unregister(&gdlm_kset); 225 kset_unregister(gdlm_kset);
238} 226}
239 227
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index bd938f06481d..521694fc19d6 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -273,18 +273,13 @@ static int gdlm_thread(void *data, int blist)
273 struct gdlm_ls *ls = (struct gdlm_ls *) data; 273 struct gdlm_ls *ls = (struct gdlm_ls *) data;
274 struct gdlm_lock *lp = NULL; 274 struct gdlm_lock *lp = NULL;
275 uint8_t complete, blocking, submit, drop; 275 uint8_t complete, blocking, submit, drop;
276 DECLARE_WAITQUEUE(wait, current);
277 276
278 /* Only thread1 is allowed to do blocking callbacks since gfs 277 /* Only thread1 is allowed to do blocking callbacks since gfs
279 may wait for a completion callback within a blocking cb. */ 278 may wait for a completion callback within a blocking cb. */
280 279
281 while (!kthread_should_stop()) { 280 while (!kthread_should_stop()) {
282 set_current_state(TASK_INTERRUPTIBLE); 281 wait_event_interruptible(ls->thread_wait,
283 add_wait_queue(&ls->thread_wait, &wait); 282 !no_work(ls, blist) || kthread_should_stop());
284 if (no_work(ls, blist))
285 schedule();
286 remove_wait_queue(&ls->thread_wait, &wait);
287 set_current_state(TASK_RUNNING);
288 283
289 complete = blocking = submit = drop = 0; 284 complete = blocking = submit = drop = 0;
290 285
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 7df702473252..161ab6f2058e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -16,6 +16,8 @@
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/kthread.h>
20#include <linux/freezer.h>
19 21
20#include "gfs2.h" 22#include "gfs2.h"
21#include "incore.h" 23#include "incore.h"
@@ -68,14 +70,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
68 * 70 *
69 */ 71 */
70 72
71void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd) 73void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
72{ 74{
73 bd->bd_ail = NULL; 75 bd->bd_ail = NULL;
74 list_del_init(&bd->bd_ail_st_list); 76 list_del_init(&bd->bd_ail_st_list);
75 list_del_init(&bd->bd_ail_gl_list); 77 list_del_init(&bd->bd_ail_gl_list);
76 atomic_dec(&bd->bd_gl->gl_ail_count); 78 atomic_dec(&bd->bd_gl->gl_ail_count);
77 if (mapping)
78 gfs2_meta_cache_flush(GFS2_I(mapping->host));
79 brelse(bd->bd_bh); 79 brelse(bd->bd_bh);
80} 80}
81 81
@@ -92,8 +92,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
92 struct buffer_head *bh; 92 struct buffer_head *bh;
93 int retry; 93 int retry;
94 94
95 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
96
97 do { 95 do {
98 retry = 0; 96 retry = 0;
99 97
@@ -210,7 +208,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
210 gfs2_log_unlock(sdp); 208 gfs2_log_unlock(sdp);
211} 209}
212 210
213int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 211static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214{ 212{
215 struct gfs2_ail *ai, *s; 213 struct gfs2_ail *ai, *s;
216 int ret; 214 int ret;
@@ -248,7 +246,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
248 bd = list_entry(head->prev, struct gfs2_bufdata, 246 bd = list_entry(head->prev, struct gfs2_bufdata,
249 bd_ail_st_list); 247 bd_ail_st_list);
250 gfs2_assert(sdp, bd->bd_ail == ai); 248 gfs2_assert(sdp, bd->bd_ail == ai);
251 gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd); 249 gfs2_remove_from_ail(bd);
252 } 250 }
253} 251}
254 252
@@ -303,7 +301,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
303 301
304 mutex_lock(&sdp->sd_log_reserve_mutex); 302 mutex_lock(&sdp->sd_log_reserve_mutex);
305 gfs2_log_lock(sdp); 303 gfs2_log_lock(sdp);
306 while(sdp->sd_log_blks_free <= (blks + reserved_blks)) { 304 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
307 gfs2_log_unlock(sdp); 305 gfs2_log_unlock(sdp);
308 gfs2_ail1_empty(sdp, 0); 306 gfs2_ail1_empty(sdp, 0);
309 gfs2_log_flush(sdp, NULL); 307 gfs2_log_flush(sdp, NULL);
@@ -312,7 +310,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
312 gfs2_ail1_start(sdp, 0); 310 gfs2_ail1_start(sdp, 0);
313 gfs2_log_lock(sdp); 311 gfs2_log_lock(sdp);
314 } 312 }
315 sdp->sd_log_blks_free -= blks; 313 atomic_sub(blks, &sdp->sd_log_blks_free);
316 gfs2_log_unlock(sdp); 314 gfs2_log_unlock(sdp);
317 mutex_unlock(&sdp->sd_log_reserve_mutex); 315 mutex_unlock(&sdp->sd_log_reserve_mutex);
318 316
@@ -332,27 +330,23 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
332{ 330{
333 331
334 gfs2_log_lock(sdp); 332 gfs2_log_lock(sdp);
335 sdp->sd_log_blks_free += blks; 333 atomic_add(blks, &sdp->sd_log_blks_free);
336 gfs2_assert_withdraw(sdp, 334 gfs2_assert_withdraw(sdp,
337 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); 335 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
338 gfs2_log_unlock(sdp); 336 gfs2_log_unlock(sdp);
339 up_read(&sdp->sd_log_flush_lock); 337 up_read(&sdp->sd_log_flush_lock);
340} 338}
341 339
342static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 340static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
343{ 341{
344 struct inode *inode = sdp->sd_jdesc->jd_inode; 342 struct gfs2_journal_extent *je;
345 int error; 343
346 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; 344 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
347 345 if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
348 bh_map.b_size = 1 << inode->i_blkbits; 346 return je->dblock + lbn - je->lblock;
349 error = gfs2_block_map(inode, lbn, 0, &bh_map); 347 }
350 if (error || !bh_map.b_blocknr) 348
351 printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, 349 return -1;
352 (unsigned long long)bh_map.b_blocknr, lbn);
353 gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
354
355 return bh_map.b_blocknr;
356} 350}
357 351
358/** 352/**
@@ -561,8 +555,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
561 ail2_empty(sdp, new_tail); 555 ail2_empty(sdp, new_tail);
562 556
563 gfs2_log_lock(sdp); 557 gfs2_log_lock(sdp);
564 sdp->sd_log_blks_free += dist; 558 atomic_add(dist, &sdp->sd_log_blks_free);
565 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); 559 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
566 gfs2_log_unlock(sdp); 560 gfs2_log_unlock(sdp);
567 561
568 sdp->sd_log_tail = new_tail; 562 sdp->sd_log_tail = new_tail;
@@ -652,7 +646,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
652 get_bh(bh); 646 get_bh(bh);
653 gfs2_log_unlock(sdp); 647 gfs2_log_unlock(sdp);
654 lock_buffer(bh); 648 lock_buffer(bh);
655 if (test_clear_buffer_dirty(bh)) { 649 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
656 bh->b_end_io = end_buffer_write_sync; 650 bh->b_end_io = end_buffer_write_sync;
657 submit_bh(WRITE, bh); 651 submit_bh(WRITE, bh);
658 } else { 652 } else {
@@ -694,20 +688,16 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
694 * 688 *
695 */ 689 */
696 690
697void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 691void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
698{ 692{
699 struct gfs2_ail *ai; 693 struct gfs2_ail *ai;
700 694
701 down_write(&sdp->sd_log_flush_lock); 695 down_write(&sdp->sd_log_flush_lock);
702 696
703 if (gl) { 697 /* Log might have been flushed while we waited for the flush lock */
704 gfs2_log_lock(sdp); 698 if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
705 if (list_empty(&gl->gl_le.le_list)) { 699 up_write(&sdp->sd_log_flush_lock);
706 gfs2_log_unlock(sdp); 700 return;
707 up_write(&sdp->sd_log_flush_lock);
708 return;
709 }
710 gfs2_log_unlock(sdp);
711 } 701 }
712 702
713 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); 703 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
@@ -739,7 +729,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
739 log_flush_commit(sdp); 729 log_flush_commit(sdp);
740 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 730 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
741 gfs2_log_lock(sdp); 731 gfs2_log_lock(sdp);
742 sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */ 732 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
743 gfs2_log_unlock(sdp); 733 gfs2_log_unlock(sdp);
744 log_write_header(sdp, 0, PULL); 734 log_write_header(sdp, 0, PULL);
745 } 735 }
@@ -767,7 +757,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
767static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 757static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
768{ 758{
769 unsigned int reserved; 759 unsigned int reserved;
770 unsigned int old; 760 unsigned int unused;
771 761
772 gfs2_log_lock(sdp); 762 gfs2_log_lock(sdp);
773 763
@@ -779,14 +769,11 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
779 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
780 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); 770 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
781 reserved = calc_reserved(sdp); 771 reserved = calc_reserved(sdp);
782 old = sdp->sd_log_blks_free; 772 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
783 sdp->sd_log_blks_free += tr->tr_reserved - 773 gfs2_assert_withdraw(sdp, unused >= 0);
784 (reserved - sdp->sd_log_blks_reserved); 774 atomic_add(unused, &sdp->sd_log_blks_free);
785 775 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
786 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
787 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <=
788 sdp->sd_jdesc->jd_blocks); 776 sdp->sd_jdesc->jd_blocks);
789
790 sdp->sd_log_blks_reserved = reserved; 777 sdp->sd_log_blks_reserved = reserved;
791 778
792 gfs2_log_unlock(sdp); 779 gfs2_log_unlock(sdp);
@@ -825,7 +812,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
825 down_write(&sdp->sd_log_flush_lock); 812 down_write(&sdp->sd_log_flush_lock);
826 813
827 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 814 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
828 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
829 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); 815 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
830 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 816 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
831 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); 817 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
@@ -838,7 +824,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
838 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 824 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
839 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); 825 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
840 826
841 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); 827 gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
842 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); 828 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
843 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); 829 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
844 830
@@ -866,3 +852,42 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
866 } 852 }
867} 853}
868 854
855
856/**
857 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
858 * @sdp: Pointer to GFS2 superblock
859 *
860 * Also, periodically check to make sure that we're using the most recent
861 * journal index.
862 */
863
864int gfs2_logd(void *data)
865{
866 struct gfs2_sbd *sdp = data;
867 unsigned long t;
868 int need_flush;
869
870 while (!kthread_should_stop()) {
871 /* Advance the log tail */
872
873 t = sdp->sd_log_flush_time +
874 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
875
876 gfs2_ail1_empty(sdp, DIO_ALL);
877 gfs2_log_lock(sdp);
878 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
879 gfs2_log_unlock(sdp);
880 if (need_flush || time_after_eq(jiffies, t)) {
881 gfs2_log_flush(sdp, NULL);
882 sdp->sd_log_flush_time = jiffies;
883 }
884
885 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
886 if (freezing(current))
887 refrigerator();
888 schedule_timeout_interruptible(t);
889 }
890
891 return 0;
892}
893
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index dae282400627..771152816508 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,8 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
49 unsigned int ssize); 49 unsigned int ssize);
50 50
51int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 51int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 52void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp); 53void gfs2_log_incr_head(struct gfs2_sbd *sdp);
@@ -57,11 +55,19 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp);
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 55struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 56struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real); 57 struct buffer_head *real);
60void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 58void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
59
60static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
61{
62 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
63 __gfs2_log_flush(sbd, gl);
64}
65
61void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); 66void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
62void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd); 67void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
63 68
64void gfs2_log_shutdown(struct gfs2_sbd *sdp); 69void gfs2_log_shutdown(struct gfs2_sbd *sdp);
65void gfs2_meta_syncfs(struct gfs2_sbd *sdp); 70void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
71int gfs2_logd(void *data);
66 72
67#endif /* __LOG_DOT_H__ */ 73#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c27cea761c6..fae59d69d01a 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -87,6 +87,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
87 } 87 }
88 bd->bd_ail = ai; 88 bd->bd_ail = ai;
89 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 89 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
90 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
90 gfs2_log_unlock(sdp); 91 gfs2_log_unlock(sdp);
91 unlock_buffer(bh); 92 unlock_buffer(bh);
92} 93}
@@ -124,49 +125,6 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
124 return bh; 125 return bh;
125} 126}
126 127
127static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
128{
129 struct gfs2_glock *gl;
130 struct gfs2_trans *tr = current->journal_info;
131
132 tr->tr_touched = 1;
133
134 gl = container_of(le, struct gfs2_glock, gl_le);
135 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
136 return;
137
138 if (!list_empty(&le->le_list))
139 return;
140
141 gfs2_glock_hold(gl);
142 set_bit(GLF_DIRTY, &gl->gl_flags);
143 sdp->sd_log_num_gl++;
144 list_add(&le->le_list, &sdp->sd_log_le_gl);
145}
146
147static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
148{
149 gfs2_log_lock(sdp);
150 __glock_lo_add(sdp, le);
151 gfs2_log_unlock(sdp);
152}
153
154static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
155{
156 struct list_head *head = &sdp->sd_log_le_gl;
157 struct gfs2_glock *gl;
158
159 while (!list_empty(head)) {
160 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
161 list_del_init(&gl->gl_le.le_list);
162 sdp->sd_log_num_gl--;
163
164 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
165 gfs2_glock_put(gl);
166 }
167 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
168}
169
170static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 128static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
171{ 129{
172 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 130 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
@@ -182,7 +140,8 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
182 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 140 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
183 if (!list_empty(&le->le_list)) 141 if (!list_empty(&le->le_list))
184 goto out; 142 goto out;
185 __glock_lo_add(sdp, &bd->bd_gl->gl_le); 143 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
144 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
186 gfs2_meta_check(sdp, bd->bd_bh); 145 gfs2_meta_check(sdp, bd->bd_bh);
187 gfs2_pin(sdp, bd->bd_bh); 146 gfs2_pin(sdp, bd->bd_bh);
188 sdp->sd_log_num_buf++; 147 sdp->sd_log_num_buf++;
@@ -556,17 +515,20 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
556 515
557 lock_buffer(bd->bd_bh); 516 lock_buffer(bd->bd_bh);
558 gfs2_log_lock(sdp); 517 gfs2_log_lock(sdp);
559 if (!list_empty(&bd->bd_list_tr)) 518 if (tr) {
560 goto out; 519 if (!list_empty(&bd->bd_list_tr))
561 tr->tr_touched = 1; 520 goto out;
562 if (gfs2_is_jdata(ip)) { 521 tr->tr_touched = 1;
563 tr->tr_num_buf++; 522 if (gfs2_is_jdata(ip)) {
564 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 523 tr->tr_num_buf++;
524 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
525 }
565 } 526 }
566 if (!list_empty(&le->le_list)) 527 if (!list_empty(&le->le_list))
567 goto out; 528 goto out;
568 529
569 __glock_lo_add(sdp, &bd->bd_gl->gl_le); 530 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
531 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
570 if (gfs2_is_jdata(ip)) { 532 if (gfs2_is_jdata(ip)) {
571 gfs2_pin(sdp, bd->bd_bh); 533 gfs2_pin(sdp, bd->bd_bh);
572 tr->tr_num_databuf_new++; 534 tr->tr_num_databuf_new++;
@@ -773,12 +735,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
773} 735}
774 736
775 737
776const struct gfs2_log_operations gfs2_glock_lops = {
777 .lo_add = glock_lo_add,
778 .lo_after_commit = glock_lo_after_commit,
779 .lo_name = "glock",
780};
781
782const struct gfs2_log_operations gfs2_buf_lops = { 738const struct gfs2_log_operations gfs2_buf_lops = {
783 .lo_add = buf_lo_add, 739 .lo_add = buf_lo_add,
784 .lo_incore_commit = buf_lo_incore_commit, 740 .lo_incore_commit = buf_lo_incore_commit,
@@ -816,7 +772,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
816}; 772};
817 773
818const struct gfs2_log_operations *gfs2_log_ops[] = { 774const struct gfs2_log_operations *gfs2_log_ops[] = {
819 &gfs2_glock_lops,
820 &gfs2_databuf_lops, 775 &gfs2_databuf_lops,
821 &gfs2_buf_lops, 776 &gfs2_buf_lops,
822 &gfs2_rg_lops, 777 &gfs2_rg_lops,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7ecfe0d3a491..9c7765c12d62 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -29,9 +29,8 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
29 struct gfs2_inode *ip = foo; 29 struct gfs2_inode *ip = foo;
30 30
31 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
32 spin_lock_init(&ip->i_spin);
33 init_rwsem(&ip->i_rw_mutex); 32 init_rwsem(&ip->i_rw_mutex);
34 memset(ip->i_cache, 0, sizeof(ip->i_cache)); 33 ip->i_alloc = NULL;
35} 34}
36 35
37static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) 36static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4da423985e4f..85aea27b4a86 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -50,6 +50,7 @@ static int gfs2_aspace_writepage(struct page *page,
50static const struct address_space_operations aspace_aops = { 50static const struct address_space_operations aspace_aops = {
51 .writepage = gfs2_aspace_writepage, 51 .writepage = gfs2_aspace_writepage,
52 .releasepage = gfs2_releasepage, 52 .releasepage = gfs2_releasepage,
53 .sync_page = block_sync_page,
53}; 54};
54 55
55/** 56/**
@@ -221,13 +222,14 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
221 struct buffer_head **bhp) 222 struct buffer_head **bhp)
222{ 223{
223 *bhp = getbuf(gl, blkno, CREATE); 224 *bhp = getbuf(gl, blkno, CREATE);
224 if (!buffer_uptodate(*bhp)) 225 if (!buffer_uptodate(*bhp)) {
225 ll_rw_block(READ_META, 1, bhp); 226 ll_rw_block(READ_META, 1, bhp);
226 if (flags & DIO_WAIT) { 227 if (flags & DIO_WAIT) {
227 int error = gfs2_meta_wait(gl->gl_sbd, *bhp); 228 int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
228 if (error) { 229 if (error) {
229 brelse(*bhp); 230 brelse(*bhp);
230 return error; 231 return error;
232 }
231 } 233 }
232 } 234 }
233 235
@@ -282,7 +284,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
282 return; 284 return;
283 } 285 }
284 286
285 bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), 287 bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
286 bd->bd_bh = bh; 288 bd->bd_bh = bh;
287 bd->bd_gl = gl; 289 bd->bd_gl = gl;
288 290
@@ -317,7 +319,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
317 } 319 }
318 if (bd) { 320 if (bd) {
319 if (bd->bd_ail) { 321 if (bd->bd_ail) {
320 gfs2_remove_from_ail(NULL, bd); 322 gfs2_remove_from_ail(bd);
321 bh->b_private = NULL; 323 bh->b_private = NULL;
322 bd->bd_bh = NULL; 324 bd->bd_bh = NULL;
323 bd->bd_blkno = bh->b_blocknr; 325 bd->bd_blkno = bh->b_blocknr;
@@ -358,32 +360,6 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
358} 360}
359 361
360/** 362/**
361 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
362 * @ip: The GFS2 inode
363 *
364 * This releases buffers that are in the most-recently-used array of
365 * blocks used for indirect block addressing for this inode.
366 */
367
368void gfs2_meta_cache_flush(struct gfs2_inode *ip)
369{
370 struct buffer_head **bh_slot;
371 unsigned int x;
372
373 spin_lock(&ip->i_spin);
374
375 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
376 bh_slot = &ip->i_cache[x];
377 if (*bh_slot) {
378 brelse(*bh_slot);
379 *bh_slot = NULL;
380 }
381 }
382
383 spin_unlock(&ip->i_spin);
384}
385
386/**
387 * gfs2_meta_indirect_buffer - Get a metadata buffer 363 * gfs2_meta_indirect_buffer - Get a metadata buffer
388 * @ip: The GFS2 inode 364 * @ip: The GFS2 inode
389 * @height: The level of this buf in the metadata (indir addr) tree (if any) 365 * @height: The level of this buf in the metadata (indir addr) tree (if any)
@@ -391,8 +367,6 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
391 * @new: Non-zero if we may create a new buffer 367 * @new: Non-zero if we may create a new buffer
392 * @bhp: the buffer is returned here 368 * @bhp: the buffer is returned here
393 * 369 *
394 * Try to use the gfs2_inode's MRU metadata tree cache.
395 *
396 * Returns: errno 370 * Returns: errno
397 */ 371 */
398 372
@@ -401,58 +375,25 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
401{ 375{
402 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 376 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
403 struct gfs2_glock *gl = ip->i_gl; 377 struct gfs2_glock *gl = ip->i_gl;
404 struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height; 378 struct buffer_head *bh;
405 int in_cache = 0; 379 int ret = 0;
406
407 BUG_ON(!gl);
408 BUG_ON(!sdp);
409
410 spin_lock(&ip->i_spin);
411 if (*bh_slot && (*bh_slot)->b_blocknr == num) {
412 bh = *bh_slot;
413 get_bh(bh);
414 in_cache = 1;
415 }
416 spin_unlock(&ip->i_spin);
417
418 if (!bh)
419 bh = getbuf(gl, num, CREATE);
420
421 if (!bh)
422 return -ENOBUFS;
423 380
424 if (new) { 381 if (new) {
425 if (gfs2_assert_warn(sdp, height)) 382 BUG_ON(height == 0);
426 goto err; 383 bh = gfs2_meta_new(gl, num);
427 meta_prep_new(bh);
428 gfs2_trans_add_bh(ip->i_gl, bh, 1); 384 gfs2_trans_add_bh(ip->i_gl, bh, 1);
429 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); 385 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
430 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); 386 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
431 } else { 387 } else {
432 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; 388 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
433 if (!buffer_uptodate(bh)) { 389 ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
434 ll_rw_block(READ_META, 1, &bh); 390 if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
435 if (gfs2_meta_wait(sdp, bh)) 391 brelse(bh);
436 goto err; 392 ret = -EIO;
437 } 393 }
438 if (gfs2_metatype_check(sdp, bh, mtype))
439 goto err;
440 }
441
442 if (!in_cache) {
443 spin_lock(&ip->i_spin);
444 if (*bh_slot)
445 brelse(*bh_slot);
446 *bh_slot = bh;
447 get_bh(bh);
448 spin_unlock(&ip->i_spin);
449 } 394 }
450
451 *bhp = bh; 395 *bhp = bh;
452 return 0; 396 return ret;
453err:
454 brelse(bh);
455 return -EIO;
456} 397}
457 398
458/** 399/**
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b7048222ebb4..73e3b1c76fe1 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,7 +56,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
56 56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); 57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58 58
59void gfs2_meta_cache_flush(struct gfs2_inode *ip);
60int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, 59int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
61 int new, struct buffer_head **bhp); 60 int new, struct buffer_head **bhp);
62 61
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 9679f8b9870d..38dbe99a30ed 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -20,6 +20,8 @@
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/backing-dev.h>
24#include <linux/pagevec.h>
23 25
24#include "gfs2.h" 26#include "gfs2.h"
25#include "incore.h" 27#include "incore.h"
@@ -32,7 +34,6 @@
32#include "quota.h" 34#include "quota.h"
33#include "trans.h" 35#include "trans.h"
34#include "rgrp.h" 36#include "rgrp.h"
35#include "ops_file.h"
36#include "super.h" 37#include "super.h"
37#include "util.h" 38#include "util.h"
38#include "glops.h" 39#include "glops.h"
@@ -58,22 +59,6 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
58} 59}
59 60
60/** 61/**
61 * gfs2_get_block - Fills in a buffer head with details about a block
62 * @inode: The inode
63 * @lblock: The block number to look up
64 * @bh_result: The buffer head to return the result in
65 * @create: Non-zero if we may add block to the file
66 *
67 * Returns: errno
68 */
69
70int gfs2_get_block(struct inode *inode, sector_t lblock,
71 struct buffer_head *bh_result, int create)
72{
73 return gfs2_block_map(inode, lblock, create, bh_result);
74}
75
76/**
77 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block 62 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
78 * @inode: The inode 63 * @inode: The inode
79 * @lblock: The block number to look up 64 * @lblock: The block number to look up
@@ -88,7 +73,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
88{ 73{
89 int error; 74 int error;
90 75
91 error = gfs2_block_map(inode, lblock, 0, bh_result); 76 error = gfs2_block_map(inode, lblock, bh_result, 0);
92 if (error) 77 if (error)
93 return error; 78 return error;
94 if (!buffer_mapped(bh_result)) 79 if (!buffer_mapped(bh_result))
@@ -99,20 +84,19 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
99static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, 84static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
100 struct buffer_head *bh_result, int create) 85 struct buffer_head *bh_result, int create)
101{ 86{
102 return gfs2_block_map(inode, lblock, 0, bh_result); 87 return gfs2_block_map(inode, lblock, bh_result, 0);
103} 88}
104 89
105/** 90/**
106 * gfs2_writepage - Write complete page 91 * gfs2_writepage_common - Common bits of writepage
107 * @page: Page to write 92 * @page: The page to be written
93 * @wbc: The writeback control
108 * 94 *
109 * Returns: errno 95 * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
110 *
111 * Some of this is copied from block_write_full_page() although we still
112 * call it to do most of the work.
113 */ 96 */
114 97
115static int gfs2_writepage(struct page *page, struct writeback_control *wbc) 98static int gfs2_writepage_common(struct page *page,
99 struct writeback_control *wbc)
116{ 100{
117 struct inode *inode = page->mapping->host; 101 struct inode *inode = page->mapping->host;
118 struct gfs2_inode *ip = GFS2_I(inode); 102 struct gfs2_inode *ip = GFS2_I(inode);
@@ -120,41 +104,133 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
120 loff_t i_size = i_size_read(inode); 104 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 105 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset; 106 unsigned offset;
123 int error; 107 int ret = -EIO;
124 int done_trans = 0;
125 108
126 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) { 109 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
127 unlock_page(page); 110 goto out;
128 return -EIO; 111 ret = 0;
129 }
130 if (current->journal_info) 112 if (current->journal_info)
131 goto out_ignore; 113 goto redirty;
132
133 /* Is the page fully outside i_size? (truncate in progress) */ 114 /* Is the page fully outside i_size? (truncate in progress) */
134 offset = i_size & (PAGE_CACHE_SIZE-1); 115 offset = i_size & (PAGE_CACHE_SIZE-1);
135 if (page->index > end_index || (page->index == end_index && !offset)) { 116 if (page->index > end_index || (page->index == end_index && !offset)) {
136 page->mapping->a_ops->invalidatepage(page, 0); 117 page->mapping->a_ops->invalidatepage(page, 0);
137 unlock_page(page); 118 goto out;
138 return 0; /* don't care */ 119 }
120 return 1;
121redirty:
122 redirty_page_for_writepage(wbc, page);
123out:
124 unlock_page(page);
125 return 0;
126}
127
128/**
129 * gfs2_writeback_writepage - Write page for writeback mappings
130 * @page: The page
131 * @wbc: The writeback control
132 *
133 */
134
135static int gfs2_writeback_writepage(struct page *page,
136 struct writeback_control *wbc)
137{
138 int ret;
139
140 ret = gfs2_writepage_common(page, wbc);
141 if (ret <= 0)
142 return ret;
143
144 ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
145 if (ret == -EAGAIN)
146 ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
147 return ret;
148}
149
150/**
151 * gfs2_ordered_writepage - Write page for ordered data files
152 * @page: The page to write
153 * @wbc: The writeback control
154 *
155 */
156
157static int gfs2_ordered_writepage(struct page *page,
158 struct writeback_control *wbc)
159{
160 struct inode *inode = page->mapping->host;
161 struct gfs2_inode *ip = GFS2_I(inode);
162 int ret;
163
164 ret = gfs2_writepage_common(page, wbc);
165 if (ret <= 0)
166 return ret;
167
168 if (!page_has_buffers(page)) {
169 create_empty_buffers(page, inode->i_sb->s_blocksize,
170 (1 << BH_Dirty)|(1 << BH_Uptodate));
139 } 171 }
172 gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
173 return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
174}
140 175
141 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) && 176/**
142 PageChecked(page)) { 177 * __gfs2_jdata_writepage - The core of jdata writepage
178 * @page: The page to write
179 * @wbc: The writeback control
180 *
181 * This is shared between writepage and writepages and implements the
182 * core of the writepage operation. If a transaction is required then
183 * PageChecked will have been set and the transaction will have
184 * already been started before this is called.
185 */
186
187static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
188{
189 struct inode *inode = page->mapping->host;
190 struct gfs2_inode *ip = GFS2_I(inode);
191 struct gfs2_sbd *sdp = GFS2_SB(inode);
192
193 if (PageChecked(page)) {
143 ClearPageChecked(page); 194 ClearPageChecked(page);
144 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
145 if (error)
146 goto out_ignore;
147 if (!page_has_buffers(page)) { 195 if (!page_has_buffers(page)) {
148 create_empty_buffers(page, inode->i_sb->s_blocksize, 196 create_empty_buffers(page, inode->i_sb->s_blocksize,
149 (1 << BH_Dirty)|(1 << BH_Uptodate)); 197 (1 << BH_Dirty)|(1 << BH_Uptodate));
150 } 198 }
151 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); 199 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
200 }
201 return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
202}
203
204/**
205 * gfs2_jdata_writepage - Write complete page
206 * @page: Page to write
207 *
208 * Returns: errno
209 *
210 */
211
212static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
213{
214 struct inode *inode = page->mapping->host;
215 struct gfs2_sbd *sdp = GFS2_SB(inode);
216 int error;
217 int done_trans = 0;
218
219 error = gfs2_writepage_common(page, wbc);
220 if (error <= 0)
221 return error;
222
223 if (PageChecked(page)) {
224 if (wbc->sync_mode != WB_SYNC_ALL)
225 goto out_ignore;
226 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
227 if (error)
228 goto out_ignore;
152 done_trans = 1; 229 done_trans = 1;
153 } 230 }
154 error = block_write_full_page(page, gfs2_get_block_noalloc, wbc); 231 error = __gfs2_jdata_writepage(page, wbc);
155 if (done_trans) 232 if (done_trans)
156 gfs2_trans_end(sdp); 233 gfs2_trans_end(sdp);
157 gfs2_meta_cache_flush(ip);
158 return error; 234 return error;
159 235
160out_ignore: 236out_ignore:
@@ -164,29 +240,190 @@ out_ignore:
164} 240}
165 241
166/** 242/**
167 * gfs2_writepages - Write a bunch of dirty pages back to disk 243 * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
168 * @mapping: The mapping to write 244 * @mapping: The mapping to write
169 * @wbc: Write-back control 245 * @wbc: Write-back control
170 * 246 *
171 * For journaled files and/or ordered writes this just falls back to the 247 * For the data=writeback case we can already ignore buffer heads
172 * kernel's default writepages path for now. We will probably want to change
173 * that eventually (i.e. when we look at allocate on flush).
174 *
175 * For the data=writeback case though we can already ignore buffer heads
176 * and write whole extents at once. This is a big reduction in the 248 * and write whole extents at once. This is a big reduction in the
177 * number of I/O requests we send and the bmap calls we make in this case. 249 * number of I/O requests we send and the bmap calls we make in this case.
178 */ 250 */
179static int gfs2_writepages(struct address_space *mapping, 251static int gfs2_writeback_writepages(struct address_space *mapping,
180 struct writeback_control *wbc) 252 struct writeback_control *wbc)
253{
254 return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
255}
256
257/**
258 * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
259 * @mapping: The mapping
260 * @wbc: The writeback control
261 * @writepage: The writepage function to call for each page
262 * @pvec: The vector of pages
263 * @nr_pages: The number of pages to write
264 *
265 * Returns: non-zero if loop should terminate, zero otherwise
266 */
267
268static int gfs2_write_jdata_pagevec(struct address_space *mapping,
269 struct writeback_control *wbc,
270 struct pagevec *pvec,
271 int nr_pages, pgoff_t end)
181{ 272{
182 struct inode *inode = mapping->host; 273 struct inode *inode = mapping->host;
183 struct gfs2_inode *ip = GFS2_I(inode);
184 struct gfs2_sbd *sdp = GFS2_SB(inode); 274 struct gfs2_sbd *sdp = GFS2_SB(inode);
275 loff_t i_size = i_size_read(inode);
276 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
277 unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
278 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
279 struct backing_dev_info *bdi = mapping->backing_dev_info;
280 int i;
281 int ret;
282
283 ret = gfs2_trans_begin(sdp, nrblocks, 0);
284 if (ret < 0)
285 return ret;
286
287 for(i = 0; i < nr_pages; i++) {
288 struct page *page = pvec->pages[i];
289
290 lock_page(page);
291
292 if (unlikely(page->mapping != mapping)) {
293 unlock_page(page);
294 continue;
295 }
296
297 if (!wbc->range_cyclic && page->index > end) {
298 ret = 1;
299 unlock_page(page);
300 continue;
301 }
302
303 if (wbc->sync_mode != WB_SYNC_NONE)
304 wait_on_page_writeback(page);
305
306 if (PageWriteback(page) ||
307 !clear_page_dirty_for_io(page)) {
308 unlock_page(page);
309 continue;
310 }
311
312 /* Is the page fully outside i_size? (truncate in progress) */
313 if (page->index > end_index || (page->index == end_index && !offset)) {
314 page->mapping->a_ops->invalidatepage(page, 0);
315 unlock_page(page);
316 continue;
317 }
318
319 ret = __gfs2_jdata_writepage(page, wbc);
320
321 if (ret || (--(wbc->nr_to_write) <= 0))
322 ret = 1;
323 if (wbc->nonblocking && bdi_write_congested(bdi)) {
324 wbc->encountered_congestion = 1;
325 ret = 1;
326 }
327
328 }
329 gfs2_trans_end(sdp);
330 return ret;
331}
332
333/**
334 * gfs2_write_cache_jdata - Like write_cache_pages but different
335 * @mapping: The mapping to write
336 * @wbc: The writeback control
337 * @writepage: The writepage function to call
338 * @data: The data to pass to writepage
339 *
340 * The reason that we use our own function here is that we need to
341 * start transactions before we grab page locks. This allows us
342 * to get the ordering right.
343 */
344
345static int gfs2_write_cache_jdata(struct address_space *mapping,
346 struct writeback_control *wbc)
347{
348 struct backing_dev_info *bdi = mapping->backing_dev_info;
349 int ret = 0;
350 int done = 0;
351 struct pagevec pvec;
352 int nr_pages;
353 pgoff_t index;
354 pgoff_t end;
355 int scanned = 0;
356 int range_whole = 0;
357
358 if (wbc->nonblocking && bdi_write_congested(bdi)) {
359 wbc->encountered_congestion = 1;
360 return 0;
361 }
362
363 pagevec_init(&pvec, 0);
364 if (wbc->range_cyclic) {
365 index = mapping->writeback_index; /* Start from prev offset */
366 end = -1;
367 } else {
368 index = wbc->range_start >> PAGE_CACHE_SHIFT;
369 end = wbc->range_end >> PAGE_CACHE_SHIFT;
370 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
371 range_whole = 1;
372 scanned = 1;
373 }
185 374
186 if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip)) 375retry:
187 return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); 376 while (!done && (index <= end) &&
377 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
378 PAGECACHE_TAG_DIRTY,
379 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
380 scanned = 1;
381 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
382 if (ret)
383 done = 1;
384 if (ret > 0)
385 ret = 0;
386
387 pagevec_release(&pvec);
388 cond_resched();
389 }
390
391 if (!scanned && !done) {
392 /*
393 * We hit the last page and there is more work to be done: wrap
394 * back to the start of the file
395 */
396 scanned = 1;
397 index = 0;
398 goto retry;
399 }
400
401 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
402 mapping->writeback_index = index;
403 return ret;
404}
405
406
407/**
408 * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
409 * @mapping: The mapping to write
410 * @wbc: The writeback control
411 *
412 */
188 413
189 return generic_writepages(mapping, wbc); 414static int gfs2_jdata_writepages(struct address_space *mapping,
415 struct writeback_control *wbc)
416{
417 struct gfs2_inode *ip = GFS2_I(mapping->host);
418 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
419 int ret;
420
421 ret = gfs2_write_cache_jdata(mapping, wbc);
422 if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
423 gfs2_log_flush(sdp, ip->i_gl);
424 ret = gfs2_write_cache_jdata(mapping, wbc);
425 }
426 return ret;
190} 427}
191 428
192/** 429/**
@@ -231,62 +468,107 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
231 468
232 469
233/** 470/**
234 * gfs2_readpage - readpage with locking 471 * __gfs2_readpage - readpage
235 * @file: The file to read a page for. N.B. This may be NULL if we are 472 * @file: The file to read a page for
236 * reading an internal file.
237 * @page: The page to read 473 * @page: The page to read
238 * 474 *
239 * Returns: errno 475 * This is the core of gfs2's readpage. Its used by the internal file
476 * reading code as in that case we already hold the glock. Also its
477 * called by gfs2_readpage() once the required lock has been granted.
478 *
240 */ 479 */
241 480
242static int gfs2_readpage(struct file *file, struct page *page) 481static int __gfs2_readpage(void *file, struct page *page)
243{ 482{
244 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 483 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
245 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 484 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
246 struct gfs2_file *gf = NULL;
247 struct gfs2_holder gh;
248 int error; 485 int error;
249 int do_unlock = 0;
250
251 if (likely(file != &gfs2_internal_file_sentinel)) {
252 if (file) {
253 gf = file->private_data;
254 if (test_bit(GFF_EXLOCK, &gf->f_flags))
255 /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
256 goto skip_lock;
257 }
258 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
259 do_unlock = 1;
260 error = gfs2_glock_nq_atime(&gh);
261 if (unlikely(error))
262 goto out_unlock;
263 }
264 486
265skip_lock:
266 if (gfs2_is_stuffed(ip)) { 487 if (gfs2_is_stuffed(ip)) {
267 error = stuffed_readpage(ip, page); 488 error = stuffed_readpage(ip, page);
268 unlock_page(page); 489 unlock_page(page);
269 } else 490 } else {
270 error = mpage_readpage(page, gfs2_get_block); 491 error = mpage_readpage(page, gfs2_block_map);
492 }
271 493
272 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 494 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
273 error = -EIO; 495 return -EIO;
496
497 return error;
498}
499
500/**
501 * gfs2_readpage - read a page of a file
502 * @file: The file to read
503 * @page: The page of the file
504 *
505 * This deals with the locking required. We use a trylock in order to
506 * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
507 * in the event that we are unable to get the lock.
508 */
509
510static int gfs2_readpage(struct file *file, struct page *page)
511{
512 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
513 struct gfs2_holder gh;
514 int error;
274 515
275 if (do_unlock) { 516 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
276 gfs2_glock_dq_m(1, &gh); 517 error = gfs2_glock_nq_atime(&gh);
277 gfs2_holder_uninit(&gh); 518 if (unlikely(error)) {
519 unlock_page(page);
520 goto out;
278 } 521 }
522 error = __gfs2_readpage(file, page);
523 gfs2_glock_dq(&gh);
279out: 524out:
280 return error; 525 gfs2_holder_uninit(&gh);
281out_unlock:
282 unlock_page(page);
283 if (error == GLR_TRYFAILED) { 526 if (error == GLR_TRYFAILED) {
284 error = AOP_TRUNCATED_PAGE;
285 yield(); 527 yield();
528 return AOP_TRUNCATED_PAGE;
286 } 529 }
287 if (do_unlock) 530 return error;
288 gfs2_holder_uninit(&gh); 531}
289 goto out; 532
533/**
534 * gfs2_internal_read - read an internal file
535 * @ip: The gfs2 inode
536 * @ra_state: The readahead state (or NULL for no readahead)
537 * @buf: The buffer to fill
538 * @pos: The file position
539 * @size: The amount to read
540 *
541 */
542
543int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
544 char *buf, loff_t *pos, unsigned size)
545{
546 struct address_space *mapping = ip->i_inode.i_mapping;
547 unsigned long index = *pos / PAGE_CACHE_SIZE;
548 unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
549 unsigned copied = 0;
550 unsigned amt;
551 struct page *page;
552 void *p;
553
554 do {
555 amt = size - copied;
556 if (offset + size > PAGE_CACHE_SIZE)
557 amt = PAGE_CACHE_SIZE - offset;
558 page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
559 if (IS_ERR(page))
560 return PTR_ERR(page);
561 p = kmap_atomic(page, KM_USER0);
562 memcpy(buf + copied, p + offset, amt);
563 kunmap_atomic(p, KM_USER0);
564 mark_page_accessed(page);
565 page_cache_release(page);
566 copied += amt;
567 index++;
568 offset = 0;
569 } while(copied < size);
570 (*pos) += size;
571 return size;
290} 572}
291 573
292/** 574/**
@@ -300,10 +582,9 @@ out_unlock:
300 * Any I/O we ignore at this time will be done via readpage later. 582 * Any I/O we ignore at this time will be done via readpage later.
301 * 2. We don't handle stuffed files here we let readpage do the honours. 583 * 2. We don't handle stuffed files here we let readpage do the honours.
302 * 3. mpage_readpages() does most of the heavy lifting in the common case. 584 * 3. mpage_readpages() does most of the heavy lifting in the common case.
303 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. 585 * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
304 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
305 * well as read-ahead.
306 */ 586 */
587
307static int gfs2_readpages(struct file *file, struct address_space *mapping, 588static int gfs2_readpages(struct file *file, struct address_space *mapping,
308 struct list_head *pages, unsigned nr_pages) 589 struct list_head *pages, unsigned nr_pages)
309{ 590{
@@ -311,42 +592,20 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
311 struct gfs2_inode *ip = GFS2_I(inode); 592 struct gfs2_inode *ip = GFS2_I(inode);
312 struct gfs2_sbd *sdp = GFS2_SB(inode); 593 struct gfs2_sbd *sdp = GFS2_SB(inode);
313 struct gfs2_holder gh; 594 struct gfs2_holder gh;
314 int ret = 0; 595 int ret;
315 int do_unlock = 0;
316 596
317 if (likely(file != &gfs2_internal_file_sentinel)) { 597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
318 if (file) { 598 ret = gfs2_glock_nq_atime(&gh);
319 struct gfs2_file *gf = file->private_data; 599 if (unlikely(ret))
320 if (test_bit(GFF_EXLOCK, &gf->f_flags)) 600 goto out_uninit;
321 goto skip_lock;
322 }
323 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
324 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
325 do_unlock = 1;
326 ret = gfs2_glock_nq_atime(&gh);
327 if (ret == GLR_TRYFAILED)
328 goto out_noerror;
329 if (unlikely(ret))
330 goto out_unlock;
331 }
332skip_lock:
333 if (!gfs2_is_stuffed(ip)) 601 if (!gfs2_is_stuffed(ip))
334 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); 602 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
335 603 gfs2_glock_dq(&gh);
336 if (do_unlock) { 604out_uninit:
337 gfs2_glock_dq_m(1, &gh); 605 gfs2_holder_uninit(&gh);
338 gfs2_holder_uninit(&gh);
339 }
340out:
341 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 606 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
342 ret = -EIO; 607 ret = -EIO;
343 return ret; 608 return ret;
344out_noerror:
345 ret = 0;
346out_unlock:
347 if (do_unlock)
348 gfs2_holder_uninit(&gh);
349 goto out;
350} 609}
351 610
352/** 611/**
@@ -382,20 +641,11 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
382 if (unlikely(error)) 641 if (unlikely(error))
383 goto out_uninit; 642 goto out_uninit;
384 643
385 error = -ENOMEM;
386 page = __grab_cache_page(mapping, index);
387 *pagep = page;
388 if (!page)
389 goto out_unlock;
390
391 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); 644 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
392
393 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 645 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
394 if (error) 646 if (error)
395 goto out_putpage; 647 goto out_unlock;
396
397 648
398 ip->i_alloc.al_requested = 0;
399 if (alloc_required) { 649 if (alloc_required) {
400 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
401 651
@@ -424,40 +674,47 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
424 if (error) 674 if (error)
425 goto out_trans_fail; 675 goto out_trans_fail;
426 676
677 error = -ENOMEM;
678 page = __grab_cache_page(mapping, index);
679 *pagep = page;
680 if (unlikely(!page))
681 goto out_endtrans;
682
427 if (gfs2_is_stuffed(ip)) { 683 if (gfs2_is_stuffed(ip)) {
684 error = 0;
428 if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { 685 if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
429 error = gfs2_unstuff_dinode(ip, page); 686 error = gfs2_unstuff_dinode(ip, page);
430 if (error == 0) 687 if (error == 0)
431 goto prepare_write; 688 goto prepare_write;
432 } else if (!PageUptodate(page)) 689 } else if (!PageUptodate(page)) {
433 error = stuffed_readpage(ip, page); 690 error = stuffed_readpage(ip, page);
691 }
434 goto out; 692 goto out;
435 } 693 }
436 694
437prepare_write: 695prepare_write:
438 error = block_prepare_write(page, from, to, gfs2_get_block); 696 error = block_prepare_write(page, from, to, gfs2_block_map);
439
440out: 697out:
441 if (error) { 698 if (error == 0)
442 gfs2_trans_end(sdp); 699 return 0;
700
701 page_cache_release(page);
702 if (pos + len > ip->i_inode.i_size)
703 vmtruncate(&ip->i_inode, ip->i_inode.i_size);
704out_endtrans:
705 gfs2_trans_end(sdp);
443out_trans_fail: 706out_trans_fail:
444 if (alloc_required) { 707 if (alloc_required) {
445 gfs2_inplace_release(ip); 708 gfs2_inplace_release(ip);
446out_qunlock: 709out_qunlock:
447 gfs2_quota_unlock(ip); 710 gfs2_quota_unlock(ip);
448out_alloc_put: 711out_alloc_put:
449 gfs2_alloc_put(ip); 712 gfs2_alloc_put(ip);
450 } 713 }
451out_putpage:
452 page_cache_release(page);
453 if (pos + len > ip->i_inode.i_size)
454 vmtruncate(&ip->i_inode, ip->i_inode.i_size);
455out_unlock: 714out_unlock:
456 gfs2_glock_dq_m(1, &ip->i_gh); 715 gfs2_glock_dq(&ip->i_gh);
457out_uninit: 716out_uninit:
458 gfs2_holder_uninit(&ip->i_gh); 717 gfs2_holder_uninit(&ip->i_gh);
459 }
460
461 return error; 718 return error;
462} 719}
463 720
@@ -565,7 +822,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
565 struct gfs2_inode *ip = GFS2_I(inode); 822 struct gfs2_inode *ip = GFS2_I(inode);
566 struct gfs2_sbd *sdp = GFS2_SB(inode); 823 struct gfs2_sbd *sdp = GFS2_SB(inode);
567 struct buffer_head *dibh; 824 struct buffer_head *dibh;
568 struct gfs2_alloc *al = &ip->i_alloc; 825 struct gfs2_alloc *al = ip->i_alloc;
569 struct gfs2_dinode *di; 826 struct gfs2_dinode *di;
570 unsigned int from = pos & (PAGE_CACHE_SIZE - 1); 827 unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
571 unsigned int to = from + len; 828 unsigned int to = from + len;
@@ -585,19 +842,16 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
585 if (gfs2_is_stuffed(ip)) 842 if (gfs2_is_stuffed(ip))
586 return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); 843 return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
587 844
588 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 845 if (!gfs2_is_writeback(ip))
589 gfs2_page_add_databufs(ip, page, from, to); 846 gfs2_page_add_databufs(ip, page, from, to);
590 847
591 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 848 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
592 849
593 if (likely(ret >= 0)) { 850 if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
594 copied = ret; 851 di = (struct gfs2_dinode *)dibh->b_data;
595 if ((pos + copied) > inode->i_size) { 852 ip->i_di.di_size = inode->i_size;
596 di = (struct gfs2_dinode *)dibh->b_data; 853 di->di_size = cpu_to_be64(inode->i_size);
597 ip->i_di.di_size = inode->i_size; 854 mark_inode_dirty(inode);
598 di->di_size = cpu_to_be64(inode->i_size);
599 mark_inode_dirty(inode);
600 }
601 } 855 }
602 856
603 if (inode == sdp->sd_rindex) 857 if (inode == sdp->sd_rindex)
@@ -606,7 +860,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
606 brelse(dibh); 860 brelse(dibh);
607 gfs2_trans_end(sdp); 861 gfs2_trans_end(sdp);
608failed: 862failed:
609 if (al->al_requested) { 863 if (al) {
610 gfs2_inplace_release(ip); 864 gfs2_inplace_release(ip);
611 gfs2_quota_unlock(ip); 865 gfs2_quota_unlock(ip);
612 gfs2_alloc_put(ip); 866 gfs2_alloc_put(ip);
@@ -625,11 +879,7 @@ failed:
625 879
626static int gfs2_set_page_dirty(struct page *page) 880static int gfs2_set_page_dirty(struct page *page)
627{ 881{
628 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 882 SetPageChecked(page);
629 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
630
631 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
632 SetPageChecked(page);
633 return __set_page_dirty_buffers(page); 883 return __set_page_dirty_buffers(page);
634} 884}
635 885
@@ -653,7 +903,7 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
653 return 0; 903 return 0;
654 904
655 if (!gfs2_is_stuffed(ip)) 905 if (!gfs2_is_stuffed(ip))
656 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block); 906 dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
657 907
658 gfs2_glock_dq_uninit(&i_gh); 908 gfs2_glock_dq_uninit(&i_gh);
659 909
@@ -719,13 +969,9 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
719{ 969{
720 /* 970 /*
721 * Should we return an error here? I can't see that O_DIRECT for 971 * Should we return an error here? I can't see that O_DIRECT for
722 * a journaled file makes any sense. For now we'll silently fall 972 * a stuffed file makes any sense. For now we'll silently fall
723 * back to buffered I/O, likewise we do the same for stuffed 973 * back to buffered I/O
724 * files since they are (a) small and (b) unaligned.
725 */ 974 */
726 if (gfs2_is_jdata(ip))
727 return 0;
728
729 if (gfs2_is_stuffed(ip)) 975 if (gfs2_is_stuffed(ip))
730 return 0; 976 return 0;
731 977
@@ -836,9 +1082,23 @@ cannot_release:
836 return 0; 1082 return 0;
837} 1083}
838 1084
839const struct address_space_operations gfs2_file_aops = { 1085static const struct address_space_operations gfs2_writeback_aops = {
840 .writepage = gfs2_writepage, 1086 .writepage = gfs2_writeback_writepage,
841 .writepages = gfs2_writepages, 1087 .writepages = gfs2_writeback_writepages,
1088 .readpage = gfs2_readpage,
1089 .readpages = gfs2_readpages,
1090 .sync_page = block_sync_page,
1091 .write_begin = gfs2_write_begin,
1092 .write_end = gfs2_write_end,
1093 .bmap = gfs2_bmap,
1094 .invalidatepage = gfs2_invalidatepage,
1095 .releasepage = gfs2_releasepage,
1096 .direct_IO = gfs2_direct_IO,
1097 .migratepage = buffer_migrate_page,
1098};
1099
1100static const struct address_space_operations gfs2_ordered_aops = {
1101 .writepage = gfs2_ordered_writepage,
842 .readpage = gfs2_readpage, 1102 .readpage = gfs2_readpage,
843 .readpages = gfs2_readpages, 1103 .readpages = gfs2_readpages,
844 .sync_page = block_sync_page, 1104 .sync_page = block_sync_page,
@@ -849,5 +1109,34 @@ const struct address_space_operations gfs2_file_aops = {
849 .invalidatepage = gfs2_invalidatepage, 1109 .invalidatepage = gfs2_invalidatepage,
850 .releasepage = gfs2_releasepage, 1110 .releasepage = gfs2_releasepage,
851 .direct_IO = gfs2_direct_IO, 1111 .direct_IO = gfs2_direct_IO,
1112 .migratepage = buffer_migrate_page,
852}; 1113};
853 1114
1115static const struct address_space_operations gfs2_jdata_aops = {
1116 .writepage = gfs2_jdata_writepage,
1117 .writepages = gfs2_jdata_writepages,
1118 .readpage = gfs2_readpage,
1119 .readpages = gfs2_readpages,
1120 .sync_page = block_sync_page,
1121 .write_begin = gfs2_write_begin,
1122 .write_end = gfs2_write_end,
1123 .set_page_dirty = gfs2_set_page_dirty,
1124 .bmap = gfs2_bmap,
1125 .invalidatepage = gfs2_invalidatepage,
1126 .releasepage = gfs2_releasepage,
1127};
1128
1129void gfs2_set_aops(struct inode *inode)
1130{
1131 struct gfs2_inode *ip = GFS2_I(inode);
1132
1133 if (gfs2_is_writeback(ip))
1134 inode->i_mapping->a_ops = &gfs2_writeback_aops;
1135 else if (gfs2_is_ordered(ip))
1136 inode->i_mapping->a_ops = &gfs2_ordered_aops;
1137 else if (gfs2_is_jdata(ip))
1138 inode->i_mapping->a_ops = &gfs2_jdata_aops;
1139 else
1140 BUG();
1141}
1142
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index fa1b5b3d28b9..5da21285bba4 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -14,9 +14,10 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16 16
17extern const struct address_space_operations gfs2_file_aops;
18extern int gfs2_get_block(struct inode *inode, sector_t lblock,
19 struct buffer_head *bh_result, int create);
20extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); 17extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
18extern int gfs2_internal_read(struct gfs2_inode *ip,
19 struct file_ra_state *ra_state,
20 char *buf, loff_t *pos, unsigned size);
21extern void gfs2_set_aops(struct inode *inode);
21 22
22#endif /* __OPS_ADDRESS_DOT_H__ */ 23#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bb11fd6752d3..f4842f2548cd 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -33,57 +33,12 @@
33#include "lm.h" 33#include "lm.h"
34#include "log.h" 34#include "log.h"
35#include "meta_io.h" 35#include "meta_io.h"
36#include "ops_file.h"
37#include "ops_vm.h"
38#include "quota.h" 36#include "quota.h"
39#include "rgrp.h" 37#include "rgrp.h"
40#include "trans.h" 38#include "trans.h"
41#include "util.h" 39#include "util.h"
42#include "eaops.h" 40#include "eaops.h"
43 41#include "ops_address.h"
44/*
45 * Most fields left uninitialised to catch anybody who tries to
46 * use them. f_flags set to prevent file_accessed() from touching
47 * any other part of this. Its use is purely as a flag so that we
48 * know (in readpage()) whether or not do to locking.
49 */
50struct file gfs2_internal_file_sentinel = {
51 .f_flags = O_NOATIME|O_RDONLY,
52};
53
54static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
55 unsigned long offset, unsigned long size)
56{
57 char *kaddr;
58 unsigned long count = desc->count;
59
60 if (size > count)
61 size = count;
62
63 kaddr = kmap(page);
64 memcpy(desc->arg.data, kaddr + offset, size);
65 kunmap(page);
66
67 desc->count = count - size;
68 desc->written += size;
69 desc->arg.buf += size;
70 return size;
71}
72
73int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
74 char *buf, loff_t *pos, unsigned size)
75{
76 struct inode *inode = &ip->i_inode;
77 read_descriptor_t desc;
78 desc.written = 0;
79 desc.arg.data = buf;
80 desc.count = size;
81 desc.error = 0;
82 do_generic_mapping_read(inode->i_mapping, ra_state,
83 &gfs2_internal_file_sentinel, pos, &desc,
84 gfs2_read_actor);
85 return desc.written ? desc.written : desc.error;
86}
87 42
88/** 43/**
89 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -214,7 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
214 if (put_user(fsflags, ptr)) 169 if (put_user(fsflags, ptr))
215 error = -EFAULT; 170 error = -EFAULT;
216 171
217 gfs2_glock_dq_m(1, &gh); 172 gfs2_glock_dq(&gh);
218 gfs2_holder_uninit(&gh); 173 gfs2_holder_uninit(&gh);
219 return error; 174 return error;
220} 175}
@@ -291,7 +246,16 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
291 if (error) 246 if (error)
292 goto out; 247 goto out;
293 } 248 }
294 249 if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
250 if (flags & GFS2_DIF_JDATA)
251 gfs2_log_flush(sdp, ip->i_gl);
252 error = filemap_fdatawrite(inode->i_mapping);
253 if (error)
254 goto out;
255 error = filemap_fdatawait(inode->i_mapping);
256 if (error)
257 goto out;
258 }
295 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 259 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
296 if (error) 260 if (error)
297 goto out; 261 goto out;
@@ -303,6 +267,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
303 gfs2_dinode_out(ip, bh->b_data); 267 gfs2_dinode_out(ip, bh->b_data);
304 brelse(bh); 268 brelse(bh);
305 gfs2_set_inode_flags(inode); 269 gfs2_set_inode_flags(inode);
270 gfs2_set_aops(inode);
306out_trans_end: 271out_trans_end:
307 gfs2_trans_end(sdp); 272 gfs2_trans_end(sdp);
308out: 273out:
@@ -338,6 +303,128 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
338 return -ENOTTY; 303 return -ENOTTY;
339} 304}
340 305
306/**
307 * gfs2_allocate_page_backing - Use bmap to allocate blocks
308 * @page: The (locked) page to allocate backing for
309 *
310 * We try to allocate all the blocks required for the page in
311 * one go. This might fail for various reasons, so we keep
312 * trying until all the blocks to back this page are allocated.
313 * If some of the blocks are already allocated, thats ok too.
314 */
315
316static int gfs2_allocate_page_backing(struct page *page)
317{
318 struct inode *inode = page->mapping->host;
319 struct buffer_head bh;
320 unsigned long size = PAGE_CACHE_SIZE;
321 u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
322
323 do {
324 bh.b_state = 0;
325 bh.b_size = size;
326 gfs2_block_map(inode, lblock, &bh, 1);
327 if (!buffer_mapped(&bh))
328 return -EIO;
329 size -= bh.b_size;
330 lblock += (bh.b_size >> inode->i_blkbits);
331 } while(size > 0);
332 return 0;
333}
334
335/**
336 * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
337 * @vma: The virtual memory area
338 * @page: The page which is about to become writable
339 *
340 * When the page becomes writable, we need to ensure that we have
341 * blocks allocated on disk to back that page.
342 */
343
344static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
345{
346 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
347 struct gfs2_inode *ip = GFS2_I(inode);
348 struct gfs2_sbd *sdp = GFS2_SB(inode);
349 unsigned long last_index;
350 u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
351 unsigned int data_blocks, ind_blocks, rblocks;
352 int alloc_required = 0;
353 struct gfs2_holder gh;
354 struct gfs2_alloc *al;
355 int ret;
356
357 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
358 ret = gfs2_glock_nq_atime(&gh);
359 if (ret)
360 goto out;
361
362 set_bit(GIF_SW_PAGED, &ip->i_flags);
363 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
364 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
365 if (ret || !alloc_required)
366 goto out_unlock;
367 ret = -ENOMEM;
368 al = gfs2_alloc_get(ip);
369 if (al == NULL)
370 goto out_unlock;
371
372 ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
373 if (ret)
374 goto out_alloc_put;
375 ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
376 if (ret)
377 goto out_quota_unlock;
378 al->al_requested = data_blocks + ind_blocks;
379 ret = gfs2_inplace_reserve(ip);
380 if (ret)
381 goto out_quota_unlock;
382
383 rblocks = RES_DINODE + ind_blocks;
384 if (gfs2_is_jdata(ip))
385 rblocks += data_blocks ? data_blocks : 1;
386 if (ind_blocks || data_blocks)
387 rblocks += RES_STATFS + RES_QUOTA;
388 ret = gfs2_trans_begin(sdp, rblocks, 0);
389 if (ret)
390 goto out_trans_fail;
391
392 lock_page(page);
393 ret = -EINVAL;
394 last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
395 if (page->index > last_index)
396 goto out_unlock_page;
397 ret = 0;
398 if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
399 goto out_unlock_page;
400 if (gfs2_is_stuffed(ip)) {
401 ret = gfs2_unstuff_dinode(ip, page);
402 if (ret)
403 goto out_unlock_page;
404 }
405 ret = gfs2_allocate_page_backing(page);
406
407out_unlock_page:
408 unlock_page(page);
409 gfs2_trans_end(sdp);
410out_trans_fail:
411 gfs2_inplace_release(ip);
412out_quota_unlock:
413 gfs2_quota_unlock(ip);
414out_alloc_put:
415 gfs2_alloc_put(ip);
416out_unlock:
417 gfs2_glock_dq(&gh);
418out:
419 gfs2_holder_uninit(&gh);
420 return ret;
421}
422
423static struct vm_operations_struct gfs2_vm_ops = {
424 .fault = filemap_fault,
425 .page_mkwrite = gfs2_page_mkwrite,
426};
427
341 428
342/** 429/**
343 * gfs2_mmap - 430 * gfs2_mmap -
@@ -360,14 +447,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
360 return error; 447 return error;
361 } 448 }
362 449
363 /* This is VM_MAYWRITE instead of VM_WRITE because a call 450 vma->vm_ops = &gfs2_vm_ops;
364 to mprotect() can turn on VM_WRITE later. */
365
366 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
367 (VM_MAYSHARE | VM_MAYWRITE))
368 vma->vm_ops = &gfs2_vm_ops_sharewrite;
369 else
370 vma->vm_ops = &gfs2_vm_ops_private;
371 451
372 gfs2_glock_dq_uninit(&i_gh); 452 gfs2_glock_dq_uninit(&i_gh);
373 453
@@ -538,15 +618,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
538 if (__mandatory_lock(&ip->i_inode)) 618 if (__mandatory_lock(&ip->i_inode))
539 return -ENOLCK; 619 return -ENOLCK;
540 620
541 if (sdp->sd_args.ar_localflocks) {
542 if (IS_GETLK(cmd)) {
543 posix_test_lock(file, fl);
544 return 0;
545 } else {
546 return posix_lock_file_wait(file, fl);
547 }
548 }
549
550 if (cmd == F_CANCELLK) { 621 if (cmd == F_CANCELLK) {
551 /* Hack: */ 622 /* Hack: */
552 cmd = F_SETLK; 623 cmd = F_SETLK;
@@ -632,16 +703,12 @@ static void do_unflock(struct file *file, struct file_lock *fl)
632static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 703static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
633{ 704{
634 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 705 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
635 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
636 706
637 if (!(fl->fl_flags & FL_FLOCK)) 707 if (!(fl->fl_flags & FL_FLOCK))
638 return -ENOLCK; 708 return -ENOLCK;
639 if (__mandatory_lock(&ip->i_inode)) 709 if (__mandatory_lock(&ip->i_inode))
640 return -ENOLCK; 710 return -ENOLCK;
641 711
642 if (sdp->sd_args.ar_localflocks)
643 return flock_lock_file_wait(file, fl);
644
645 if (fl->fl_type == F_UNLCK) { 712 if (fl->fl_type == F_UNLCK) {
646 do_unflock(file, fl); 713 do_unflock(file, fl);
647 return 0; 714 return 0;
@@ -678,3 +745,27 @@ const struct file_operations gfs2_dir_fops = {
678 .flock = gfs2_flock, 745 .flock = gfs2_flock,
679}; 746};
680 747
748const struct file_operations gfs2_file_fops_nolock = {
749 .llseek = gfs2_llseek,
750 .read = do_sync_read,
751 .aio_read = generic_file_aio_read,
752 .write = do_sync_write,
753 .aio_write = generic_file_aio_write,
754 .unlocked_ioctl = gfs2_ioctl,
755 .mmap = gfs2_mmap,
756 .open = gfs2_open,
757 .release = gfs2_close,
758 .fsync = gfs2_fsync,
759 .splice_read = generic_file_splice_read,
760 .splice_write = generic_file_splice_write,
761 .setlease = gfs2_setlease,
762};
763
764const struct file_operations gfs2_dir_fops_nolock = {
765 .readdir = gfs2_readdir,
766 .unlocked_ioctl = gfs2_ioctl,
767 .open = gfs2_open,
768 .release = gfs2_close,
769 .fsync = gfs2_fsync,
770};
771
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
deleted file mode 100644
index 7e5d8ec9c846..000000000000
--- a/fs/gfs2/ops_file.h
+++ /dev/null
@@ -1,24 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12
13#include <linux/fs.h>
14struct gfs2_inode;
15
16extern struct file gfs2_internal_file_sentinel;
17extern int gfs2_internal_read(struct gfs2_inode *ip,
18 struct file_ra_state *ra_state,
19 char *buf, loff_t *pos, unsigned size);
20extern void gfs2_set_inode_flags(struct inode *inode);
21extern const struct file_operations gfs2_file_fops;
22extern const struct file_operations gfs2_dir_fops;
23
24#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 17de58e83d92..43d511bba52d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -21,6 +21,7 @@
21 21
22#include "gfs2.h" 22#include "gfs2.h"
23#include "incore.h" 23#include "incore.h"
24#include "bmap.h"
24#include "daemon.h" 25#include "daemon.h"
25#include "glock.h" 26#include "glock.h"
26#include "glops.h" 27#include "glops.h"
@@ -59,7 +60,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
59 60
60 mutex_init(&sdp->sd_inum_mutex); 61 mutex_init(&sdp->sd_inum_mutex);
61 spin_lock_init(&sdp->sd_statfs_spin); 62 spin_lock_init(&sdp->sd_statfs_spin);
62 mutex_init(&sdp->sd_statfs_mutex);
63 63
64 spin_lock_init(&sdp->sd_rindex_spin); 64 spin_lock_init(&sdp->sd_rindex_spin);
65 mutex_init(&sdp->sd_rindex_mutex); 65 mutex_init(&sdp->sd_rindex_mutex);
@@ -77,7 +77,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
77 77
78 spin_lock_init(&sdp->sd_log_lock); 78 spin_lock_init(&sdp->sd_log_lock);
79 79
80 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
81 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 80 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
82 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 81 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
83 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 82 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
@@ -303,6 +302,67 @@ out:
303 return error; 302 return error;
304} 303}
305 304
305/**
306 * map_journal_extents - create a reusable "extent" mapping from all logical
307 * blocks to all physical blocks for the given journal. This will save
308 * us time when writing journal blocks. Most journals will have only one
309 * extent that maps all their logical blocks. That's because gfs2.mkfs
310 * arranges the journal blocks sequentially to maximize performance.
311 * So the extent would map the first block for the entire file length.
312 * However, gfs2_jadd can happen while file activity is happening, so
313 * those journals may not be sequential. Less likely is the case where
314 * the users created their own journals by mounting the metafs and
315 * laying it out. But it's still possible. These journals might have
316 * several extents.
317 *
318 * TODO: This should be done in bigger chunks rather than one block at a time,
319 * but since it's only done at mount time, I'm not worried about the
320 * time it takes.
321 */
322static int map_journal_extents(struct gfs2_sbd *sdp)
323{
324 struct gfs2_jdesc *jd = sdp->sd_jdesc;
325 unsigned int lb;
326 u64 db, prev_db; /* logical block, disk block, prev disk block */
327 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
328 struct gfs2_journal_extent *jext = NULL;
329 struct buffer_head bh;
330 int rc = 0;
331
332 prev_db = 0;
333
334 for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
335 bh.b_state = 0;
336 bh.b_blocknr = 0;
337 bh.b_size = 1 << ip->i_inode.i_blkbits;
338 rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
339 db = bh.b_blocknr;
340 if (rc || !db) {
341 printk(KERN_INFO "GFS2 journal mapping error %d: lb="
342 "%u db=%llu\n", rc, lb, (unsigned long long)db);
343 break;
344 }
345 if (!prev_db || db != prev_db + 1) {
346 jext = kzalloc(sizeof(struct gfs2_journal_extent),
347 GFP_KERNEL);
348 if (!jext) {
349 printk(KERN_INFO "GFS2 error: out of memory "
350 "mapping journal extents.\n");
351 rc = -ENOMEM;
352 break;
353 }
354 jext->dblock = db;
355 jext->lblock = lb;
356 jext->blocks = 1;
357 list_add_tail(&jext->extent_list, &jd->extent_list);
358 } else {
359 jext->blocks++;
360 }
361 prev_db = db;
362 }
363 return rc;
364}
365
306static int init_journal(struct gfs2_sbd *sdp, int undo) 366static int init_journal(struct gfs2_sbd *sdp, int undo)
307{ 367{
308 struct gfs2_holder ji_gh; 368 struct gfs2_holder ji_gh;
@@ -340,7 +400,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
340 400
341 if (sdp->sd_args.ar_spectator) { 401 if (sdp->sd_args.ar_spectator) {
342 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); 402 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
343 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; 403 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
344 } else { 404 } else {
345 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { 405 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
346 fs_err(sdp, "can't mount journal #%u\n", 406 fs_err(sdp, "can't mount journal #%u\n",
@@ -377,7 +437,10 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
377 sdp->sd_jdesc->jd_jid, error); 437 sdp->sd_jdesc->jd_jid, error);
378 goto fail_jinode_gh; 438 goto fail_jinode_gh;
379 } 439 }
380 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; 440 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
441
442 /* Map the extents for this journal's blocks */
443 map_journal_extents(sdp);
381 } 444 }
382 445
383 if (sdp->sd_lockstruct.ls_first) { 446 if (sdp->sd_lockstruct.ls_first) {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 291f0c7eaa3b..9f71372c1757 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -61,7 +61,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
61 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0); 61 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
62 if (!IS_ERR(inode)) { 62 if (!IS_ERR(inode)) {
63 gfs2_trans_end(sdp); 63 gfs2_trans_end(sdp);
64 if (dip->i_alloc.al_rgd) 64 if (dip->i_alloc->al_rgd)
65 gfs2_inplace_release(dip); 65 gfs2_inplace_release(dip);
66 gfs2_quota_unlock(dip); 66 gfs2_quota_unlock(dip);
67 gfs2_alloc_put(dip); 67 gfs2_alloc_put(dip);
@@ -113,8 +113,18 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
113 if (inode && IS_ERR(inode)) 113 if (inode && IS_ERR(inode))
114 return ERR_PTR(PTR_ERR(inode)); 114 return ERR_PTR(PTR_ERR(inode));
115 115
116 if (inode) 116 if (inode) {
117 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
118 struct gfs2_holder gh;
119 int error;
120 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
121 if (error) {
122 iput(inode);
123 return ERR_PTR(error);
124 }
125 gfs2_glock_dq_uninit(&gh);
117 return d_splice_alias(inode, dentry); 126 return d_splice_alias(inode, dentry);
127 }
118 d_add(dentry, inode); 128 d_add(dentry, inode);
119 129
120 return NULL; 130 return NULL;
@@ -366,7 +376,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
366 } 376 }
367 377
368 gfs2_trans_end(sdp); 378 gfs2_trans_end(sdp);
369 if (dip->i_alloc.al_rgd) 379 if (dip->i_alloc->al_rgd)
370 gfs2_inplace_release(dip); 380 gfs2_inplace_release(dip);
371 gfs2_quota_unlock(dip); 381 gfs2_quota_unlock(dip);
372 gfs2_alloc_put(dip); 382 gfs2_alloc_put(dip);
@@ -442,7 +452,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
442 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ 452 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
443 453
444 gfs2_trans_end(sdp); 454 gfs2_trans_end(sdp);
445 if (dip->i_alloc.al_rgd) 455 if (dip->i_alloc->al_rgd)
446 gfs2_inplace_release(dip); 456 gfs2_inplace_release(dip);
447 gfs2_quota_unlock(dip); 457 gfs2_quota_unlock(dip);
448 gfs2_alloc_put(dip); 458 gfs2_alloc_put(dip);
@@ -548,7 +558,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
548 } 558 }
549 559
550 gfs2_trans_end(sdp); 560 gfs2_trans_end(sdp);
551 if (dip->i_alloc.al_rgd) 561 if (dip->i_alloc->al_rgd)
552 gfs2_inplace_release(dip); 562 gfs2_inplace_release(dip);
553 gfs2_quota_unlock(dip); 563 gfs2_quota_unlock(dip);
554 gfs2_alloc_put(dip); 564 gfs2_alloc_put(dip);
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index 34f0caac1a03..fd8cee231e1d 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -16,5 +16,11 @@ extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops; 16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops; 17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct inode_operations gfs2_dev_iops; 18extern const struct inode_operations gfs2_dev_iops;
19extern const struct file_operations gfs2_file_fops;
20extern const struct file_operations gfs2_dir_fops;
21extern const struct file_operations gfs2_file_fops_nolock;
22extern const struct file_operations gfs2_dir_fops_nolock;
23
24extern void gfs2_set_inode_flags(struct inode *inode);
19 25
20#endif /* __OPS_INODE_DOT_H__ */ 26#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 950f31460e8b..5e524217944a 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -487,7 +487,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
487 if (ip) { 487 if (ip) {
488 ip->i_flags = 0; 488 ip->i_flags = 0;
489 ip->i_gl = NULL; 489 ip->i_gl = NULL;
490 ip->i_last_pfault = jiffies;
491 } 490 }
492 return &ip->i_inode; 491 return &ip->i_inode;
493} 492}
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
deleted file mode 100644
index 927d739d4685..000000000000
--- a/fs/gfs2/ops_vm.c
+++ /dev/null
@@ -1,169 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/mm.h>
15#include <linux/pagemap.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "ops_vm.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "util.h"
29
30static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
31{
32 struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
33
34 set_bit(GIF_PAGED, &ip->i_flags);
35 return filemap_fault(vma, vmf);
36}
37
38static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
39{
40 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
41 unsigned long index = page->index;
42 u64 lblock = index << (PAGE_CACHE_SHIFT -
43 sdp->sd_sb.sb_bsize_shift);
44 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
45 struct gfs2_alloc *al;
46 unsigned int data_blocks, ind_blocks;
47 unsigned int x;
48 int error;
49
50 al = gfs2_alloc_get(ip);
51
52 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
53 if (error)
54 goto out;
55
56 error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
57 if (error)
58 goto out_gunlock_q;
59
60 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
61
62 al->al_requested = data_blocks + ind_blocks;
63
64 error = gfs2_inplace_reserve(ip);
65 if (error)
66 goto out_gunlock_q;
67
68 error = gfs2_trans_begin(sdp, al->al_rgd->rd_length +
69 ind_blocks + RES_DINODE +
70 RES_STATFS + RES_QUOTA, 0);
71 if (error)
72 goto out_ipres;
73
74 if (gfs2_is_stuffed(ip)) {
75 error = gfs2_unstuff_dinode(ip, NULL);
76 if (error)
77 goto out_trans;
78 }
79
80 for (x = 0; x < blocks; ) {
81 u64 dblock;
82 unsigned int extlen;
83 int new = 1;
84
85 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
86 if (error)
87 goto out_trans;
88
89 lblock += extlen;
90 x += extlen;
91 }
92
93 gfs2_assert_warn(sdp, al->al_alloced);
94
95out_trans:
96 gfs2_trans_end(sdp);
97out_ipres:
98 gfs2_inplace_release(ip);
99out_gunlock_q:
100 gfs2_quota_unlock(ip);
101out:
102 gfs2_alloc_put(ip);
103 return error;
104}
105
106static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
107 struct vm_fault *vmf)
108{
109 struct file *file = vma->vm_file;
110 struct gfs2_file *gf = file->private_data;
111 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
112 struct gfs2_holder i_gh;
113 int alloc_required;
114 int error;
115 int ret = 0;
116
117 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
118 if (error)
119 goto out;
120
121 set_bit(GIF_PAGED, &ip->i_flags);
122 set_bit(GIF_SW_PAGED, &ip->i_flags);
123
124 error = gfs2_write_alloc_required(ip,
125 (u64)vmf->pgoff << PAGE_CACHE_SHIFT,
126 PAGE_CACHE_SIZE, &alloc_required);
127 if (error) {
128 ret = VM_FAULT_OOM; /* XXX: are these right? */
129 goto out_unlock;
130 }
131
132 set_bit(GFF_EXLOCK, &gf->f_flags);
133 ret = filemap_fault(vma, vmf);
134 clear_bit(GFF_EXLOCK, &gf->f_flags);
135 if (ret & VM_FAULT_ERROR)
136 goto out_unlock;
137
138 if (alloc_required) {
139 /* XXX: do we need to drop page lock around alloc_page_backing?*/
140 error = alloc_page_backing(ip, vmf->page);
141 if (error) {
142 /*
143 * VM_FAULT_LOCKED should always be the case for
144 * filemap_fault, but it may not be in a future
145 * implementation.
146 */
147 if (ret & VM_FAULT_LOCKED)
148 unlock_page(vmf->page);
149 page_cache_release(vmf->page);
150 ret = VM_FAULT_OOM;
151 goto out_unlock;
152 }
153 set_page_dirty(vmf->page);
154 }
155
156out_unlock:
157 gfs2_glock_dq_uninit(&i_gh);
158out:
159 return ret;
160}
161
162struct vm_operations_struct gfs2_vm_ops_private = {
163 .fault = gfs2_private_fault,
164};
165
166struct vm_operations_struct gfs2_vm_ops_sharewrite = {
167 .fault = gfs2_sharewrite_fault,
168};
169
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
deleted file mode 100644
index 4ae8f43ed5e3..000000000000
--- a/fs/gfs2/ops_vm.h
+++ /dev/null
@@ -1,18 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13#include <linux/mm.h>
14
15extern struct vm_operations_struct gfs2_vm_ops_private;
16extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
17
18#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index addb51e0f135..a08dabd6ce90 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -59,7 +59,6 @@
59#include "super.h" 59#include "super.h"
60#include "trans.h" 60#include "trans.h"
61#include "inode.h" 61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h" 62#include "ops_address.h"
64#include "util.h" 63#include "util.h"
65 64
@@ -274,10 +273,10 @@ static int bh_get(struct gfs2_quota_data *qd)
274 } 273 }
275 274
276 block = qd->qd_slot / sdp->sd_qc_per_block; 275 block = qd->qd_slot / sdp->sd_qc_per_block;
277 offset = qd->qd_slot % sdp->sd_qc_per_block;; 276 offset = qd->qd_slot % sdp->sd_qc_per_block;
278 277
279 bh_map.b_size = 1 << ip->i_inode.i_blkbits; 278 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
280 error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map); 279 error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
281 if (error) 280 if (error)
282 goto fail; 281 goto fail;
283 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); 282 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
@@ -454,7 +453,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
454int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) 453int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
455{ 454{
456 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 455 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
457 struct gfs2_alloc *al = &ip->i_alloc; 456 struct gfs2_alloc *al = ip->i_alloc;
458 struct gfs2_quota_data **qd = al->al_qd; 457 struct gfs2_quota_data **qd = al->al_qd;
459 int error; 458 int error;
460 459
@@ -502,7 +501,7 @@ out:
502void gfs2_quota_unhold(struct gfs2_inode *ip) 501void gfs2_quota_unhold(struct gfs2_inode *ip)
503{ 502{
504 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 503 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
505 struct gfs2_alloc *al = &ip->i_alloc; 504 struct gfs2_alloc *al = ip->i_alloc;
506 unsigned int x; 505 unsigned int x;
507 506
508 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); 507 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
@@ -646,7 +645,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
646 } 645 }
647 646
648 if (!buffer_mapped(bh)) { 647 if (!buffer_mapped(bh)) {
649 gfs2_get_block(inode, iblock, bh, 1); 648 gfs2_block_map(inode, iblock, bh, 1);
650 if (!buffer_mapped(bh)) 649 if (!buffer_mapped(bh))
651 goto unlock; 650 goto unlock;
652 } 651 }
@@ -793,11 +792,9 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
793 struct gfs2_holder i_gh; 792 struct gfs2_holder i_gh;
794 struct gfs2_quota_host q; 793 struct gfs2_quota_host q;
795 char buf[sizeof(struct gfs2_quota)]; 794 char buf[sizeof(struct gfs2_quota)];
796 struct file_ra_state ra_state;
797 int error; 795 int error;
798 struct gfs2_quota_lvb *qlvb; 796 struct gfs2_quota_lvb *qlvb;
799 797
800 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
801restart: 798restart:
802 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); 799 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
803 if (error) 800 if (error)
@@ -820,8 +817,8 @@ restart:
820 817
821 memset(buf, 0, sizeof(struct gfs2_quota)); 818 memset(buf, 0, sizeof(struct gfs2_quota));
822 pos = qd2offset(qd); 819 pos = qd2offset(qd);
823 error = gfs2_internal_read(ip, &ra_state, buf, 820 error = gfs2_internal_read(ip, NULL, buf, &pos,
824 &pos, sizeof(struct gfs2_quota)); 821 sizeof(struct gfs2_quota));
825 if (error < 0) 822 if (error < 0)
826 goto fail_gunlock; 823 goto fail_gunlock;
827 824
@@ -856,7 +853,7 @@ fail:
856int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) 853int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
857{ 854{
858 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 855 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
859 struct gfs2_alloc *al = &ip->i_alloc; 856 struct gfs2_alloc *al = ip->i_alloc;
860 unsigned int x; 857 unsigned int x;
861 int error = 0; 858 int error = 0;
862 859
@@ -924,7 +921,7 @@ static int need_sync(struct gfs2_quota_data *qd)
924 921
925void gfs2_quota_unlock(struct gfs2_inode *ip) 922void gfs2_quota_unlock(struct gfs2_inode *ip)
926{ 923{
927 struct gfs2_alloc *al = &ip->i_alloc; 924 struct gfs2_alloc *al = ip->i_alloc;
928 struct gfs2_quota_data *qda[4]; 925 struct gfs2_quota_data *qda[4];
929 unsigned int count = 0; 926 unsigned int count = 0;
930 unsigned int x; 927 unsigned int x;
@@ -972,7 +969,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
972int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) 969int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
973{ 970{
974 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 971 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
975 struct gfs2_alloc *al = &ip->i_alloc; 972 struct gfs2_alloc *al = ip->i_alloc;
976 struct gfs2_quota_data *qd; 973 struct gfs2_quota_data *qd;
977 s64 value; 974 s64 value;
978 unsigned int x; 975 unsigned int x;
@@ -1016,10 +1013,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1016void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 1013void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1017 u32 uid, u32 gid) 1014 u32 uid, u32 gid)
1018{ 1015{
1019 struct gfs2_alloc *al = &ip->i_alloc; 1016 struct gfs2_alloc *al = ip->i_alloc;
1020 struct gfs2_quota_data *qd; 1017 struct gfs2_quota_data *qd;
1021 unsigned int x; 1018 unsigned int x;
1022 unsigned int found = 0;
1023 1019
1024 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) 1020 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
1025 return; 1021 return;
@@ -1032,7 +1028,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1032 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || 1028 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1033 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { 1029 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1034 do_qc(qd, change); 1030 do_qc(qd, change);
1035 found++;
1036 } 1031 }
1037 } 1032 }
1038} 1033}
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index beb6c7ac0086..b249e294a95b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -391,7 +391,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
391 lblock = head->lh_blkno; 391 lblock = head->lh_blkno;
392 gfs2_replay_incr_blk(sdp, &lblock); 392 gfs2_replay_incr_blk(sdp, &lblock);
393 bh_map.b_size = 1 << ip->i_inode.i_blkbits; 393 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
394 error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map); 394 error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
395 if (error) 395 if (error)
396 return error; 396 return error;
397 if (!bh_map.b_blocknr) { 397 if (!bh_map.b_blocknr) {
@@ -504,13 +504,21 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
504 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) 504 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
505 ro = 1; 505 ro = 1;
506 } else { 506 } else {
507 if (sdp->sd_vfs->s_flags & MS_RDONLY) 507 if (sdp->sd_vfs->s_flags & MS_RDONLY) {
508 ro = 1; 508 /* check if device itself is read-only */
509 ro = bdev_read_only(sdp->sd_vfs->s_bdev);
510 if (!ro) {
511 fs_info(sdp, "recovery required on "
512 "read-only filesystem.\n");
513 fs_info(sdp, "write access will be "
514 "enabled during recovery.\n");
515 }
516 }
509 } 517 }
510 518
511 if (ro) { 519 if (ro) {
512 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n", 520 fs_warn(sdp, "jid=%u: Can't replay: read-only block "
513 jd->jd_jid); 521 "device\n", jd->jd_jid);
514 error = -EROFS; 522 error = -EROFS;
515 goto fail_gunlock_tr; 523 goto fail_gunlock_tr;
516 } 524 }
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 708c287e1d0e..3552110b2e5f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -25,10 +25,10 @@
25#include "rgrp.h" 25#include "rgrp.h"
26#include "super.h" 26#include "super.h"
27#include "trans.h" 27#include "trans.h"
28#include "ops_file.h"
29#include "util.h" 28#include "util.h"
30#include "log.h" 29#include "log.h"
31#include "inode.h" 30#include "inode.h"
31#include "ops_address.h"
32 32
33#define BFITNOENT ((u32)~0) 33#define BFITNOENT ((u32)~0)
34#define NO_BLOCK ((u64)~0) 34#define NO_BLOCK ((u64)~0)
@@ -126,41 +126,43 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 * Return: the block number (bitmap buffer scope) that was found 126 * Return: the block number (bitmap buffer scope) that was found
127 */ 127 */
128 128
129static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer, 129static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
130 unsigned int buflen, u32 goal, 130 unsigned char old_state)
131 unsigned char old_state)
132{ 131{
133 unsigned char *byte, *end, alloc; 132 unsigned char *byte;
134 u32 blk = goal; 133 u32 blk = goal;
135 unsigned int bit; 134 unsigned int bit, bitlong;
135 unsigned long *plong, plong55;
136 136
137 byte = buffer + (goal / GFS2_NBBY); 137 byte = buffer + (goal / GFS2_NBBY);
138 plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
138 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 139 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
139 end = buffer + buflen; 140 bitlong = bit;
140 alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0; 141#if BITS_PER_LONG == 32
141 142 plong55 = 0x55555555;
142 while (byte < end) { 143#else
143 /* If we're looking for a free block we can eliminate all 144 plong55 = 0x5555555555555555;
144 bitmap settings with 0x55, which represents four data 145#endif
145 blocks in a row. If we're looking for a data block, we can 146 while (byte < buffer + buflen) {
146 eliminate 0x00 which corresponds to four free blocks. */ 147
147 if ((*byte & 0x55) == alloc) { 148 if (bitlong == 0 && old_state == 0 && *plong == plong55) {
148 blk += (8 - bit) >> 1; 149 plong++;
149 150 byte += sizeof(unsigned long);
150 bit = 0; 151 blk += sizeof(unsigned long) * GFS2_NBBY;
151 byte++;
152
153 continue; 152 continue;
154 } 153 }
155
156 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) 154 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
157 return blk; 155 return blk;
158
159 bit += GFS2_BIT_SIZE; 156 bit += GFS2_BIT_SIZE;
160 if (bit >= 8) { 157 if (bit >= 8) {
161 bit = 0; 158 bit = 0;
162 byte++; 159 byte++;
163 } 160 }
161 bitlong += GFS2_BIT_SIZE;
162 if (bitlong >= sizeof(unsigned long) * 8) {
163 bitlong = 0;
164 plong++;
165 }
164 166
165 blk++; 167 blk++;
166 } 168 }
@@ -817,11 +819,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
817 819
818struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) 820struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
819{ 821{
820 struct gfs2_alloc *al = &ip->i_alloc; 822 BUG_ON(ip->i_alloc != NULL);
821 823 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL);
822 /* FIXME: Should assert that the correct locks are held here... */ 824 return ip->i_alloc;
823 memset(al, 0, sizeof(*al));
824 return al;
825} 825}
826 826
827/** 827/**
@@ -1059,26 +1059,34 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1059 struct inode *inode = NULL; 1059 struct inode *inode = NULL;
1060 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1060 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1061 struct gfs2_rgrpd *rgd, *begin = NULL; 1061 struct gfs2_rgrpd *rgd, *begin = NULL;
1062 struct gfs2_alloc *al = &ip->i_alloc; 1062 struct gfs2_alloc *al = ip->i_alloc;
1063 int flags = LM_FLAG_TRY; 1063 int flags = LM_FLAG_TRY;
1064 int skipped = 0; 1064 int skipped = 0;
1065 int loops = 0; 1065 int loops = 0;
1066 int error; 1066 int error, rg_locked;
1067 1067
1068 /* Try recently successful rgrps */ 1068 /* Try recently successful rgrps */
1069 1069
1070 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); 1070 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
1071 1071
1072 while (rgd) { 1072 while (rgd) {
1073 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 1073 rg_locked = 0;
1074 LM_FLAG_TRY, &al->al_rgd_gh); 1074
1075 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
1076 rg_locked = 1;
1077 error = 0;
1078 } else {
1079 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1080 LM_FLAG_TRY, &al->al_rgd_gh);
1081 }
1075 switch (error) { 1082 switch (error) {
1076 case 0: 1083 case 0:
1077 if (try_rgrp_fit(rgd, al)) 1084 if (try_rgrp_fit(rgd, al))
1078 goto out; 1085 goto out;
1079 if (rgd->rd_flags & GFS2_RDF_CHECK) 1086 if (rgd->rd_flags & GFS2_RDF_CHECK)
1080 inode = try_rgrp_unlink(rgd, last_unlinked); 1087 inode = try_rgrp_unlink(rgd, last_unlinked);
1081 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1088 if (!rg_locked)
1089 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1082 if (inode) 1090 if (inode)
1083 return inode; 1091 return inode;
1084 rgd = recent_rgrp_next(rgd, 1); 1092 rgd = recent_rgrp_next(rgd, 1);
@@ -1098,15 +1106,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1098 begin = rgd = forward_rgrp_get(sdp); 1106 begin = rgd = forward_rgrp_get(sdp);
1099 1107
1100 for (;;) { 1108 for (;;) {
1101 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, 1109 rg_locked = 0;
1102 &al->al_rgd_gh); 1110
1111 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
1112 rg_locked = 1;
1113 error = 0;
1114 } else {
1115 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
1116 &al->al_rgd_gh);
1117 }
1103 switch (error) { 1118 switch (error) {
1104 case 0: 1119 case 0:
1105 if (try_rgrp_fit(rgd, al)) 1120 if (try_rgrp_fit(rgd, al))
1106 goto out; 1121 goto out;
1107 if (rgd->rd_flags & GFS2_RDF_CHECK) 1122 if (rgd->rd_flags & GFS2_RDF_CHECK)
1108 inode = try_rgrp_unlink(rgd, last_unlinked); 1123 inode = try_rgrp_unlink(rgd, last_unlinked);
1109 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1124 if (!rg_locked)
1125 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1110 if (inode) 1126 if (inode)
1111 return inode; 1127 return inode;
1112 break; 1128 break;
@@ -1158,7 +1174,7 @@ out:
1158int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) 1174int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1159{ 1175{
1160 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1176 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1161 struct gfs2_alloc *al = &ip->i_alloc; 1177 struct gfs2_alloc *al = ip->i_alloc;
1162 struct inode *inode; 1178 struct inode *inode;
1163 int error = 0; 1179 int error = 0;
1164 u64 last_unlinked = NO_BLOCK; 1180 u64 last_unlinked = NO_BLOCK;
@@ -1204,7 +1220,7 @@ try_again:
1204void gfs2_inplace_release(struct gfs2_inode *ip) 1220void gfs2_inplace_release(struct gfs2_inode *ip)
1205{ 1221{
1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1222 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1207 struct gfs2_alloc *al = &ip->i_alloc; 1223 struct gfs2_alloc *al = ip->i_alloc;
1208 1224
1209 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) 1225 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1210 fs_warn(sdp, "al_alloced = %u, al_requested = %u " 1226 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
@@ -1213,7 +1229,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1213 al->al_line); 1229 al->al_line);
1214 1230
1215 al->al_rgd = NULL; 1231 al->al_rgd = NULL;
1216 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1232 if (al->al_rgd_gh.gh_gl)
1233 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1217 if (ip != GFS2_I(sdp->sd_rindex)) 1234 if (ip != GFS2_I(sdp->sd_rindex))
1218 gfs2_glock_dq_uninit(&al->al_ri_gh); 1235 gfs2_glock_dq_uninit(&al->al_ri_gh);
1219} 1236}
@@ -1301,11 +1318,10 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1301 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1318 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1302 bitmaps, so we must search the originals for that. */ 1319 bitmaps, so we must search the originals for that. */
1303 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) 1320 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1304 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, 1321 blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
1305 bi->bi_len, goal, old_state); 1322 bi->bi_len, goal, old_state);
1306 else 1323 else
1307 blk = gfs2_bitfit(rgd, 1324 blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
1308 bi->bi_bh->b_data + bi->bi_offset,
1309 bi->bi_len, goal, old_state); 1325 bi->bi_len, goal, old_state);
1310 if (blk != BFITNOENT) 1326 if (blk != BFITNOENT)
1311 break; 1327 break;
@@ -1394,7 +1410,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1394u64 gfs2_alloc_data(struct gfs2_inode *ip) 1410u64 gfs2_alloc_data(struct gfs2_inode *ip)
1395{ 1411{
1396 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1412 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1397 struct gfs2_alloc *al = &ip->i_alloc; 1413 struct gfs2_alloc *al = ip->i_alloc;
1398 struct gfs2_rgrpd *rgd = al->al_rgd; 1414 struct gfs2_rgrpd *rgd = al->al_rgd;
1399 u32 goal, blk; 1415 u32 goal, blk;
1400 u64 block; 1416 u64 block;
@@ -1439,7 +1455,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
1439u64 gfs2_alloc_meta(struct gfs2_inode *ip) 1455u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1440{ 1456{
1441 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1457 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1442 struct gfs2_alloc *al = &ip->i_alloc; 1458 struct gfs2_alloc *al = ip->i_alloc;
1443 struct gfs2_rgrpd *rgd = al->al_rgd; 1459 struct gfs2_rgrpd *rgd = al->al_rgd;
1444 u32 goal, blk; 1460 u32 goal, blk;
1445 u64 block; 1461 u64 block;
@@ -1485,7 +1501,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1485u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) 1501u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1486{ 1502{
1487 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1503 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1488 struct gfs2_alloc *al = &dip->i_alloc; 1504 struct gfs2_alloc *al = dip->i_alloc;
1489 struct gfs2_rgrpd *rgd = al->al_rgd; 1505 struct gfs2_rgrpd *rgd = al->al_rgd;
1490 u32 blk; 1506 u32 blk;
1491 u64 block; 1507 u64 block;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4c6adfc6f2e..149bb161f4b6 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -32,7 +32,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip) 33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{ 34{
35 return; /* So we can see where ip->i_alloc is used */ 35 BUG_ON(ip->i_alloc == NULL);
36 kfree(ip->i_alloc);
37 ip->i_alloc = NULL;
36} 38}
37 39
38int gfs2_inplace_reserve_i(struct gfs2_inode *ip, 40int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index dd3e737f528e..ef0562c3bc71 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -51,13 +51,9 @@ void gfs2_tune_init(struct gfs2_tune *gt)
51{ 51{
52 spin_lock_init(&gt->gt_spin); 52 spin_lock_init(&gt->gt_spin);
53 53
54 gt->gt_ilimit = 100;
55 gt->gt_ilimit_tries = 3;
56 gt->gt_ilimit_min = 1;
57 gt->gt_demote_secs = 300; 54 gt->gt_demote_secs = 300;
58 gt->gt_incore_log_blocks = 1024; 55 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60; 56 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_recoverd_secs = 60; 57 gt->gt_recoverd_secs = 60;
62 gt->gt_logd_secs = 1; 58 gt->gt_logd_secs = 1;
63 gt->gt_quotad_secs = 5; 59 gt->gt_quotad_secs = 5;
@@ -71,10 +67,8 @@ void gfs2_tune_init(struct gfs2_tune *gt)
71 gt->gt_new_files_jdata = 0; 67 gt->gt_new_files_jdata = 0;
72 gt->gt_new_files_directio = 0; 68 gt->gt_new_files_directio = 0;
73 gt->gt_max_readahead = 1 << 18; 69 gt->gt_max_readahead = 1 << 18;
74 gt->gt_lockdump_size = 131072;
75 gt->gt_stall_secs = 600; 70 gt->gt_stall_secs = 600;
76 gt->gt_complain_secs = 10; 71 gt->gt_complain_secs = 10;
77 gt->gt_reclaim_limit = 5000;
78 gt->gt_statfs_quantum = 30; 72 gt->gt_statfs_quantum = 30;
79 gt->gt_statfs_slow = 0; 73 gt->gt_statfs_slow = 0;
80} 74}
@@ -393,6 +387,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
393 if (!jd) 387 if (!jd)
394 break; 388 break;
395 389
390 INIT_LIST_HEAD(&jd->extent_list);
396 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); 391 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
397 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 392 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
398 if (!jd->jd_inode) 393 if (!jd->jd_inode)
@@ -422,8 +417,9 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
422 417
423void gfs2_jindex_free(struct gfs2_sbd *sdp) 418void gfs2_jindex_free(struct gfs2_sbd *sdp)
424{ 419{
425 struct list_head list; 420 struct list_head list, *head;
426 struct gfs2_jdesc *jd; 421 struct gfs2_jdesc *jd;
422 struct gfs2_journal_extent *jext;
427 423
428 spin_lock(&sdp->sd_jindex_spin); 424 spin_lock(&sdp->sd_jindex_spin);
429 list_add(&list, &sdp->sd_jindex_list); 425 list_add(&list, &sdp->sd_jindex_list);
@@ -433,6 +429,14 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
433 429
434 while (!list_empty(&list)) { 430 while (!list_empty(&list)) {
435 jd = list_entry(list.next, struct gfs2_jdesc, jd_list); 431 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
432 head = &jd->extent_list;
433 while (!list_empty(head)) {
434 jext = list_entry(head->next,
435 struct gfs2_journal_extent,
436 extent_list);
437 list_del(&jext->extent_list);
438 kfree(jext);
439 }
436 list_del(&jd->jd_list); 440 list_del(&jd->jd_list);
437 iput(jd->jd_inode); 441 iput(jd->jd_inode);
438 kfree(jd); 442 kfree(jd);
@@ -543,7 +547,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
543 if (error) 547 if (error)
544 return error; 548 return error;
545 549
546 gfs2_meta_cache_flush(ip);
547 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 550 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
548 551
549 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 552 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -686,9 +689,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
686 if (error) 689 if (error)
687 return; 690 return;
688 691
689 mutex_lock(&sdp->sd_statfs_mutex);
690 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); 692 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
691 mutex_unlock(&sdp->sd_statfs_mutex);
692 693
693 spin_lock(&sdp->sd_statfs_spin); 694 spin_lock(&sdp->sd_statfs_spin);
694 l_sc->sc_total += total; 695 l_sc->sc_total += total;
@@ -736,9 +737,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
736 if (error) 737 if (error)
737 goto out_bh2; 738 goto out_bh2;
738 739
739 mutex_lock(&sdp->sd_statfs_mutex);
740 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); 740 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
741 mutex_unlock(&sdp->sd_statfs_mutex);
742 741
743 spin_lock(&sdp->sd_statfs_spin); 742 spin_lock(&sdp->sd_statfs_spin);
744 m_sc->sc_total += l_sc->sc_total; 743 m_sc->sc_total += l_sc->sc_total;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 06e0b7768d97..eaa3b7b2f99e 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -32,7 +32,8 @@ spinlock_t gfs2_sys_margs_lock;
32 32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{ 34{
35 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id); 35 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
36 MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
36} 37}
37 38
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) 39static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
@@ -221,9 +222,7 @@ static struct kobj_type gfs2_ktype = {
221 .sysfs_ops = &gfs2_attr_ops, 222 .sysfs_ops = &gfs2_attr_ops,
222}; 223};
223 224
224static struct kset gfs2_kset = { 225static struct kset *gfs2_kset;
225 .ktype = &gfs2_ktype,
226};
227 226
228/* 227/*
229 * display struct lm_lockstruct fields 228 * display struct lm_lockstruct fields
@@ -427,13 +426,11 @@ TUNE_ATTR_2(name, name##_store)
427TUNE_ATTR(demote_secs, 0); 426TUNE_ATTR(demote_secs, 0);
428TUNE_ATTR(incore_log_blocks, 0); 427TUNE_ATTR(incore_log_blocks, 0);
429TUNE_ATTR(log_flush_secs, 0); 428TUNE_ATTR(log_flush_secs, 0);
430TUNE_ATTR(jindex_refresh_secs, 0);
431TUNE_ATTR(quota_warn_period, 0); 429TUNE_ATTR(quota_warn_period, 0);
432TUNE_ATTR(quota_quantum, 0); 430TUNE_ATTR(quota_quantum, 0);
433TUNE_ATTR(atime_quantum, 0); 431TUNE_ATTR(atime_quantum, 0);
434TUNE_ATTR(max_readahead, 0); 432TUNE_ATTR(max_readahead, 0);
435TUNE_ATTR(complain_secs, 0); 433TUNE_ATTR(complain_secs, 0);
436TUNE_ATTR(reclaim_limit, 0);
437TUNE_ATTR(statfs_slow, 0); 434TUNE_ATTR(statfs_slow, 0);
438TUNE_ATTR(new_files_jdata, 0); 435TUNE_ATTR(new_files_jdata, 0);
439TUNE_ATTR(new_files_directio, 0); 436TUNE_ATTR(new_files_directio, 0);
@@ -450,13 +447,11 @@ static struct attribute *tune_attrs[] = {
450 &tune_attr_demote_secs.attr, 447 &tune_attr_demote_secs.attr,
451 &tune_attr_incore_log_blocks.attr, 448 &tune_attr_incore_log_blocks.attr,
452 &tune_attr_log_flush_secs.attr, 449 &tune_attr_log_flush_secs.attr,
453 &tune_attr_jindex_refresh_secs.attr,
454 &tune_attr_quota_warn_period.attr, 450 &tune_attr_quota_warn_period.attr,
455 &tune_attr_quota_quantum.attr, 451 &tune_attr_quota_quantum.attr,
456 &tune_attr_atime_quantum.attr, 452 &tune_attr_atime_quantum.attr,
457 &tune_attr_max_readahead.attr, 453 &tune_attr_max_readahead.attr,
458 &tune_attr_complain_secs.attr, 454 &tune_attr_complain_secs.attr,
459 &tune_attr_reclaim_limit.attr,
460 &tune_attr_statfs_slow.attr, 455 &tune_attr_statfs_slow.attr,
461 &tune_attr_quota_simul_sync.attr, 456 &tune_attr_quota_simul_sync.attr,
462 &tune_attr_quota_cache_secs.attr, 457 &tune_attr_quota_cache_secs.attr,
@@ -495,14 +490,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
495{ 490{
496 int error; 491 int error;
497 492
498 sdp->sd_kobj.kset = &gfs2_kset; 493 sdp->sd_kobj.kset = gfs2_kset;
499 sdp->sd_kobj.ktype = &gfs2_ktype; 494 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
500 495 "%s", sdp->sd_table_name);
501 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
502 if (error)
503 goto fail;
504
505 error = kobject_register(&sdp->sd_kobj);
506 if (error) 496 if (error)
507 goto fail; 497 goto fail;
508 498
@@ -522,6 +512,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
522 if (error) 512 if (error)
523 goto fail_args; 513 goto fail_args;
524 514
515 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
525 return 0; 516 return 0;
526 517
527fail_args: 518fail_args:
@@ -531,7 +522,7 @@ fail_counters:
531fail_lockstruct: 522fail_lockstruct:
532 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 523 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
533fail_reg: 524fail_reg:
534 kobject_unregister(&sdp->sd_kobj); 525 kobject_put(&sdp->sd_kobj);
535fail: 526fail:
536 fs_err(sdp, "error %d adding sysfs files", error); 527 fs_err(sdp, "error %d adding sysfs files", error);
537 return error; 528 return error;
@@ -543,21 +534,22 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
543 sysfs_remove_group(&sdp->sd_kobj, &args_group); 534 sysfs_remove_group(&sdp->sd_kobj, &args_group);
544 sysfs_remove_group(&sdp->sd_kobj, &counters_group); 535 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
545 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 536 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
546 kobject_unregister(&sdp->sd_kobj); 537 kobject_put(&sdp->sd_kobj);
547} 538}
548 539
549int gfs2_sys_init(void) 540int gfs2_sys_init(void)
550{ 541{
551 gfs2_sys_margs = NULL; 542 gfs2_sys_margs = NULL;
552 spin_lock_init(&gfs2_sys_margs_lock); 543 spin_lock_init(&gfs2_sys_margs_lock);
553 kobject_set_name(&gfs2_kset.kobj, "gfs2"); 544 gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
554 kobj_set_kset_s(&gfs2_kset, fs_subsys); 545 if (!gfs2_kset)
555 return kset_register(&gfs2_kset); 546 return -ENOMEM;
547 return 0;
556} 548}
557 549
558void gfs2_sys_uninit(void) 550void gfs2_sys_uninit(void)
559{ 551{
560 kfree(gfs2_sys_margs); 552 kfree(gfs2_sys_margs);
561 kset_unregister(&gfs2_kset); 553 kset_unregister(gfs2_kset);
562} 554}
563 555
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 717983e2c2ae..73e5d92a657c 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -114,11 +114,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
114 gfs2_log_flush(sdp, NULL); 114 gfs2_log_flush(sdp, NULL);
115} 115}
116 116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/** 117/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction 118 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to 119 * @gl: the glock the buffer belongs to
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 043d5f4b9c4c..e826f0dab80a 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -30,7 +30,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
30 30
31void gfs2_trans_end(struct gfs2_sbd *sdp); 31void gfs2_trans_end(struct gfs2_sbd *sdp);
32 32
33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); 33void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); 34void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); 35void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index f13f1494d4fe..f8452a0eab56 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
52 rec = (e + b) / 2; 52 rec = (e + b) / 2;
53 len = hfs_brec_lenoff(bnode, rec, &off); 53 len = hfs_brec_lenoff(bnode, rec, &off);
54 keylen = hfs_brec_keylen(bnode, rec); 54 keylen = hfs_brec_keylen(bnode, rec);
55 if (keylen == HFS_BAD_KEYLEN) {
56 res = -EINVAL;
57 goto done;
58 }
55 hfs_bnode_read(bnode, fd->key, off, keylen); 59 hfs_bnode_read(bnode, fd->key, off, keylen);
56 cmpval = bnode->tree->keycmp(fd->key, fd->search_key); 60 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
57 if (!cmpval) { 61 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
67 if (rec != e && e >= 0) { 71 if (rec != e && e >= 0) {
68 len = hfs_brec_lenoff(bnode, e, &off); 72 len = hfs_brec_lenoff(bnode, e, &off);
69 keylen = hfs_brec_keylen(bnode, e); 73 keylen = hfs_brec_keylen(bnode, e);
74 if (keylen == HFS_BAD_KEYLEN) {
75 res = -EINVAL;
76 goto done;
77 }
70 hfs_bnode_read(bnode, fd->key, off, keylen); 78 hfs_bnode_read(bnode, fd->key, off, keylen);
71 } 79 }
72done: 80done:
@@ -198,6 +206,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
198 206
199 len = hfs_brec_lenoff(bnode, fd->record, &off); 207 len = hfs_brec_lenoff(bnode, fd->record, &off);
200 keylen = hfs_brec_keylen(bnode, fd->record); 208 keylen = hfs_brec_keylen(bnode, fd->record);
209 if (keylen == HFS_BAD_KEYLEN) {
210 res = -EINVAL;
211 goto out;
212 }
201 fd->keyoffset = off; 213 fd->keyoffset = off;
202 fd->keylength = keylen; 214 fd->keylength = keylen;
203 fd->entryoffset = off + keylen; 215 fd->entryoffset = off + keylen;
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 5c87cf4801fc..8626ee375ea8 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -44,10 +44,21 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
44 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); 44 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
45 if (!recoff) 45 if (!recoff)
46 return 0; 46 return 0;
47 if (node->tree->attributes & HFS_TREE_BIGKEYS) 47 if (node->tree->attributes & HFS_TREE_BIGKEYS) {
48 retval = hfs_bnode_read_u16(node, recoff) + 2; 48 retval = hfs_bnode_read_u16(node, recoff) + 2;
49 else 49 if (retval > node->tree->max_key_len + 2) {
50 printk(KERN_ERR "hfs: keylen %d too large\n",
51 retval);
52 retval = HFS_BAD_KEYLEN;
53 }
54 } else {
50 retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; 55 retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
56 if (retval > node->tree->max_key_len + 1) {
57 printk(KERN_ERR "hfs: keylen %d too large\n",
58 retval);
59 retval = HFS_BAD_KEYLEN;
60 }
61 }
51 } 62 }
52 return retval; 63 return retval;
53} 64}
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 8a3a650abc87..110dd3515dc8 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -61,7 +61,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
61 mapping = tree->inode->i_mapping; 61 mapping = tree->inode->i_mapping;
62 page = read_mapping_page(mapping, 0, NULL); 62 page = read_mapping_page(mapping, 0, NULL);
63 if (IS_ERR(page)) 63 if (IS_ERR(page))
64 goto free_tree; 64 goto free_inode;
65 65
66 /* Load the header */ 66 /* Load the header */
67 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 67 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -81,6 +81,17 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
81 goto fail_page; 81 goto fail_page;
82 if (!tree->node_count) 82 if (!tree->node_count)
83 goto fail_page; 83 goto fail_page;
84 if ((id == HFS_EXT_CNID) && (tree->max_key_len != HFS_MAX_EXT_KEYLEN)) {
85 printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
86 tree->max_key_len);
87 goto fail_page;
88 }
89 if ((id == HFS_CAT_CNID) && (tree->max_key_len != HFS_MAX_CAT_KEYLEN)) {
90 printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
91 tree->max_key_len);
92 goto fail_page;
93 }
94
84 tree->node_size_shift = ffs(size) - 1; 95 tree->node_size_shift = ffs(size) - 1;
85 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 96 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
86 97
@@ -88,11 +99,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
88 page_cache_release(page); 99 page_cache_release(page);
89 return tree; 100 return tree;
90 101
91 fail_page: 102fail_page:
92 tree->inode->i_mapping->a_ops = &hfs_aops;
93 page_cache_release(page); 103 page_cache_release(page);
94 free_tree: 104free_inode:
105 tree->inode->i_mapping->a_ops = &hfs_aops;
95 iput(tree->inode); 106 iput(tree->inode);
107free_tree:
96 kfree(tree); 108 kfree(tree);
97 return NULL; 109 return NULL;
98} 110}
diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h
index 1445e3a56ed4..c6aae61adfe6 100644
--- a/fs/hfs/hfs.h
+++ b/fs/hfs/hfs.h
@@ -28,6 +28,8 @@
28#define HFS_MAX_NAMELEN 128 28#define HFS_MAX_NAMELEN 128
29#define HFS_MAX_VALENCE 32767U 29#define HFS_MAX_VALENCE 32767U
30 30
31#define HFS_BAD_KEYLEN 0xFF
32
31/* Meanings of the drAtrb field of the MDB, 33/* Meanings of the drAtrb field of the MDB,
32 * Reference: _Inside Macintosh: Files_ p. 2-61 34 * Reference: _Inside Macintosh: Files_ p. 2-61
33 */ 35 */
@@ -167,6 +169,9 @@ typedef union hfs_btree_key {
167 struct hfs_ext_key ext; 169 struct hfs_ext_key ext;
168} hfs_btree_key; 170} hfs_btree_key;
169 171
172#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8))
173#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8))
174
170typedef union hfs_btree_key btree_key; 175typedef union hfs_btree_key btree_key;
171 176
172struct hfs_extent { 177struct hfs_extent {
diff --git a/fs/inode.c b/fs/inode.c
index ed35383d0b6c..276ffd6b6fdd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1276,6 +1276,11 @@ void file_update_time(struct file *file)
1276 sync_it = 1; 1276 sync_it = 1;
1277 } 1277 }
1278 1278
1279 if (IS_I_VERSION(inode)) {
1280 inode_inc_iversion(inode);
1281 sync_it = 1;
1282 }
1283
1279 if (sync_it) 1284 if (sync_it)
1280 mark_inode_dirty_sync(inode); 1285 mark_inode_dirty_sync(inode);
1281} 1286}
diff --git a/fs/ioprio.c b/fs/ioprio.c
index e4e01bc7f338..c4a1c3c65aac 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -41,18 +41,28 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
41 return err; 41 return err;
42 42
43 task_lock(task); 43 task_lock(task);
44 do {
45 ioc = task->io_context;
46 /* see wmb() in current_io_context() */
47 smp_read_barrier_depends();
48 if (ioc)
49 break;
44 50
45 task->ioprio = ioprio; 51 ioc = alloc_io_context(GFP_ATOMIC, -1);
46 52 if (!ioc) {
47 ioc = task->io_context; 53 err = -ENOMEM;
48 /* see wmb() in current_io_context() */ 54 break;
49 smp_read_barrier_depends(); 55 }
56 task->io_context = ioc;
57 } while (1);
50 58
51 if (ioc) 59 if (!err) {
60 ioc->ioprio = ioprio;
52 ioc->ioprio_changed = 1; 61 ioc->ioprio_changed = 1;
62 }
53 63
54 task_unlock(task); 64 task_unlock(task);
55 return 0; 65 return err;
56} 66}
57 67
58asmlinkage long sys_ioprio_set(int which, int who, int ioprio) 68asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
@@ -75,8 +85,6 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
75 85
76 break; 86 break;
77 case IOPRIO_CLASS_IDLE: 87 case IOPRIO_CLASS_IDLE:
78 if (!capable(CAP_SYS_ADMIN))
79 return -EPERM;
80 break; 88 break;
81 case IOPRIO_CLASS_NONE: 89 case IOPRIO_CLASS_NONE:
82 if (data) 90 if (data)
@@ -148,7 +156,9 @@ static int get_task_ioprio(struct task_struct *p)
148 ret = security_task_getioprio(p); 156 ret = security_task_getioprio(p);
149 if (ret) 157 if (ret)
150 goto out; 158 goto out;
151 ret = p->ioprio; 159 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
160 if (p->io_context)
161 ret = p->io_context->ioprio;
152out: 162out:
153 return ret; 163 return ret;
154} 164}
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 0f69c416eebc..a5432bbbfb88 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -347,7 +347,8 @@ restart:
347 break; 347 break;
348 } 348 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 349 retry = __process_buffer(journal, jh, bhs,&batch_count);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){ 350 if (!retry && (need_resched() ||
351 spin_needbreak(&journal->j_list_lock))) {
351 spin_unlock(&journal->j_list_lock); 352 spin_unlock(&journal->j_list_lock);
352 retry = 1; 353 retry = 1;
353 break; 354 break;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 610264b99a8e..31853eb65b4c 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -265,7 +265,7 @@ write_out_data:
265 put_bh(bh); 265 put_bh(bh);
266 } 266 }
267 267
268 if (lock_need_resched(&journal->j_list_lock)) { 268 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
269 spin_unlock(&journal->j_list_lock); 269 spin_unlock(&journal->j_list_lock);
270 goto write_out_data; 270 goto write_out_data;
271 } 271 }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7028cc..038ed7436199 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -288,10 +288,12 @@ handle_t *journal_start(journal_t *journal, int nblocks)
288 jbd_free_handle(handle); 288 jbd_free_handle(handle);
289 current->journal_info = NULL; 289 current->journal_info = NULL;
290 handle = ERR_PTR(err); 290 handle = ERR_PTR(err);
291 goto out;
291 } 292 }
292 293
293 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
294 295
296out:
295 return handle; 297 return handle;
296} 298}
297 299
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 3fccde7ba008..6914598022ce 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */ 233 */
234static int __process_buffer(journal_t *journal, struct journal_head *jh, 234static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count) 235 struct buffer_head **bhs, int *batch_count,
236 transaction_t *transaction)
236{ 237{
237 struct buffer_head *bh = jh2bh(jh); 238 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0; 239 int ret = 0;
@@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
250 transaction_t *t = jh->b_transaction; 251 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid; 252 tid_t tid = t->t_tid;
252 253
254 transaction->t_chp_stats.cs_forced_to_close++;
253 spin_unlock(&journal->j_list_lock); 255 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh); 256 jbd_unlock_bh_state(bh);
255 jbd2_log_start_commit(journal, tid); 257 jbd2_log_start_commit(journal, tid);
@@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
279 bhs[*batch_count] = bh; 281 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh); 282 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh); 283 jbd_unlock_bh_state(bh);
284 transaction->t_chp_stats.cs_written++;
282 (*batch_count)++; 285 (*batch_count)++;
283 if (*batch_count == NR_BATCH) { 286 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
@@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
322 if (!journal->j_checkpoint_transactions) 325 if (!journal->j_checkpoint_transactions)
323 goto out; 326 goto out;
324 transaction = journal->j_checkpoint_transactions; 327 transaction = journal->j_checkpoint_transactions;
328 if (transaction->t_chp_stats.cs_chp_time == 0)
329 transaction->t_chp_stats.cs_chp_time = jiffies;
325 this_tid = transaction->t_tid; 330 this_tid = transaction->t_tid;
326restart: 331restart:
327 /* 332 /*
@@ -346,8 +351,10 @@ restart:
346 retry = 1; 351 retry = 1;
347 break; 352 break;
348 } 353 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 354 retry = __process_buffer(journal, jh, bhs, &batch_count,
350 if (!retry && lock_need_resched(&journal->j_list_lock)){ 355 transaction);
356 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) {
351 spin_unlock(&journal->j_list_lock); 358 spin_unlock(&journal->j_list_lock);
352 retry = 1; 359 retry = 1;
353 break; 360 break;
@@ -602,15 +609,15 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
602 609
603 /* 610 /*
604 * There is one special case to worry about: if we have just pulled the 611 * There is one special case to worry about: if we have just pulled the
605 * buffer off a committing transaction's forget list, then even if the 612 * buffer off a running or committing transaction's checkpoing list,
606 * checkpoint list is empty, the transaction obviously cannot be 613 * then even if the checkpoint list is empty, the transaction obviously
607 * dropped! 614 * cannot be dropped!
608 * 615 *
609 * The locking here around j_committing_transaction is a bit sleazy. 616 * The locking here around t_state is a bit sleazy.
610 * See the comment at the end of jbd2_journal_commit_transaction(). 617 * See the comment at the end of jbd2_journal_commit_transaction().
611 */ 618 */
612 if (transaction == journal->j_committing_transaction) { 619 if (transaction->t_state != T_FINISHED) {
613 JBUFFER_TRACE(jh, "belongs to committing transaction"); 620 JBUFFER_TRACE(jh, "belongs to running/committing transaction");
614 goto out; 621 goto out;
615 } 622 }
616 623
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6986f334c643..4f302d279279 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -20,6 +20,8 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h>
24#include <linux/crc32.h>
23 25
24/* 26/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 27 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -92,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
92 return 1; 94 return 1;
93} 95}
94 96
95/* Done it all: now write the commit record. We should have 97/*
98 * Done it all: now submit the commit record. We should have
96 * cleaned up our previous buffers by now, so if we are in abort 99 * cleaned up our previous buffers by now, so if we are in abort
97 * mode we can now just skip the rest of the journal write 100 * mode we can now just skip the rest of the journal write
98 * entirely. 101 * entirely.
99 * 102 *
100 * Returns 1 if the journal needs to be aborted or 0 on success 103 * Returns 1 if the journal needs to be aborted or 0 on success
101 */ 104 */
102static int journal_write_commit_record(journal_t *journal, 105static int journal_submit_commit_record(journal_t *journal,
103 transaction_t *commit_transaction) 106 transaction_t *commit_transaction,
107 struct buffer_head **cbh,
108 __u32 crc32_sum)
104{ 109{
105 struct journal_head *descriptor; 110 struct journal_head *descriptor;
111 struct commit_header *tmp;
106 struct buffer_head *bh; 112 struct buffer_head *bh;
107 int i, ret; 113 int ret;
108 int barrier_done = 0; 114 int barrier_done = 0;
109 115
110 if (is_journal_aborted(journal)) 116 if (is_journal_aborted(journal))
@@ -116,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal,
116 122
117 bh = jh2bh(descriptor); 123 bh = jh2bh(descriptor);
118 124
119 /* AKPM: buglet - add `i' to tmp! */ 125 tmp = (struct commit_header *)bh->b_data;
120 for (i = 0; i < bh->b_size; i += 512) { 126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
121 journal_header_t *tmp = (journal_header_t*)bh->b_data; 127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
122 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
123 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 129
124 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 130 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) {
132 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
133 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
134 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
125 } 135 }
126 136
127 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "submit commit block");
138 lock_buffer(bh);
139
128 set_buffer_dirty(bh); 140 set_buffer_dirty(bh);
129 if (journal->j_flags & JBD2_BARRIER) { 141 set_buffer_uptodate(bh);
142 bh->b_end_io = journal_end_buffer_io_sync;
143
144 if (journal->j_flags & JBD2_BARRIER &&
145 !JBD2_HAS_COMPAT_FEATURE(journal,
146 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
130 set_buffer_ordered(bh); 147 set_buffer_ordered(bh);
131 barrier_done = 1; 148 barrier_done = 1;
132 } 149 }
133 ret = sync_dirty_buffer(bh); 150 ret = submit_bh(WRITE, bh);
151
134 /* is it possible for another commit to fail at roughly 152 /* is it possible for another commit to fail at roughly
135 * the same time as this one? If so, we don't want to 153 * the same time as this one? If so, we don't want to
136 * trust the barrier flag in the super, but instead want 154 * trust the barrier flag in the super, but instead want
@@ -151,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal,
151 clear_buffer_ordered(bh); 169 clear_buffer_ordered(bh);
152 set_buffer_uptodate(bh); 170 set_buffer_uptodate(bh);
153 set_buffer_dirty(bh); 171 set_buffer_dirty(bh);
154 ret = sync_dirty_buffer(bh); 172 ret = submit_bh(WRITE, bh);
155 } 173 }
156 put_bh(bh); /* One for getblk() */ 174 *cbh = bh;
157 jbd2_journal_put_journal_head(descriptor); 175 return ret;
176}
177
178/*
179 * This function along with journal_submit_commit_record
180 * allows to write the commit record asynchronously.
181 */
182static int journal_wait_on_commit_record(struct buffer_head *bh)
183{
184 int ret = 0;
185
186 clear_buffer_dirty(bh);
187 wait_on_buffer(bh);
188
189 if (unlikely(!buffer_uptodate(bh)))
190 ret = -EIO;
191 put_bh(bh); /* One for getblk() */
192 jbd2_journal_put_journal_head(bh2jh(bh));
158 193
159 return (ret == -EIO); 194 return ret;
160} 195}
161 196
197/*
198 * Wait for all submitted IO to complete.
199 */
200static int journal_wait_on_locked_list(journal_t *journal,
201 transaction_t *commit_transaction)
202{
203 int ret = 0;
204 struct journal_head *jh;
205
206 while (commit_transaction->t_locked_list) {
207 struct buffer_head *bh;
208
209 jh = commit_transaction->t_locked_list->b_tprev;
210 bh = jh2bh(jh);
211 get_bh(bh);
212 if (buffer_locked(bh)) {
213 spin_unlock(&journal->j_list_lock);
214 wait_on_buffer(bh);
215 if (unlikely(!buffer_uptodate(bh)))
216 ret = -EIO;
217 spin_lock(&journal->j_list_lock);
218 }
219 if (!inverted_lock(journal, bh)) {
220 put_bh(bh);
221 spin_lock(&journal->j_list_lock);
222 continue;
223 }
224 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
225 __jbd2_journal_unfile_buffer(jh);
226 jbd_unlock_bh_state(bh);
227 jbd2_journal_remove_journal_head(bh);
228 put_bh(bh);
229 } else {
230 jbd_unlock_bh_state(bh);
231 }
232 put_bh(bh);
233 cond_resched_lock(&journal->j_list_lock);
234 }
235 return ret;
236 }
237
162static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 238static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
163{ 239{
164 int i; 240 int i;
@@ -265,7 +341,7 @@ write_out_data:
265 put_bh(bh); 341 put_bh(bh);
266 } 342 }
267 343
268 if (lock_need_resched(&journal->j_list_lock)) { 344 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
269 spin_unlock(&journal->j_list_lock); 345 spin_unlock(&journal->j_list_lock);
270 goto write_out_data; 346 goto write_out_data;
271 } 347 }
@@ -274,7 +350,21 @@ write_out_data:
274 journal_do_submit_data(wbuf, bufs); 350 journal_do_submit_data(wbuf, bufs);
275} 351}
276 352
277static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 353static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
354{
355 struct page *page = bh->b_page;
356 char *addr;
357 __u32 checksum;
358
359 addr = kmap_atomic(page, KM_USER0);
360 checksum = crc32_be(crc32_sum,
361 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
362 kunmap_atomic(addr, KM_USER0);
363
364 return checksum;
365}
366
367static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
278 unsigned long long block) 368 unsigned long long block)
279{ 369{
280 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 370 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
@@ -290,6 +380,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
290 */ 380 */
291void jbd2_journal_commit_transaction(journal_t *journal) 381void jbd2_journal_commit_transaction(journal_t *journal)
292{ 382{
383 struct transaction_stats_s stats;
293 transaction_t *commit_transaction; 384 transaction_t *commit_transaction;
294 struct journal_head *jh, *new_jh, *descriptor; 385 struct journal_head *jh, *new_jh, *descriptor;
295 struct buffer_head **wbuf = journal->j_wbuf; 386 struct buffer_head **wbuf = journal->j_wbuf;
@@ -305,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
305 int tag_flag; 396 int tag_flag;
306 int i; 397 int i;
307 int tag_bytes = journal_tag_bytes(journal); 398 int tag_bytes = journal_tag_bytes(journal);
399 struct buffer_head *cbh = NULL; /* For transactional checksums */
400 __u32 crc32_sum = ~0;
308 401
309 /* 402 /*
310 * First job: lock down the current transaction and wait for 403 * First job: lock down the current transaction and wait for
@@ -337,6 +430,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
337 spin_lock(&journal->j_state_lock); 430 spin_lock(&journal->j_state_lock);
338 commit_transaction->t_state = T_LOCKED; 431 commit_transaction->t_state = T_LOCKED;
339 432
433 stats.u.run.rs_wait = commit_transaction->t_max_wait;
434 stats.u.run.rs_locked = jiffies;
435 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
436 stats.u.run.rs_locked);
437
340 spin_lock(&commit_transaction->t_handle_lock); 438 spin_lock(&commit_transaction->t_handle_lock);
341 while (commit_transaction->t_updates) { 439 while (commit_transaction->t_updates) {
342 DEFINE_WAIT(wait); 440 DEFINE_WAIT(wait);
@@ -407,6 +505,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
407 */ 505 */
408 jbd2_journal_switch_revoke_table(journal); 506 jbd2_journal_switch_revoke_table(journal);
409 507
508 stats.u.run.rs_flushing = jiffies;
509 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
510 stats.u.run.rs_flushing);
511
410 commit_transaction->t_state = T_FLUSH; 512 commit_transaction->t_state = T_FLUSH;
411 journal->j_committing_transaction = commit_transaction; 513 journal->j_committing_transaction = commit_transaction;
412 journal->j_running_transaction = NULL; 514 journal->j_running_transaction = NULL;
@@ -440,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
440 journal_submit_data_buffers(journal, commit_transaction); 542 journal_submit_data_buffers(journal, commit_transaction);
441 543
442 /* 544 /*
443 * Wait for all previously submitted IO to complete. 545 * Wait for all previously submitted IO to complete if commit
546 * record is to be written synchronously.
444 */ 547 */
445 spin_lock(&journal->j_list_lock); 548 spin_lock(&journal->j_list_lock);
446 while (commit_transaction->t_locked_list) { 549 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
447 struct buffer_head *bh; 550 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
551 err = journal_wait_on_locked_list(journal,
552 commit_transaction);
448 553
449 jh = commit_transaction->t_locked_list->b_tprev;
450 bh = jh2bh(jh);
451 get_bh(bh);
452 if (buffer_locked(bh)) {
453 spin_unlock(&journal->j_list_lock);
454 wait_on_buffer(bh);
455 if (unlikely(!buffer_uptodate(bh)))
456 err = -EIO;
457 spin_lock(&journal->j_list_lock);
458 }
459 if (!inverted_lock(journal, bh)) {
460 put_bh(bh);
461 spin_lock(&journal->j_list_lock);
462 continue;
463 }
464 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
465 __jbd2_journal_unfile_buffer(jh);
466 jbd_unlock_bh_state(bh);
467 jbd2_journal_remove_journal_head(bh);
468 put_bh(bh);
469 } else {
470 jbd_unlock_bh_state(bh);
471 }
472 put_bh(bh);
473 cond_resched_lock(&journal->j_list_lock);
474 }
475 spin_unlock(&journal->j_list_lock); 554 spin_unlock(&journal->j_list_lock);
476 555
477 if (err) 556 if (err)
@@ -498,6 +577,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
498 */ 577 */
499 commit_transaction->t_state = T_COMMIT; 578 commit_transaction->t_state = T_COMMIT;
500 579
580 stats.u.run.rs_logging = jiffies;
581 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
582 stats.u.run.rs_logging);
583 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
584 stats.u.run.rs_blocks_logged = 0;
585
501 descriptor = NULL; 586 descriptor = NULL;
502 bufs = 0; 587 bufs = 0;
503 while (commit_transaction->t_buffers) { 588 while (commit_transaction->t_buffers) {
@@ -639,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
639start_journal_io: 724start_journal_io:
640 for (i = 0; i < bufs; i++) { 725 for (i = 0; i < bufs; i++) {
641 struct buffer_head *bh = wbuf[i]; 726 struct buffer_head *bh = wbuf[i];
727 /*
728 * Compute checksum.
729 */
730 if (JBD2_HAS_COMPAT_FEATURE(journal,
731 JBD2_FEATURE_COMPAT_CHECKSUM)) {
732 crc32_sum =
733 jbd2_checksum_data(crc32_sum, bh);
734 }
735
642 lock_buffer(bh); 736 lock_buffer(bh);
643 clear_buffer_dirty(bh); 737 clear_buffer_dirty(bh);
644 set_buffer_uptodate(bh); 738 set_buffer_uptodate(bh);
@@ -646,6 +740,7 @@ start_journal_io:
646 submit_bh(WRITE, bh); 740 submit_bh(WRITE, bh);
647 } 741 }
648 cond_resched(); 742 cond_resched();
743 stats.u.run.rs_blocks_logged += bufs;
649 744
650 /* Force a new descriptor to be generated next 745 /* Force a new descriptor to be generated next
651 time round the loop. */ 746 time round the loop. */
@@ -654,6 +749,23 @@ start_journal_io:
654 } 749 }
655 } 750 }
656 751
752 /* Done it all: now write the commit record asynchronously. */
753
754 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
755 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
756 err = journal_submit_commit_record(journal, commit_transaction,
757 &cbh, crc32_sum);
758 if (err)
759 __jbd2_journal_abort_hard(journal);
760
761 spin_lock(&journal->j_list_lock);
762 err = journal_wait_on_locked_list(journal,
763 commit_transaction);
764 spin_unlock(&journal->j_list_lock);
765 if (err)
766 __jbd2_journal_abort_hard(journal);
767 }
768
657 /* Lo and behold: we have just managed to send a transaction to 769 /* Lo and behold: we have just managed to send a transaction to
658 the log. Before we can commit it, wait for the IO so far to 770 the log. Before we can commit it, wait for the IO so far to
659 complete. Control buffers being written are on the 771 complete. Control buffers being written are on the
@@ -753,8 +865,14 @@ wait_for_iobuf:
753 865
754 jbd_debug(3, "JBD: commit phase 6\n"); 866 jbd_debug(3, "JBD: commit phase 6\n");
755 867
756 if (journal_write_commit_record(journal, commit_transaction)) 868 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
757 err = -EIO; 869 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
870 err = journal_submit_commit_record(journal, commit_transaction,
871 &cbh, crc32_sum);
872 if (err)
873 __jbd2_journal_abort_hard(journal);
874 }
875 err = journal_wait_on_commit_record(cbh);
758 876
759 if (err) 877 if (err)
760 jbd2_journal_abort(journal, err); 878 jbd2_journal_abort(journal, err);
@@ -816,6 +934,7 @@ restart_loop:
816 cp_transaction = jh->b_cp_transaction; 934 cp_transaction = jh->b_cp_transaction;
817 if (cp_transaction) { 935 if (cp_transaction) {
818 JBUFFER_TRACE(jh, "remove from old cp transaction"); 936 JBUFFER_TRACE(jh, "remove from old cp transaction");
937 cp_transaction->t_chp_stats.cs_dropped++;
819 __jbd2_journal_remove_checkpoint(jh); 938 __jbd2_journal_remove_checkpoint(jh);
820 } 939 }
821 940
@@ -867,10 +986,10 @@ restart_loop:
867 } 986 }
868 spin_unlock(&journal->j_list_lock); 987 spin_unlock(&journal->j_list_lock);
869 /* 988 /*
870 * This is a bit sleazy. We borrow j_list_lock to protect 989 * This is a bit sleazy. We use j_list_lock to protect transition
871 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint. 990 * of a transaction into T_FINISHED state and calling
872 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but 991 * __jbd2_journal_drop_transaction(). Otherwise we could race with
873 * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint 992 * other checkpointing code processing the transaction...
874 */ 993 */
875 spin_lock(&journal->j_state_lock); 994 spin_lock(&journal->j_state_lock);
876 spin_lock(&journal->j_list_lock); 995 spin_lock(&journal->j_list_lock);
@@ -890,6 +1009,36 @@ restart_loop:
890 1009
891 J_ASSERT(commit_transaction->t_state == T_COMMIT); 1010 J_ASSERT(commit_transaction->t_state == T_COMMIT);
892 1011
1012 commit_transaction->t_start = jiffies;
1013 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1014 commit_transaction->t_start);
1015
1016 /*
1017 * File the transaction for history
1018 */
1019 stats.ts_type = JBD2_STATS_RUN;
1020 stats.ts_tid = commit_transaction->t_tid;
1021 stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1022 spin_lock(&journal->j_history_lock);
1023 memcpy(journal->j_history + journal->j_history_cur, &stats,
1024 sizeof(stats));
1025 if (++journal->j_history_cur == journal->j_history_max)
1026 journal->j_history_cur = 0;
1027
1028 /*
1029 * Calculate overall stats
1030 */
1031 journal->j_stats.ts_tid++;
1032 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1033 journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1034 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1035 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1036 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1037 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1038 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1039 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1040 spin_unlock(&journal->j_history_lock);
1041
893 commit_transaction->t_state = T_FINISHED; 1042 commit_transaction->t_state = T_FINISHED;
894 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1043 J_ASSERT(commit_transaction == journal->j_committing_transaction);
895 journal->j_commit_sequence = commit_transaction->t_tid; 1044 journal->j_commit_sequence = commit_transaction->t_tid;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6ddc5531587c..96ba846992e9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -36,6 +36,7 @@
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/page.h> 42#include <asm/page.h>
@@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
640 return jbd2_journal_add_journal_head(bh); 641 return jbd2_journal_add_journal_head(bh);
641} 642}
642 643
644struct jbd2_stats_proc_session {
645 journal_t *journal;
646 struct transaction_stats_s *stats;
647 int start;
648 int max;
649};
650
651static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
652 struct transaction_stats_s *ts,
653 int first)
654{
655 if (ts == s->stats + s->max)
656 ts = s->stats;
657 if (!first && ts == s->stats + s->start)
658 return NULL;
659 while (ts->ts_type == 0) {
660 ts++;
661 if (ts == s->stats + s->max)
662 ts = s->stats;
663 if (ts == s->stats + s->start)
664 return NULL;
665 }
666 return ts;
667
668}
669
670static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
671{
672 struct jbd2_stats_proc_session *s = seq->private;
673 struct transaction_stats_s *ts;
674 int l = *pos;
675
676 if (l == 0)
677 return SEQ_START_TOKEN;
678 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
679 if (!ts)
680 return NULL;
681 l--;
682 while (l) {
683 ts = jbd2_history_skip_empty(s, ++ts, 0);
684 if (!ts)
685 break;
686 l--;
687 }
688 return ts;
689}
690
691static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
692{
693 struct jbd2_stats_proc_session *s = seq->private;
694 struct transaction_stats_s *ts = v;
695
696 ++*pos;
697 if (v == SEQ_START_TOKEN)
698 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
699 else
700 return jbd2_history_skip_empty(s, ++ts, 0);
701}
702
703static int jbd2_seq_history_show(struct seq_file *seq, void *v)
704{
705 struct transaction_stats_s *ts = v;
706 if (v == SEQ_START_TOKEN) {
707 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
708 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
709 "wait", "run", "lock", "flush", "log", "hndls",
710 "block", "inlog", "ctime", "write", "drop",
711 "close");
712 return 0;
713 }
714 if (ts->ts_type == JBD2_STATS_RUN)
715 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
716 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
717 jiffies_to_msecs(ts->u.run.rs_wait),
718 jiffies_to_msecs(ts->u.run.rs_running),
719 jiffies_to_msecs(ts->u.run.rs_locked),
720 jiffies_to_msecs(ts->u.run.rs_flushing),
721 jiffies_to_msecs(ts->u.run.rs_logging),
722 ts->u.run.rs_handle_count,
723 ts->u.run.rs_blocks,
724 ts->u.run.rs_blocks_logged);
725 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
726 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
727 "C", ts->ts_tid, " ",
728 jiffies_to_msecs(ts->u.chp.cs_chp_time),
729 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
730 ts->u.chp.cs_forced_to_close);
731 else
732 J_ASSERT(0);
733 return 0;
734}
735
736static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
737{
738}
739
740static struct seq_operations jbd2_seq_history_ops = {
741 .start = jbd2_seq_history_start,
742 .next = jbd2_seq_history_next,
743 .stop = jbd2_seq_history_stop,
744 .show = jbd2_seq_history_show,
745};
746
747static int jbd2_seq_history_open(struct inode *inode, struct file *file)
748{
749 journal_t *journal = PDE(inode)->data;
750 struct jbd2_stats_proc_session *s;
751 int rc, size;
752
753 s = kmalloc(sizeof(*s), GFP_KERNEL);
754 if (s == NULL)
755 return -ENOMEM;
756 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
757 s->stats = kmalloc(size, GFP_KERNEL);
758 if (s->stats == NULL) {
759 kfree(s);
760 return -ENOMEM;
761 }
762 spin_lock(&journal->j_history_lock);
763 memcpy(s->stats, journal->j_history, size);
764 s->max = journal->j_history_max;
765 s->start = journal->j_history_cur % s->max;
766 spin_unlock(&journal->j_history_lock);
767
768 rc = seq_open(file, &jbd2_seq_history_ops);
769 if (rc == 0) {
770 struct seq_file *m = file->private_data;
771 m->private = s;
772 } else {
773 kfree(s->stats);
774 kfree(s);
775 }
776 return rc;
777
778}
779
780static int jbd2_seq_history_release(struct inode *inode, struct file *file)
781{
782 struct seq_file *seq = file->private_data;
783 struct jbd2_stats_proc_session *s = seq->private;
784
785 kfree(s->stats);
786 kfree(s);
787 return seq_release(inode, file);
788}
789
790static struct file_operations jbd2_seq_history_fops = {
791 .owner = THIS_MODULE,
792 .open = jbd2_seq_history_open,
793 .read = seq_read,
794 .llseek = seq_lseek,
795 .release = jbd2_seq_history_release,
796};
797
798static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
799{
800 return *pos ? NULL : SEQ_START_TOKEN;
801}
802
803static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
804{
805 return NULL;
806}
807
808static int jbd2_seq_info_show(struct seq_file *seq, void *v)
809{
810 struct jbd2_stats_proc_session *s = seq->private;
811
812 if (v != SEQ_START_TOKEN)
813 return 0;
814 seq_printf(seq, "%lu transaction, each upto %u blocks\n",
815 s->stats->ts_tid,
816 s->journal->j_max_transaction_buffers);
817 if (s->stats->ts_tid == 0)
818 return 0;
819 seq_printf(seq, "average: \n %ums waiting for transaction\n",
820 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
821 seq_printf(seq, " %ums running transaction\n",
822 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
823 seq_printf(seq, " %ums transaction was being locked\n",
824 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
825 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
826 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
827 seq_printf(seq, " %ums logging transaction\n",
828 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
829 seq_printf(seq, " %lu handles per transaction\n",
830 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
831 seq_printf(seq, " %lu blocks per transaction\n",
832 s->stats->u.run.rs_blocks / s->stats->ts_tid);
833 seq_printf(seq, " %lu logged blocks per transaction\n",
834 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
835 return 0;
836}
837
838static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
839{
840}
841
842static struct seq_operations jbd2_seq_info_ops = {
843 .start = jbd2_seq_info_start,
844 .next = jbd2_seq_info_next,
845 .stop = jbd2_seq_info_stop,
846 .show = jbd2_seq_info_show,
847};
848
849static int jbd2_seq_info_open(struct inode *inode, struct file *file)
850{
851 journal_t *journal = PDE(inode)->data;
852 struct jbd2_stats_proc_session *s;
853 int rc, size;
854
855 s = kmalloc(sizeof(*s), GFP_KERNEL);
856 if (s == NULL)
857 return -ENOMEM;
858 size = sizeof(struct transaction_stats_s);
859 s->stats = kmalloc(size, GFP_KERNEL);
860 if (s->stats == NULL) {
861 kfree(s);
862 return -ENOMEM;
863 }
864 spin_lock(&journal->j_history_lock);
865 memcpy(s->stats, &journal->j_stats, size);
866 s->journal = journal;
867 spin_unlock(&journal->j_history_lock);
868
869 rc = seq_open(file, &jbd2_seq_info_ops);
870 if (rc == 0) {
871 struct seq_file *m = file->private_data;
872 m->private = s;
873 } else {
874 kfree(s->stats);
875 kfree(s);
876 }
877 return rc;
878
879}
880
881static int jbd2_seq_info_release(struct inode *inode, struct file *file)
882{
883 struct seq_file *seq = file->private_data;
884 struct jbd2_stats_proc_session *s = seq->private;
885 kfree(s->stats);
886 kfree(s);
887 return seq_release(inode, file);
888}
889
890static struct file_operations jbd2_seq_info_fops = {
891 .owner = THIS_MODULE,
892 .open = jbd2_seq_info_open,
893 .read = seq_read,
894 .llseek = seq_lseek,
895 .release = jbd2_seq_info_release,
896};
897
898static struct proc_dir_entry *proc_jbd2_stats;
899
900static void jbd2_stats_proc_init(journal_t *journal)
901{
902 char name[BDEVNAME_SIZE];
903
904 snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
905 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
906 if (journal->j_proc_entry) {
907 struct proc_dir_entry *p;
908 p = create_proc_entry("history", S_IRUGO,
909 journal->j_proc_entry);
910 if (p) {
911 p->proc_fops = &jbd2_seq_history_fops;
912 p->data = journal;
913 p = create_proc_entry("info", S_IRUGO,
914 journal->j_proc_entry);
915 if (p) {
916 p->proc_fops = &jbd2_seq_info_fops;
917 p->data = journal;
918 }
919 }
920 }
921}
922
923static void jbd2_stats_proc_exit(journal_t *journal)
924{
925 char name[BDEVNAME_SIZE];
926
927 snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
928 remove_proc_entry("info", journal->j_proc_entry);
929 remove_proc_entry("history", journal->j_proc_entry);
930 remove_proc_entry(name, proc_jbd2_stats);
931}
932
933static void journal_init_stats(journal_t *journal)
934{
935 int size;
936
937 if (!proc_jbd2_stats)
938 return;
939
940 journal->j_history_max = 100;
941 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
942 journal->j_history = kzalloc(size, GFP_KERNEL);
943 if (!journal->j_history) {
944 journal->j_history_max = 0;
945 return;
946 }
947 spin_lock_init(&journal->j_history_lock);
948}
949
643/* 950/*
644 * Management for journal control blocks: functions to create and 951 * Management for journal control blocks: functions to create and
645 * destroy journal_t structures, and to initialise and read existing 952 * destroy journal_t structures, and to initialise and read existing
@@ -681,6 +988,9 @@ static journal_t * journal_init_common (void)
681 kfree(journal); 988 kfree(journal);
682 goto fail; 989 goto fail;
683 } 990 }
991
992 journal_init_stats(journal);
993
684 return journal; 994 return journal;
685fail: 995fail:
686 return NULL; 996 return NULL;
@@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
735 journal->j_fs_dev = fs_dev; 1045 journal->j_fs_dev = fs_dev;
736 journal->j_blk_offset = start; 1046 journal->j_blk_offset = start;
737 journal->j_maxlen = len; 1047 journal->j_maxlen = len;
1048 jbd2_stats_proc_init(journal);
738 1049
739 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1050 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
740 J_ASSERT(bh != NULL); 1051 J_ASSERT(bh != NULL);
@@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
773 1084
774 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; 1085 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
775 journal->j_blocksize = inode->i_sb->s_blocksize; 1086 journal->j_blocksize = inode->i_sb->s_blocksize;
1087 jbd2_stats_proc_init(journal);
776 1088
777 /* journal descriptor can store up to n blocks -bzzz */ 1089 /* journal descriptor can store up to n blocks -bzzz */
778 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1090 n = journal->j_blocksize / sizeof(journal_block_tag_t);
@@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal)
1153 brelse(journal->j_sb_buffer); 1465 brelse(journal->j_sb_buffer);
1154 } 1466 }
1155 1467
1468 if (journal->j_proc_entry)
1469 jbd2_stats_proc_exit(journal);
1156 if (journal->j_inode) 1470 if (journal->j_inode)
1157 iput(journal->j_inode); 1471 iput(journal->j_inode);
1158 if (journal->j_revoke) 1472 if (journal->j_revoke)
@@ -1264,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1264 return 1; 1578 return 1;
1265} 1579}
1266 1580
1581/*
1582 * jbd2_journal_clear_features () - Clear a given journal feature in the
1583 * superblock
1584 * @journal: Journal to act on.
1585 * @compat: bitmask of compatible features
1586 * @ro: bitmask of features that force read-only mount
1587 * @incompat: bitmask of incompatible features
1588 *
1589 * Clear a given journal feature as present on the
1590 * superblock.
1591 */
1592void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1593 unsigned long ro, unsigned long incompat)
1594{
1595 journal_superblock_t *sb;
1596
1597 jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
1598 compat, ro, incompat);
1599
1600 sb = journal->j_superblock;
1601
1602 sb->s_feature_compat &= ~cpu_to_be32(compat);
1603 sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
1604 sb->s_feature_incompat &= ~cpu_to_be32(incompat);
1605}
1606EXPORT_SYMBOL(jbd2_journal_clear_features);
1267 1607
1268/** 1608/**
1269 * int jbd2_journal_update_format () - Update on-disk journal structure. 1609 * int jbd2_journal_update_format () - Update on-disk journal structure.
@@ -1633,7 +1973,7 @@ static int journal_init_jbd2_journal_head_cache(void)
1633 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", 1973 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
1634 sizeof(struct journal_head), 1974 sizeof(struct journal_head),
1635 0, /* offset */ 1975 0, /* offset */
1636 0, /* flags */ 1976 SLAB_TEMPORARY, /* flags */
1637 NULL); /* ctor */ 1977 NULL); /* ctor */
1638 retval = 0; 1978 retval = 0;
1639 if (jbd2_journal_head_cache == 0) { 1979 if (jbd2_journal_head_cache == 0) {
@@ -1900,6 +2240,28 @@ static void __exit jbd2_remove_debugfs_entry(void)
1900 2240
1901#endif 2241#endif
1902 2242
2243#ifdef CONFIG_PROC_FS
2244
2245#define JBD2_STATS_PROC_NAME "fs/jbd2"
2246
2247static void __init jbd2_create_jbd_stats_proc_entry(void)
2248{
2249 proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
2250}
2251
2252static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2253{
2254 if (proc_jbd2_stats)
2255 remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
2256}
2257
2258#else
2259
2260#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
2261#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)
2262
2263#endif
2264
1903struct kmem_cache *jbd2_handle_cache; 2265struct kmem_cache *jbd2_handle_cache;
1904 2266
1905static int __init journal_init_handle_cache(void) 2267static int __init journal_init_handle_cache(void)
@@ -1907,7 +2269,7 @@ static int __init journal_init_handle_cache(void)
1907 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2269 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
1908 sizeof(handle_t), 2270 sizeof(handle_t),
1909 0, /* offset */ 2271 0, /* offset */
1910 0, /* flags */ 2272 SLAB_TEMPORARY, /* flags */
1911 NULL); /* ctor */ 2273 NULL); /* ctor */
1912 if (jbd2_handle_cache == NULL) { 2274 if (jbd2_handle_cache == NULL) {
1913 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2275 printk(KERN_EMERG "JBD: failed to create handle cache\n");
@@ -1955,6 +2317,7 @@ static int __init journal_init(void)
1955 if (ret != 0) 2317 if (ret != 0)
1956 jbd2_journal_destroy_caches(); 2318 jbd2_journal_destroy_caches();
1957 jbd2_create_debugfs_entry(); 2319 jbd2_create_debugfs_entry();
2320 jbd2_create_jbd_stats_proc_entry();
1958 return ret; 2321 return ret;
1959} 2322}
1960 2323
@@ -1966,6 +2329,7 @@ static void __exit journal_exit(void)
1966 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2329 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
1967#endif 2330#endif
1968 jbd2_remove_debugfs_entry(); 2331 jbd2_remove_debugfs_entry();
2332 jbd2_remove_jbd_stats_proc_entry();
1969 jbd2_journal_destroy_caches(); 2333 jbd2_journal_destroy_caches();
1970} 2334}
1971 2335
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d0ce627539ef..921680663fa2 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/crc32.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
316 return block; 317 return block;
317} 318}
318 319
320/*
321 * calc_chksums calculates the checksums for the blocks described in the
322 * descriptor block.
323 */
324static int calc_chksums(journal_t *journal, struct buffer_head *bh,
325 unsigned long *next_log_block, __u32 *crc32_sum)
326{
327 int i, num_blks, err;
328 unsigned long io_block;
329 struct buffer_head *obh;
330
331 num_blks = count_tags(journal, bh);
332 /* Calculate checksum of the descriptor block. */
333 *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
334
335 for (i = 0; i < num_blks; i++) {
336 io_block = (*next_log_block)++;
337 wrap(journal, *next_log_block);
338 err = jread(&obh, journal, io_block);
339 if (err) {
340 printk(KERN_ERR "JBD: IO error %d recovering block "
341 "%lu in log\n", err, io_block);
342 return 1;
343 } else {
344 *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
345 obh->b_size);
346 }
347 }
348 return 0;
349}
350
319static int do_one_pass(journal_t *journal, 351static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass) 352 struct recovery_info *info, enum passtype pass)
321{ 353{
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal,
328 unsigned int sequence; 360 unsigned int sequence;
329 int blocktype; 361 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal); 362 int tag_bytes = journal_tag_bytes(journal);
363 __u32 crc32_sum = ~0; /* Transactional Checksums */
331 364
332 /* Precompute the maximum metadata descriptors in a descriptor block */ 365 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC; 366 int MAX_BLOCKS_PER_DESC;
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal,
419 switch(blocktype) { 452 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK: 453 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it 454 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the 455 * in pass REPLAY; if journal_checksums enabled, then
423 * blocks it describes. */ 456 * calculate checksums in PASS_SCAN, otherwise,
457 * just skip over the blocks it describes. */
424 if (pass != PASS_REPLAY) { 458 if (pass != PASS_REPLAY) {
459 if (pass == PASS_SCAN &&
460 JBD2_HAS_COMPAT_FEATURE(journal,
461 JBD2_FEATURE_COMPAT_CHECKSUM) &&
462 !info->end_transaction) {
463 if (calc_chksums(journal, bh,
464 &next_log_block,
465 &crc32_sum)) {
466 put_bh(bh);
467 break;
468 }
469 put_bh(bh);
470 continue;
471 }
425 next_log_block += count_tags(journal, bh); 472 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block); 473 wrap(journal, next_log_block);
427 brelse(bh); 474 put_bh(bh);
428 continue; 475 continue;
429 } 476 }
430 477
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal,
516 continue; 563 continue;
517 564
518 case JBD2_COMMIT_BLOCK: 565 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to 566 /* How to differentiate between interrupted commit
520 * do other than move on to the next sequence 567 * and journal corruption ?
568 *
569 * {nth transaction}
570 * Checksum Verification Failed
571 * |
572 * ____________________
573 * | |
574 * async_commit sync_commit
575 * | |
576 * | GO TO NEXT "Journal Corruption"
577 * | TRANSACTION
578 * |
579 * {(n+1)th transanction}
580 * |
581 * _______|______________
582 * | |
583 * Commit block found Commit block not found
584 * | |
585 * "Journal Corruption" |
586 * _____________|_________
587 * | |
588 * nth trans corrupt OR nth trans
589 * and (n+1)th interrupted interrupted
590 * before commit block
591 * could reach the disk.
592 * (Cannot find the difference in above
593 * mentioned conditions. Hence assume
594 * "Interrupted Commit".)
595 */
596
597 /* Found an expected commit block: if checksums
598 * are present verify them in PASS_SCAN; else not
599 * much to do other than move on to the next sequence
521 * number. */ 600 * number. */
601 if (pass == PASS_SCAN &&
602 JBD2_HAS_COMPAT_FEATURE(journal,
603 JBD2_FEATURE_COMPAT_CHECKSUM)) {
604 int chksum_err, chksum_seen;
605 struct commit_header *cbh =
606 (struct commit_header *)bh->b_data;
607 unsigned found_chksum =
608 be32_to_cpu(cbh->h_chksum[0]);
609
610 chksum_err = chksum_seen = 0;
611
612 if (info->end_transaction) {
613 printk(KERN_ERR "JBD: Transaction %u "
614 "found to be corrupt.\n",
615 next_commit_ID - 1);
616 brelse(bh);
617 break;
618 }
619
620 if (crc32_sum == found_chksum &&
621 cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
622 cbh->h_chksum_size ==
623 JBD2_CRC32_CHKSUM_SIZE)
624 chksum_seen = 1;
625 else if (!(cbh->h_chksum_type == 0 &&
626 cbh->h_chksum_size == 0 &&
627 found_chksum == 0 &&
628 !chksum_seen))
629 /*
630 * If fs is mounted using an old kernel and then
631 * kernel with journal_chksum is used then we
632 * get a situation where the journal flag has
633 * checksum flag set but checksums are not
634 * present i.e chksum = 0, in the individual
635 * commit blocks.
636 * Hence to avoid checksum failures, in this
637 * situation, this extra check is added.
638 */
639 chksum_err = 1;
640
641 if (chksum_err) {
642 info->end_transaction = next_commit_ID;
643
644 if (!JBD2_HAS_COMPAT_FEATURE(journal,
645 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
646 printk(KERN_ERR
647 "JBD: Transaction %u "
648 "found to be corrupt.\n",
649 next_commit_ID);
650 brelse(bh);
651 break;
652 }
653 }
654 crc32_sum = ~0;
655 }
522 brelse(bh); 656 brelse(bh);
523 next_commit_ID++; 657 next_commit_ID++;
524 continue; 658 continue;
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal,
554 * transaction marks the end of the valid log. 688 * transaction marks the end of the valid log.
555 */ 689 */
556 690
557 if (pass == PASS_SCAN) 691 if (pass == PASS_SCAN) {
558 info->end_transaction = next_commit_ID; 692 if (!info->end_transaction)
559 else { 693 info->end_transaction = next_commit_ID;
694 } else {
560 /* It's really bad news if different passes end up at 695 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */ 696 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) { 697 if (info->end_transaction != next_commit_ID) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 3595fd432d5b..df36f42e19e1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -171,13 +171,15 @@ int __init jbd2_journal_init_revoke_caches(void)
171{ 171{
172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", 172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
173 sizeof(struct jbd2_revoke_record_s), 173 sizeof(struct jbd2_revoke_record_s),
174 0, SLAB_HWCACHE_ALIGN, NULL); 174 0,
175 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
176 NULL);
175 if (jbd2_revoke_record_cache == 0) 177 if (jbd2_revoke_record_cache == 0)
176 return -ENOMEM; 178 return -ENOMEM;
177 179
178 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", 180 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
179 sizeof(struct jbd2_revoke_table_s), 181 sizeof(struct jbd2_revoke_table_s),
180 0, 0, NULL); 182 0, SLAB_TEMPORARY, NULL);
181 if (jbd2_revoke_table_cache == 0) { 183 if (jbd2_revoke_table_cache == 0) {
182 kmem_cache_destroy(jbd2_revoke_record_cache); 184 kmem_cache_destroy(jbd2_revoke_record_cache);
183 jbd2_revoke_record_cache = NULL; 185 jbd2_revoke_record_cache = NULL;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b1fcf2b3dca3..b9b0b6f899b9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -54,11 +54,13 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
54 spin_lock_init(&transaction->t_handle_lock); 54 spin_lock_init(&transaction->t_handle_lock);
55 55
56 /* Set up the commit timer for the new transaction. */ 56 /* Set up the commit timer for the new transaction. */
57 journal->j_commit_timer.expires = transaction->t_expires; 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
58 add_timer(&journal->j_commit_timer); 58 add_timer(&journal->j_commit_timer);
59 59
60 J_ASSERT(journal->j_running_transaction == NULL); 60 J_ASSERT(journal->j_running_transaction == NULL);
61 journal->j_running_transaction = transaction; 61 journal->j_running_transaction = transaction;
62 transaction->t_max_wait = 0;
63 transaction->t_start = jiffies;
62 64
63 return transaction; 65 return transaction;
64} 66}
@@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
85 int nblocks = handle->h_buffer_credits; 87 int nblocks = handle->h_buffer_credits;
86 transaction_t *new_transaction = NULL; 88 transaction_t *new_transaction = NULL;
87 int ret = 0; 89 int ret = 0;
90 unsigned long ts = jiffies;
88 91
89 if (nblocks > journal->j_max_transaction_buffers) { 92 if (nblocks > journal->j_max_transaction_buffers) {
90 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 93 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -217,6 +220,12 @@ repeat_locked:
217 /* OK, account for the buffers that this operation expects to 220 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */ 221 * use and add the handle to the running transaction. */
219 222
223 if (time_after(transaction->t_start, ts)) {
224 ts = jbd2_time_diff(ts, transaction->t_start);
225 if (ts > transaction->t_max_wait)
226 transaction->t_max_wait = ts;
227 }
228
220 handle->h_transaction = transaction; 229 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks; 230 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++; 231 transaction->t_updates++;
@@ -232,6 +241,8 @@ out:
232 return ret; 241 return ret;
233} 242}
234 243
244static struct lock_class_key jbd2_handle_key;
245
235/* Allocate a new handle. This should probably be in a slab... */ 246/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks) 247static handle_t *new_handle(int nblocks)
237{ 248{
@@ -242,6 +253,9 @@ static handle_t *new_handle(int nblocks)
242 handle->h_buffer_credits = nblocks; 253 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1; 254 handle->h_ref = 1;
244 255
256 lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
257 &jbd2_handle_key, 0);
258
245 return handle; 259 return handle;
246} 260}
247 261
@@ -284,7 +298,11 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
284 jbd2_free_handle(handle); 298 jbd2_free_handle(handle);
285 current->journal_info = NULL; 299 current->journal_info = NULL;
286 handle = ERR_PTR(err); 300 handle = ERR_PTR(err);
301 goto out;
287 } 302 }
303
304 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
305out:
288 return handle; 306 return handle;
289} 307}
290 308
@@ -1164,7 +1182,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1164 } 1182 }
1165 1183
1166 /* That test should have eliminated the following case: */ 1184 /* That test should have eliminated the following case: */
1167 J_ASSERT_JH(jh, jh->b_frozen_data == 0); 1185 J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1168 1186
1169 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1187 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1170 spin_lock(&journal->j_list_lock); 1188 spin_lock(&journal->j_list_lock);
@@ -1410,6 +1428,8 @@ int jbd2_journal_stop(handle_t *handle)
1410 spin_unlock(&journal->j_state_lock); 1428 spin_unlock(&journal->j_state_lock);
1411 } 1429 }
1412 1430
1431 lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
1432
1413 jbd2_free_handle(handle); 1433 jbd2_free_handle(handle);
1414 return err; 1434 return err;
1415} 1435}
@@ -1512,7 +1532,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1512 1532
1513 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); 1533 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1514 if (jh->b_jlist != BJ_None) 1534 if (jh->b_jlist != BJ_None)
1515 J_ASSERT_JH(jh, transaction != 0); 1535 J_ASSERT_JH(jh, transaction != NULL);
1516 1536
1517 switch (jh->b_jlist) { 1537 switch (jh->b_jlist) {
1518 case BJ_None: 1538 case BJ_None:
@@ -1581,11 +1601,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1581 if (buffer_locked(bh) || buffer_dirty(bh)) 1601 if (buffer_locked(bh) || buffer_dirty(bh))
1582 goto out; 1602 goto out;
1583 1603
1584 if (jh->b_next_transaction != 0) 1604 if (jh->b_next_transaction != NULL)
1585 goto out; 1605 goto out;
1586 1606
1587 spin_lock(&journal->j_list_lock); 1607 spin_lock(&journal->j_list_lock);
1588 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { 1608 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
1589 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { 1609 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1590 /* A written-back ordered data buffer */ 1610 /* A written-back ordered data buffer */
1591 JBUFFER_TRACE(jh, "release data"); 1611 JBUFFER_TRACE(jh, "release data");
@@ -1593,7 +1613,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1593 jbd2_journal_remove_journal_head(bh); 1613 jbd2_journal_remove_journal_head(bh);
1594 __brelse(bh); 1614 __brelse(bh);
1595 } 1615 }
1596 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { 1616 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1597 /* written-back checkpointed metadata buffer */ 1617 /* written-back checkpointed metadata buffer */
1598 if (jh->b_jlist == BJ_None) { 1618 if (jh->b_jlist == BJ_None) {
1599 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1619 JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1953,7 +1973,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
1953 1973
1954 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); 1974 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1955 J_ASSERT_JH(jh, jh->b_transaction == transaction || 1975 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1956 jh->b_transaction == 0); 1976 jh->b_transaction == NULL);
1957 1977
1958 if (jh->b_transaction && jh->b_jlist == jlist) 1978 if (jh->b_transaction && jh->b_jlist == jlist)
1959 return; 1979 return;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index d568ae846741..8adebd3e43c6 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -105,7 +105,7 @@ static int jffs2_garbage_collect_thread(void *_c)
105 105
106 /* Put_super will send a SIGKILL and then wait on the sem. 106 /* Put_super will send a SIGKILL and then wait on the sem.
107 */ 107 */
108 while (signal_pending(current)) { 108 while (signal_pending(current) || freezing(current)) {
109 siginfo_t info; 109 siginfo_t info;
110 unsigned long signr; 110 unsigned long signr;
111 111
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index df25ecc418af..4dcc05819998 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -284,11 +284,11 @@ static struct dir_table_slot *find_index(struct inode *ip, u32 index,
284 release_metapage(*mp); 284 release_metapage(*mp);
285 *mp = NULL; 285 *mp = NULL;
286 } 286 }
287 if (*mp == 0) { 287 if (!(*mp)) {
288 *lblock = blkno; 288 *lblock = blkno;
289 *mp = read_index_page(ip, blkno); 289 *mp = read_index_page(ip, blkno);
290 } 290 }
291 if (*mp == 0) { 291 if (!(*mp)) {
292 jfs_err("free_index: error reading directory table"); 292 jfs_err("free_index: error reading directory table");
293 return NULL; 293 return NULL;
294 } 294 }
@@ -413,7 +413,8 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
413 } 413 }
414 ip->i_size = PSIZE; 414 ip->i_size = PSIZE;
415 415
416 if ((mp = get_index_page(ip, 0)) == 0) { 416 mp = get_index_page(ip, 0);
417 if (!mp) {
417 jfs_err("add_index: get_metapage failed!"); 418 jfs_err("add_index: get_metapage failed!");
418 xtTruncate(tid, ip, 0, COMMIT_PWMAP); 419 xtTruncate(tid, ip, 0, COMMIT_PWMAP);
419 memcpy(&jfs_ip->i_dirtable, temp_table, 420 memcpy(&jfs_ip->i_dirtable, temp_table,
@@ -461,7 +462,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
461 } else 462 } else
462 mp = read_index_page(ip, blkno); 463 mp = read_index_page(ip, blkno);
463 464
464 if (mp == 0) { 465 if (!mp) {
465 jfs_err("add_index: get/read_metapage failed!"); 466 jfs_err("add_index: get/read_metapage failed!");
466 goto clean_up; 467 goto clean_up;
467 } 468 }
@@ -499,7 +500,7 @@ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
499 500
500 dirtab_slot = find_index(ip, index, &mp, &lblock); 501 dirtab_slot = find_index(ip, index, &mp, &lblock);
501 502
502 if (dirtab_slot == 0) 503 if (!dirtab_slot)
503 return; 504 return;
504 505
505 dirtab_slot->flag = DIR_INDEX_FREE; 506 dirtab_slot->flag = DIR_INDEX_FREE;
@@ -526,7 +527,7 @@ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
526 527
527 dirtab_slot = find_index(ip, index, mp, lblock); 528 dirtab_slot = find_index(ip, index, mp, lblock);
528 529
529 if (dirtab_slot == 0) 530 if (!dirtab_slot)
530 return; 531 return;
531 532
532 DTSaddress(dirtab_slot, bn); 533 DTSaddress(dirtab_slot, bn);
@@ -552,7 +553,7 @@ static int read_index(struct inode *ip, u32 index,
552 struct dir_table_slot *slot; 553 struct dir_table_slot *slot;
553 554
554 slot = find_index(ip, index, &mp, &lblock); 555 slot = find_index(ip, index, &mp, &lblock);
555 if (slot == 0) { 556 if (!slot) {
556 return -EIO; 557 return -EIO;
557 } 558 }
558 559
@@ -592,10 +593,8 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
592 struct component_name ciKey; 593 struct component_name ciKey;
593 struct super_block *sb = ip->i_sb; 594 struct super_block *sb = ip->i_sb;
594 595
595 ciKey.name = 596 ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
596 (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), 597 if (!ciKey.name) {
597 GFP_NOFS);
598 if (ciKey.name == 0) {
599 rc = -ENOMEM; 598 rc = -ENOMEM;
600 goto dtSearch_Exit2; 599 goto dtSearch_Exit2;
601 } 600 }
@@ -957,10 +956,8 @@ static int dtSplitUp(tid_t tid,
957 smp = split->mp; 956 smp = split->mp;
958 sp = DT_PAGE(ip, smp); 957 sp = DT_PAGE(ip, smp);
959 958
960 key.name = 959 key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
961 (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), 960 if (!key.name) {
962 GFP_NOFS);
963 if (key.name == 0) {
964 DT_PUTPAGE(smp); 961 DT_PUTPAGE(smp);
965 rc = -ENOMEM; 962 rc = -ENOMEM;
966 goto dtSplitUp_Exit; 963 goto dtSplitUp_Exit;
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 8561c6ecece0..cdac2d5bafeb 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -74,7 +74,7 @@ struct idtentry {
74#define DTIHDRDATALEN 11 74#define DTIHDRDATALEN 11
75 75
76/* compute number of slots for entry */ 76/* compute number of slots for entry */
77#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 ) 77#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15))
78 78
79 79
80/* 80/*
@@ -133,7 +133,7 @@ struct dir_table_slot {
133 ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) 133 ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
134 134
135/* compute number of slots for entry */ 135/* compute number of slots for entry */
136#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 ) 136#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15))
137#define NDTLEAF NDTINTERNAL 137#define NDTLEAF NDTINTERNAL
138 138
139 139
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3870ba8b9086..9bf29f771737 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -381,7 +381,7 @@ int diRead(struct inode *ip)
381 381
382 /* read the page of disk inode */ 382 /* read the page of disk inode */
383 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); 383 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
384 if (mp == 0) { 384 if (!mp) {
385 jfs_err("diRead: read_metapage failed"); 385 jfs_err("diRead: read_metapage failed");
386 return -EIO; 386 return -EIO;
387 } 387 }
@@ -654,7 +654,7 @@ int diWrite(tid_t tid, struct inode *ip)
654 /* read the page of disk inode */ 654 /* read the page of disk inode */
655 retry: 655 retry:
656 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); 656 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
657 if (mp == 0) 657 if (!mp)
658 return -EIO; 658 return -EIO;
659 659
660 /* get the pointer to the disk inode */ 660 /* get the pointer to the disk inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 15a3974cdeeb..325a9679b95a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -208,6 +208,17 @@ static struct lmStat {
208} lmStat; 208} lmStat;
209#endif 209#endif
210 210
211static void write_special_inodes(struct jfs_log *log,
212 int (*writer)(struct address_space *))
213{
214 struct jfs_sb_info *sbi;
215
216 list_for_each_entry(sbi, &log->sb_list, log_list) {
217 writer(sbi->ipbmap->i_mapping);
218 writer(sbi->ipimap->i_mapping);
219 writer(sbi->direct_inode->i_mapping);
220 }
221}
211 222
212/* 223/*
213 * NAME: lmLog() 224 * NAME: lmLog()
@@ -935,22 +946,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
935 struct lrd lrd; 946 struct lrd lrd;
936 int lsn; 947 int lsn;
937 struct logsyncblk *lp; 948 struct logsyncblk *lp;
938 struct jfs_sb_info *sbi;
939 unsigned long flags; 949 unsigned long flags;
940 950
941 /* push dirty metapages out to disk */ 951 /* push dirty metapages out to disk */
942 if (hard_sync) 952 if (hard_sync)
943 list_for_each_entry(sbi, &log->sb_list, log_list) { 953 write_special_inodes(log, filemap_fdatawrite);
944 filemap_fdatawrite(sbi->ipbmap->i_mapping);
945 filemap_fdatawrite(sbi->ipimap->i_mapping);
946 filemap_fdatawrite(sbi->direct_inode->i_mapping);
947 }
948 else 954 else
949 list_for_each_entry(sbi, &log->sb_list, log_list) { 955 write_special_inodes(log, filemap_flush);
950 filemap_flush(sbi->ipbmap->i_mapping);
951 filemap_flush(sbi->ipimap->i_mapping);
952 filemap_flush(sbi->direct_inode->i_mapping);
953 }
954 956
955 /* 957 /*
956 * forward syncpt 958 * forward syncpt
@@ -1536,7 +1538,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1536{ 1538{
1537 int i; 1539 int i;
1538 struct tblock *target = NULL; 1540 struct tblock *target = NULL;
1539 struct jfs_sb_info *sbi;
1540 1541
1541 /* jfs_write_inode may call us during read-only mount */ 1542 /* jfs_write_inode may call us during read-only mount */
1542 if (!log) 1543 if (!log)
@@ -1598,11 +1599,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1598 if (wait < 2) 1599 if (wait < 2)
1599 return; 1600 return;
1600 1601
1601 list_for_each_entry(sbi, &log->sb_list, log_list) { 1602 write_special_inodes(log, filemap_fdatawrite);
1602 filemap_fdatawrite(sbi->ipbmap->i_mapping);
1603 filemap_fdatawrite(sbi->ipimap->i_mapping);
1604 filemap_fdatawrite(sbi->direct_inode->i_mapping);
1605 }
1606 1603
1607 /* 1604 /*
1608 * If there was recent activity, we may need to wait 1605 * If there was recent activity, we may need to wait
@@ -1611,6 +1608,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1611 if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { 1608 if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
1612 for (i = 0; i < 200; i++) { /* Too much? */ 1609 for (i = 0; i < 200; i++) { /* Too much? */
1613 msleep(250); 1610 msleep(250);
1611 write_special_inodes(log, filemap_fdatawrite);
1614 if (list_empty(&log->cqueue) && 1612 if (list_empty(&log->cqueue) &&
1615 list_empty(&log->synclist)) 1613 list_empty(&log->synclist))
1616 break; 1614 break;
@@ -2347,7 +2345,7 @@ int jfsIOWait(void *arg)
2347 2345
2348 do { 2346 do {
2349 spin_lock_irq(&log_redrive_lock); 2347 spin_lock_irq(&log_redrive_lock);
2350 while ((bp = log_redrive_list) != 0) { 2348 while ((bp = log_redrive_list)) {
2351 log_redrive_list = bp->l_redrive_next; 2349 log_redrive_list = bp->l_redrive_next;
2352 bp->l_redrive_next = NULL; 2350 bp->l_redrive_next = NULL;
2353 spin_unlock_irq(&log_redrive_lock); 2351 spin_unlock_irq(&log_redrive_lock);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index f5cd8d38af7a..d1e64f2f2fcd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -39,11 +39,11 @@ static struct {
39#endif 39#endif
40 40
41#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) 41#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag)
42#define trylock_metapage(mp) test_and_set_bit(META_locked, &(mp)->flag) 42#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag)
43 43
44static inline void unlock_metapage(struct metapage *mp) 44static inline void unlock_metapage(struct metapage *mp)
45{ 45{
46 clear_bit(META_locked, &mp->flag); 46 clear_bit_unlock(META_locked, &mp->flag);
47 wake_up(&mp->wait); 47 wake_up(&mp->wait);
48} 48}
49 49
@@ -88,7 +88,7 @@ struct meta_anchor {
88}; 88};
89#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) 89#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
90 90
91static inline struct metapage *page_to_mp(struct page *page, uint offset) 91static inline struct metapage *page_to_mp(struct page *page, int offset)
92{ 92{
93 if (!PagePrivate(page)) 93 if (!PagePrivate(page))
94 return NULL; 94 return NULL;
@@ -153,7 +153,7 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
153} 153}
154 154
155#else 155#else
156static inline struct metapage *page_to_mp(struct page *page, uint offset) 156static inline struct metapage *page_to_mp(struct page *page, int offset)
157{ 157{
158 return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; 158 return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
159} 159}
@@ -249,7 +249,7 @@ static inline void drop_metapage(struct page *page, struct metapage *mp)
249 */ 249 */
250 250
251static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, 251static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
252 unsigned int *len) 252 int *len)
253{ 253{
254 int rc = 0; 254 int rc = 0;
255 int xflag; 255 int xflag;
@@ -352,25 +352,27 @@ static void metapage_write_end_io(struct bio *bio, int err)
352static int metapage_writepage(struct page *page, struct writeback_control *wbc) 352static int metapage_writepage(struct page *page, struct writeback_control *wbc)
353{ 353{
354 struct bio *bio = NULL; 354 struct bio *bio = NULL;
355 unsigned int block_offset; /* block offset of mp within page */ 355 int block_offset; /* block offset of mp within page */
356 struct inode *inode = page->mapping->host; 356 struct inode *inode = page->mapping->host;
357 unsigned int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; 357 int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
358 unsigned int len; 358 int len;
359 unsigned int xlen; 359 int xlen;
360 struct metapage *mp; 360 struct metapage *mp;
361 int redirty = 0; 361 int redirty = 0;
362 sector_t lblock; 362 sector_t lblock;
363 int nr_underway = 0;
363 sector_t pblock; 364 sector_t pblock;
364 sector_t next_block = 0; 365 sector_t next_block = 0;
365 sector_t page_start; 366 sector_t page_start;
366 unsigned long bio_bytes = 0; 367 unsigned long bio_bytes = 0;
367 unsigned long bio_offset = 0; 368 unsigned long bio_offset = 0;
368 unsigned int offset; 369 int offset;
369 370
370 page_start = (sector_t)page->index << 371 page_start = (sector_t)page->index <<
371 (PAGE_CACHE_SHIFT - inode->i_blkbits); 372 (PAGE_CACHE_SHIFT - inode->i_blkbits);
372 BUG_ON(!PageLocked(page)); 373 BUG_ON(!PageLocked(page));
373 BUG_ON(PageWriteback(page)); 374 BUG_ON(PageWriteback(page));
375 set_page_writeback(page);
374 376
375 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { 377 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
376 mp = page_to_mp(page, offset); 378 mp = page_to_mp(page, offset);
@@ -413,11 +415,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
413 if (!bio->bi_size) 415 if (!bio->bi_size)
414 goto dump_bio; 416 goto dump_bio;
415 submit_bio(WRITE, bio); 417 submit_bio(WRITE, bio);
418 nr_underway++;
416 bio = NULL; 419 bio = NULL;
417 } else { 420 } else
418 set_page_writeback(page);
419 inc_io(page); 421 inc_io(page);
420 }
421 xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; 422 xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
422 pblock = metapage_get_blocks(inode, lblock, &xlen); 423 pblock = metapage_get_blocks(inode, lblock, &xlen);
423 if (!pblock) { 424 if (!pblock) {
@@ -427,7 +428,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
427 continue; 428 continue;
428 } 429 }
429 set_bit(META_io, &mp->flag); 430 set_bit(META_io, &mp->flag);
430 len = min(xlen, (uint) JFS_SBI(inode->i_sb)->nbperpage); 431 len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
431 432
432 bio = bio_alloc(GFP_NOFS, 1); 433 bio = bio_alloc(GFP_NOFS, 1);
433 bio->bi_bdev = inode->i_sb->s_bdev; 434 bio->bi_bdev = inode->i_sb->s_bdev;
@@ -449,12 +450,16 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
449 goto dump_bio; 450 goto dump_bio;
450 451
451 submit_bio(WRITE, bio); 452 submit_bio(WRITE, bio);
453 nr_underway++;
452 } 454 }
453 if (redirty) 455 if (redirty)
454 redirty_page_for_writepage(wbc, page); 456 redirty_page_for_writepage(wbc, page);
455 457
456 unlock_page(page); 458 unlock_page(page);
457 459
460 if (nr_underway == 0)
461 end_page_writeback(page);
462
458 return 0; 463 return 0;
459add_failed: 464add_failed:
460 /* We should never reach here, since we're only adding one vec */ 465 /* We should never reach here, since we're only adding one vec */
@@ -475,13 +480,13 @@ static int metapage_readpage(struct file *fp, struct page *page)
475{ 480{
476 struct inode *inode = page->mapping->host; 481 struct inode *inode = page->mapping->host;
477 struct bio *bio = NULL; 482 struct bio *bio = NULL;
478 unsigned int block_offset; 483 int block_offset;
479 unsigned int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; 484 int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
480 sector_t page_start; /* address of page in fs blocks */ 485 sector_t page_start; /* address of page in fs blocks */
481 sector_t pblock; 486 sector_t pblock;
482 unsigned int xlen; 487 int xlen;
483 unsigned int len; 488 unsigned int len;
484 unsigned int offset; 489 int offset;
485 490
486 BUG_ON(!PageLocked(page)); 491 BUG_ON(!PageLocked(page));
487 page_start = (sector_t)page->index << 492 page_start = (sector_t)page->index <<
@@ -530,7 +535,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
530{ 535{
531 struct metapage *mp; 536 struct metapage *mp;
532 int ret = 1; 537 int ret = 1;
533 unsigned int offset; 538 int offset;
534 539
535 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { 540 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
536 mp = page_to_mp(page, offset); 541 mp = page_to_mp(page, offset);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 644429acb8c0..7b698f2ec45a 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -147,7 +147,7 @@ int jfs_mount(struct super_block *sb)
147 */ 147 */
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (ipaimap2 == 0) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Faild to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 7971f37534a3..adcf92d3b603 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
68 /* 68 /*
69 * Wait for outstanding transactions to be written to log: 69 * Wait for outstanding transactions to be written to log:
70 */ 70 */
71 jfs_flush_journal(log, 2); 71 jfs_flush_journal(log, 1);
72 72
73 /* 73 /*
74 * close fileset inode allocation map (aka fileset inode) 74 * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
146 * 146 *
147 * remove file system from log active file system list. 147 * remove file system from log active file system list.
148 */ 148 */
149 jfs_flush_journal(log, 2); 149 jfs_flush_journal(log, 1);
150 150
151 /* 151 /*
152 * Make sure all metadata makes it to disk 152 * Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4e0a8493cef6..f8718de3505e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1103,8 +1103,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1103 * Make sure dest inode number (if any) is what we think it is 1103 * Make sure dest inode number (if any) is what we think it is
1104 */ 1104 */
1105 rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); 1105 rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
1106 if (rc == 0) { 1106 if (!rc) {
1107 if ((new_ip == 0) || (ino != new_ip->i_ino)) { 1107 if ((!new_ip) || (ino != new_ip->i_ino)) {
1108 rc = -ESTALE; 1108 rc = -ESTALE;
1109 goto out3; 1109 goto out3;
1110 } 1110 }
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 71984ee95346..7f24a0bb08ca 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -172,7 +172,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
172 */ 172 */
173 t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) 173 t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
174 << L2BPERDMAP; 174 << L2BPERDMAP;
175 t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50; 175 t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50;
176 newFSCKSize = t32 << sbi->l2nbperpage; 176 newFSCKSize = t32 << sbi->l2nbperpage;
177 newFSCKAddress = newLogAddress - newFSCKSize; 177 newFSCKAddress = newLogAddress - newFSCKSize;
178 178
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 314bb4ff1ba8..70a14001c98f 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -598,6 +598,12 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
598 seq_printf(seq, ",umask=%03o", sbi->umask); 598 seq_printf(seq, ",umask=%03o", sbi->umask);
599 if (sbi->flag & JFS_NOINTEGRITY) 599 if (sbi->flag & JFS_NOINTEGRITY)
600 seq_puts(seq, ",nointegrity"); 600 seq_puts(seq, ",nointegrity");
601 if (sbi->nls_tab)
602 seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
603 if (sbi->flag & JFS_ERR_CONTINUE)
604 seq_printf(seq, ",errors=continue");
605 if (sbi->flag & JFS_ERR_PANIC)
606 seq_printf(seq, ",errors=panic");
601 607
602#ifdef CONFIG_QUOTA 608#ifdef CONFIG_QUOTA
603 if (sbi->flag & JFS_USRQUOTA) 609 if (sbi->flag & JFS_USRQUOTA)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d070b18e539d..0b45fd3a4bfd 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -41,6 +41,48 @@ struct nlm_wait {
41 41
42static LIST_HEAD(nlm_blocked); 42static LIST_HEAD(nlm_blocked);
43 43
44/**
45 * nlmclnt_init - Set up per-NFS mount point lockd data structures
46 * @nlm_init: pointer to arguments structure
47 *
48 * Returns pointer to an appropriate nlm_host struct,
49 * or an ERR_PTR value.
50 */
51struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
52{
53 struct nlm_host *host;
54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
55 int status;
56
57 status = lockd_up(nlm_init->protocol);
58 if (status < 0)
59 return ERR_PTR(status);
60
61 host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
62 nlm_init->protocol, nlm_version,
63 nlm_init->hostname,
64 strlen(nlm_init->hostname));
65 if (host == NULL) {
66 lockd_down();
67 return ERR_PTR(-ENOLCK);
68 }
69
70 return host;
71}
72EXPORT_SYMBOL_GPL(nlmclnt_init);
73
74/**
75 * nlmclnt_done - Release resources allocated by nlmclnt_init()
76 * @host: nlm_host structure reserved by nlmclnt_init()
77 *
78 */
79void nlmclnt_done(struct nlm_host *host)
80{
81 nlm_release_host(host);
82 lockd_down();
83}
84EXPORT_SYMBOL_GPL(nlmclnt_done);
85
44/* 86/*
45 * Queue up a lock for blocking so that the GRANTED request can see it 87 * Queue up a lock for blocking so that the GRANTED request can see it
46 */ 88 */
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index a10343bed160..b6b74a60e1eb 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -145,34 +145,21 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
145 BUG_ON(req->a_args.lock.fl.fl_ops != NULL); 145 BUG_ON(req->a_args.lock.fl.fl_ops != NULL);
146} 146}
147 147
148/* 148/**
149 * This is the main entry point for the NLM client. 149 * nlmclnt_proc - Perform a single client-side lock request
150 * @host: address of a valid nlm_host context representing the NLM server
151 * @cmd: fcntl-style file lock operation to perform
152 * @fl: address of arguments for the lock operation
153 *
150 */ 154 */
151int 155int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
152nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
153{ 156{
154 struct rpc_clnt *client = NFS_CLIENT(inode);
155 struct sockaddr_in addr;
156 struct nfs_server *nfssrv = NFS_SERVER(inode);
157 struct nlm_host *host;
158 struct nlm_rqst *call; 157 struct nlm_rqst *call;
159 sigset_t oldset; 158 sigset_t oldset;
160 unsigned long flags; 159 unsigned long flags;
161 int status, vers; 160 int status;
162
163 vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1;
164 if (NFS_PROTO(inode)->version > 3) {
165 printk(KERN_NOTICE "NFSv4 file locking not implemented!\n");
166 return -ENOLCK;
167 }
168
169 rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
170 host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
171 nfssrv->nfs_client->cl_hostname,
172 strlen(nfssrv->nfs_client->cl_hostname));
173 if (host == NULL)
174 return -ENOLCK;
175 161
162 nlm_get_host(host);
176 call = nlm_alloc_call(host); 163 call = nlm_alloc_call(host);
177 if (call == NULL) 164 if (call == NULL)
178 return -ENOMEM; 165 return -ENOMEM;
@@ -219,7 +206,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
219 dprintk("lockd: clnt proc returns %d\n", status); 206 dprintk("lockd: clnt proc returns %d\n", status);
220 return status; 207 return status;
221} 208}
222EXPORT_SYMBOL(nlmclnt_proc); 209EXPORT_SYMBOL_GPL(nlmclnt_proc);
223 210
224/* 211/*
225 * Allocate an NLM RPC call struct 212 * Allocate an NLM RPC call struct
@@ -257,7 +244,7 @@ void nlm_release_call(struct nlm_rqst *call)
257 244
258static void nlmclnt_rpc_release(void *data) 245static void nlmclnt_rpc_release(void *data)
259{ 246{
260 return nlm_release_call(data); 247 nlm_release_call(data);
261} 248}
262 249
263static int nlm_wait_on_grace(wait_queue_head_t *queue) 250static int nlm_wait_on_grace(wait_queue_head_t *queue)
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 633653bff944..3e459e18cc31 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -612,8 +612,7 @@ const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
612 * called with BKL held. 612 * called with BKL held.
613 */ 613 */
614 static char buf[2*NLM_MAXCOOKIELEN+1]; 614 static char buf[2*NLM_MAXCOOKIELEN+1];
615 int i; 615 unsigned int i, len = sizeof(buf);
616 int len = sizeof(buf);
617 char *p = buf; 616 char *p = buf;
618 617
619 len--; /* allow for trailing \0 */ 618 len--; /* allow for trailing \0 */
diff --git a/fs/namei.c b/fs/namei.c
index 3b993db26cee..73e2e665817a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,7 +1605,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1605 if (S_ISLNK(inode->i_mode)) 1605 if (S_ISLNK(inode->i_mode))
1606 return -ELOOP; 1606 return -ELOOP;
1607 1607
1608 if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) 1608 if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
1609 return -EISDIR; 1609 return -EISDIR;
1610 1610
1611 /* 1611 /*
@@ -1620,7 +1620,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1620 return -EACCES; 1620 return -EACCES;
1621 1621
1622 flag &= ~O_TRUNC; 1622 flag &= ~O_TRUNC;
1623 } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE)) 1623 } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
1624 return -EROFS; 1624 return -EROFS;
1625 1625
1626 error = vfs_permission(nd, acc_mode); 1626 error = vfs_permission(nd, acc_mode);
diff --git a/fs/namespace.c b/fs/namespace.c
index 06083885b21e..61bf376e29e8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,8 +41,8 @@ static struct kmem_cache *mnt_cache __read_mostly;
41static struct rw_semaphore namespace_sem; 41static struct rw_semaphore namespace_sem;
42 42
43/* /sys/fs */ 43/* /sys/fs */
44decl_subsys(fs, NULL, NULL); 44struct kobject *fs_kobj;
45EXPORT_SYMBOL_GPL(fs_subsys); 45EXPORT_SYMBOL_GPL(fs_kobj);
46 46
47static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 47static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
48{ 48{
@@ -1861,10 +1861,9 @@ void __init mnt_init(void)
1861 if (err) 1861 if (err)
1862 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 1862 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
1863 __FUNCTION__, err); 1863 __FUNCTION__, err);
1864 err = subsystem_register(&fs_subsys); 1864 fs_kobj = kobject_create_and_add("fs", NULL);
1865 if (err) 1865 if (!fs_kobj)
1866 printk(KERN_WARNING "%s: subsystem_register error: %d\n", 1866 printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__);
1867 __FUNCTION__, err);
1868 init_rootfs(); 1867 init_rootfs();
1869 init_mount_tree(); 1868 init_mount_tree();
1870} 1869}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a796be5051bf..9b6bbf1b9787 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -73,8 +73,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
73 complete(&nfs_callback_info.started); 73 complete(&nfs_callback_info.started);
74 74
75 for(;;) { 75 for(;;) {
76 char buf[RPC_MAX_ADDRBUFLEN];
77
78 if (signalled()) { 76 if (signalled()) {
79 if (nfs_callback_info.users == 0) 77 if (nfs_callback_info.users == 0)
80 break; 78 break;
@@ -92,8 +90,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
92 __FUNCTION__, -err); 90 __FUNCTION__, -err);
93 break; 91 break;
94 } 92 }
95 dprintk("%s: request from %s\n", __FUNCTION__,
96 svc_print_addr(rqstp, buf, sizeof(buf)));
97 svc_process(rqstp); 93 svc_process(rqstp);
98 } 94 }
99 95
@@ -168,12 +164,11 @@ void nfs_callback_down(void)
168 164
169static int nfs_callback_authenticate(struct svc_rqst *rqstp) 165static int nfs_callback_authenticate(struct svc_rqst *rqstp)
170{ 166{
171 struct sockaddr_in *addr = svc_addr_in(rqstp);
172 struct nfs_client *clp; 167 struct nfs_client *clp;
173 char buf[RPC_MAX_ADDRBUFLEN]; 168 char buf[RPC_MAX_ADDRBUFLEN];
174 169
175 /* Don't talk to strangers */ 170 /* Don't talk to strangers */
176 clp = nfs_find_client(addr, 4); 171 clp = nfs_find_client(svc_addr(rqstp), 4);
177 if (clp == NULL) 172 if (clp == NULL)
178 return SVC_DROP; 173 return SVC_DROP;
179 174
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index c2bb14e053e1..bb25d2135ff1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,7 @@ struct cb_compound_hdr_res {
38}; 38};
39 39
40struct cb_getattrargs { 40struct cb_getattrargs {
41 struct sockaddr_in *addr; 41 struct sockaddr *addr;
42 struct nfs_fh fh; 42 struct nfs_fh fh;
43 uint32_t bitmap[2]; 43 uint32_t bitmap[2];
44}; 44};
@@ -53,7 +53,7 @@ struct cb_getattrres {
53}; 53};
54 54
55struct cb_recallargs { 55struct cb_recallargs {
56 struct sockaddr_in *addr; 56 struct sockaddr *addr;
57 struct nfs_fh fh; 57 struct nfs_fh fh;
58 nfs4_stateid stateid; 58 nfs4_stateid stateid;
59 uint32_t truncate; 59 uint32_t truncate;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 72e55d83756d..15f7785048d3 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,7 +12,9 @@
12#include "delegation.h" 12#include "delegation.h"
13#include "internal.h" 13#include "internal.h"
14 14
15#ifdef NFS_DEBUG
15#define NFSDBG_FACILITY NFSDBG_CALLBACK 16#define NFSDBG_FACILITY NFSDBG_CALLBACK
17#endif
16 18
17__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 19__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
18{ 20{
@@ -20,12 +22,16 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
20 struct nfs_delegation *delegation; 22 struct nfs_delegation *delegation;
21 struct nfs_inode *nfsi; 23 struct nfs_inode *nfsi;
22 struct inode *inode; 24 struct inode *inode;
23 25
24 res->bitmap[0] = res->bitmap[1] = 0; 26 res->bitmap[0] = res->bitmap[1] = 0;
25 res->status = htonl(NFS4ERR_BADHANDLE); 27 res->status = htonl(NFS4ERR_BADHANDLE);
26 clp = nfs_find_client(args->addr, 4); 28 clp = nfs_find_client(args->addr, 4);
27 if (clp == NULL) 29 if (clp == NULL)
28 goto out; 30 goto out;
31
32 dprintk("NFS: GETATTR callback request from %s\n",
33 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
34
29 inode = nfs_delegation_find_inode(clp, &args->fh); 35 inode = nfs_delegation_find_inode(clp, &args->fh);
30 if (inode == NULL) 36 if (inode == NULL)
31 goto out_putclient; 37 goto out_putclient;
@@ -65,23 +71,32 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
65 clp = nfs_find_client(args->addr, 4); 71 clp = nfs_find_client(args->addr, 4);
66 if (clp == NULL) 72 if (clp == NULL)
67 goto out; 73 goto out;
68 inode = nfs_delegation_find_inode(clp, &args->fh); 74
69 if (inode == NULL) 75 dprintk("NFS: RECALL callback request from %s\n",
70 goto out_putclient; 76 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
71 /* Set up a helper thread to actually return the delegation */ 77
72 switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { 78 do {
73 case 0: 79 struct nfs_client *prev = clp;
74 res = 0; 80
75 break; 81 inode = nfs_delegation_find_inode(clp, &args->fh);
76 case -ENOENT: 82 if (inode != NULL) {
77 res = htonl(NFS4ERR_BAD_STATEID); 83 /* Set up a helper thread to actually return the delegation */
78 break; 84 switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
79 default: 85 case 0:
80 res = htonl(NFS4ERR_RESOURCE); 86 res = 0;
81 } 87 break;
82 iput(inode); 88 case -ENOENT:
83out_putclient: 89 if (res != 0)
84 nfs_put_client(clp); 90 res = htonl(NFS4ERR_BAD_STATEID);
91 break;
92 default:
93 res = htonl(NFS4ERR_RESOURCE);
94 }
95 iput(inode);
96 }
97 clp = nfs_find_client_next(prev);
98 nfs_put_client(prev);
99 } while (clp != NULL);
85out: 100out:
86 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res)); 101 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
87 return res; 102 return res;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 058ade7efe79..c63eb720b68b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -139,7 +139,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
139 if (unlikely(status != 0)) 139 if (unlikely(status != 0))
140 return status; 140 return status;
141 /* We do not like overly long tags! */ 141 /* We do not like overly long tags! */
142 if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) { 142 if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
143 printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", 143 printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
144 __FUNCTION__, hdr->taglen); 144 __FUNCTION__, hdr->taglen);
145 return htonl(NFS4ERR_RESOURCE); 145 return htonl(NFS4ERR_RESOURCE);
@@ -176,7 +176,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
176 status = decode_fh(xdr, &args->fh); 176 status = decode_fh(xdr, &args->fh);
177 if (unlikely(status != 0)) 177 if (unlikely(status != 0))
178 goto out; 178 goto out;
179 args->addr = svc_addr_in(rqstp); 179 args->addr = svc_addr(rqstp);
180 status = decode_bitmap(xdr, args->bitmap); 180 status = decode_bitmap(xdr, args->bitmap);
181out: 181out:
182 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); 182 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
@@ -188,7 +188,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
188 __be32 *p; 188 __be32 *p;
189 __be32 status; 189 __be32 status;
190 190
191 args->addr = svc_addr_in(rqstp); 191 args->addr = svc_addr(rqstp);
192 status = decode_stateid(xdr, &args->stateid); 192 status = decode_stateid(xdr, &args->stateid);
193 if (unlikely(status != 0)) 193 if (unlikely(status != 0))
194 goto out; 194 goto out;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 310fa2f4cbb8..c5c0175898f6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -34,6 +34,8 @@
34#include <linux/nfs_idmap.h> 34#include <linux/nfs_idmap.h>
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h>
38#include <net/ipv6.h>
37#include <linux/nfs_xdr.h> 39#include <linux/nfs_xdr.h>
38 40
39#include <asm/system.h> 41#include <asm/system.h>
@@ -93,22 +95,30 @@ struct rpc_program nfsacl_program = {
93}; 95};
94#endif /* CONFIG_NFS_V3_ACL */ 96#endif /* CONFIG_NFS_V3_ACL */
95 97
98struct nfs_client_initdata {
99 const char *hostname;
100 const struct sockaddr *addr;
101 size_t addrlen;
102 const struct nfs_rpc_ops *rpc_ops;
103 int proto;
104};
105
96/* 106/*
97 * Allocate a shared client record 107 * Allocate a shared client record
98 * 108 *
99 * Since these are allocated/deallocated very rarely, we don't 109 * Since these are allocated/deallocated very rarely, we don't
100 * bother putting them in a slab cache... 110 * bother putting them in a slab cache...
101 */ 111 */
102static struct nfs_client *nfs_alloc_client(const char *hostname, 112static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
103 const struct sockaddr_in *addr,
104 int nfsversion)
105{ 113{
106 struct nfs_client *clp; 114 struct nfs_client *clp;
107 115
108 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) 116 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
109 goto error_0; 117 goto error_0;
110 118
111 if (nfsversion == 4) { 119 clp->rpc_ops = cl_init->rpc_ops;
120
121 if (cl_init->rpc_ops->version == 4) {
112 if (nfs_callback_up() < 0) 122 if (nfs_callback_up() < 0)
113 goto error_2; 123 goto error_2;
114 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); 124 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
@@ -117,11 +127,11 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
117 atomic_set(&clp->cl_count, 1); 127 atomic_set(&clp->cl_count, 1);
118 clp->cl_cons_state = NFS_CS_INITING; 128 clp->cl_cons_state = NFS_CS_INITING;
119 129
120 clp->cl_nfsversion = nfsversion; 130 memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
121 memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); 131 clp->cl_addrlen = cl_init->addrlen;
122 132
123 if (hostname) { 133 if (cl_init->hostname) {
124 clp->cl_hostname = kstrdup(hostname, GFP_KERNEL); 134 clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
125 if (!clp->cl_hostname) 135 if (!clp->cl_hostname)
126 goto error_3; 136 goto error_3;
127 } 137 }
@@ -129,6 +139,8 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
129 INIT_LIST_HEAD(&clp->cl_superblocks); 139 INIT_LIST_HEAD(&clp->cl_superblocks);
130 clp->cl_rpcclient = ERR_PTR(-EINVAL); 140 clp->cl_rpcclient = ERR_PTR(-EINVAL);
131 141
142 clp->cl_proto = cl_init->proto;
143
132#ifdef CONFIG_NFS_V4 144#ifdef CONFIG_NFS_V4
133 init_rwsem(&clp->cl_sem); 145 init_rwsem(&clp->cl_sem);
134 INIT_LIST_HEAD(&clp->cl_delegations); 146 INIT_LIST_HEAD(&clp->cl_delegations);
@@ -166,7 +178,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
166 */ 178 */
167static void nfs_free_client(struct nfs_client *clp) 179static void nfs_free_client(struct nfs_client *clp)
168{ 180{
169 dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion); 181 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
170 182
171 nfs4_shutdown_client(clp); 183 nfs4_shutdown_client(clp);
172 184
@@ -203,76 +215,148 @@ void nfs_put_client(struct nfs_client *clp)
203 } 215 }
204} 216}
205 217
218static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
219 const struct sockaddr_in *sa2)
220{
221 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
222}
223
224static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1,
225 const struct sockaddr_in6 *sa2)
226{
227 return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr);
228}
229
230static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
231 const struct sockaddr *sa2)
232{
233 switch (sa1->sa_family) {
234 case AF_INET:
235 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
236 (const struct sockaddr_in *)sa2);
237 case AF_INET6:
238 return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
239 (const struct sockaddr_in6 *)sa2);
240 }
241 BUG();
242}
243
206/* 244/*
207 * Find a client by address 245 * Find a client by IP address and protocol version
208 * - caller must hold nfs_client_lock 246 * - returns NULL if no such client
209 */ 247 */
210static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port) 248struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
211{ 249{
212 struct nfs_client *clp; 250 struct nfs_client *clp;
213 251
252 spin_lock(&nfs_client_lock);
214 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 253 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
254 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
255
215 /* Don't match clients that failed to initialise properly */ 256 /* Don't match clients that failed to initialise properly */
216 if (clp->cl_cons_state < 0) 257 if (clp->cl_cons_state != NFS_CS_READY)
217 continue; 258 continue;
218 259
219 /* Different NFS versions cannot share the same nfs_client */ 260 /* Different NFS versions cannot share the same nfs_client */
220 if (clp->cl_nfsversion != nfsversion) 261 if (clp->rpc_ops->version != nfsversion)
221 continue; 262 continue;
222 263
223 if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr, 264 if (addr->sa_family != clap->sa_family)
224 sizeof(clp->cl_addr.sin_addr)) != 0) 265 continue;
266 /* Match only the IP address, not the port number */
267 if (!nfs_sockaddr_match_ipaddr(addr, clap))
225 continue; 268 continue;
226 269
227 if (!match_port || clp->cl_addr.sin_port == addr->sin_port) 270 atomic_inc(&clp->cl_count);
228 goto found; 271 spin_unlock(&nfs_client_lock);
272 return clp;
229 } 273 }
230 274 spin_unlock(&nfs_client_lock);
231 return NULL; 275 return NULL;
232
233found:
234 atomic_inc(&clp->cl_count);
235 return clp;
236} 276}
237 277
238/* 278/*
239 * Find a client by IP address and protocol version 279 * Find a client by IP address and protocol version
240 * - returns NULL if no such client 280 * - returns NULL if no such client
241 */ 281 */
242struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion) 282struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
243{ 283{
244 struct nfs_client *clp; 284 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
285 u32 nfsvers = clp->rpc_ops->version;
245 286
246 spin_lock(&nfs_client_lock); 287 spin_lock(&nfs_client_lock);
247 clp = __nfs_find_client(addr, nfsversion, 0); 288 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
289 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
290
291 /* Don't match clients that failed to initialise properly */
292 if (clp->cl_cons_state != NFS_CS_READY)
293 continue;
294
295 /* Different NFS versions cannot share the same nfs_client */
296 if (clp->rpc_ops->version != nfsvers)
297 continue;
298
299 if (sap->sa_family != clap->sa_family)
300 continue;
301 /* Match only the IP address, not the port number */
302 if (!nfs_sockaddr_match_ipaddr(sap, clap))
303 continue;
304
305 atomic_inc(&clp->cl_count);
306 spin_unlock(&nfs_client_lock);
307 return clp;
308 }
248 spin_unlock(&nfs_client_lock); 309 spin_unlock(&nfs_client_lock);
249 if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) { 310 return NULL;
250 nfs_put_client(clp); 311}
251 clp = NULL; 312
313/*
314 * Find an nfs_client on the list that matches the initialisation data
315 * that is supplied.
316 */
317static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
318{
319 struct nfs_client *clp;
320
321 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
322 /* Don't match clients that failed to initialise properly */
323 if (clp->cl_cons_state < 0)
324 continue;
325
326 /* Different NFS versions cannot share the same nfs_client */
327 if (clp->rpc_ops != data->rpc_ops)
328 continue;
329
330 if (clp->cl_proto != data->proto)
331 continue;
332
333 /* Match the full socket address */
334 if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
335 continue;
336
337 atomic_inc(&clp->cl_count);
338 return clp;
252 } 339 }
253 return clp; 340 return NULL;
254} 341}
255 342
256/* 343/*
257 * Look up a client by IP address and protocol version 344 * Look up a client by IP address and protocol version
258 * - creates a new record if one doesn't yet exist 345 * - creates a new record if one doesn't yet exist
259 */ 346 */
260static struct nfs_client *nfs_get_client(const char *hostname, 347static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
261 const struct sockaddr_in *addr,
262 int nfsversion)
263{ 348{
264 struct nfs_client *clp, *new = NULL; 349 struct nfs_client *clp, *new = NULL;
265 int error; 350 int error;
266 351
267 dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n", 352 dprintk("--> nfs_get_client(%s,v%u)\n",
268 hostname ?: "", NIPQUAD(addr->sin_addr), 353 cl_init->hostname ?: "", cl_init->rpc_ops->version);
269 addr->sin_port, nfsversion);
270 354
271 /* see if the client already exists */ 355 /* see if the client already exists */
272 do { 356 do {
273 spin_lock(&nfs_client_lock); 357 spin_lock(&nfs_client_lock);
274 358
275 clp = __nfs_find_client(addr, nfsversion, 1); 359 clp = nfs_match_client(cl_init);
276 if (clp) 360 if (clp)
277 goto found_client; 361 goto found_client;
278 if (new) 362 if (new)
@@ -280,7 +364,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
280 364
281 spin_unlock(&nfs_client_lock); 365 spin_unlock(&nfs_client_lock);
282 366
283 new = nfs_alloc_client(hostname, addr, nfsversion); 367 new = nfs_alloc_client(cl_init);
284 } while (new); 368 } while (new);
285 369
286 return ERR_PTR(-ENOMEM); 370 return ERR_PTR(-ENOMEM);
@@ -344,12 +428,16 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
344 switch (proto) { 428 switch (proto) {
345 case XPRT_TRANSPORT_TCP: 429 case XPRT_TRANSPORT_TCP:
346 case XPRT_TRANSPORT_RDMA: 430 case XPRT_TRANSPORT_RDMA:
347 if (!to->to_initval) 431 if (to->to_initval == 0)
348 to->to_initval = 60 * HZ; 432 to->to_initval = 60 * HZ;
349 if (to->to_initval > NFS_MAX_TCP_TIMEOUT) 433 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
350 to->to_initval = NFS_MAX_TCP_TIMEOUT; 434 to->to_initval = NFS_MAX_TCP_TIMEOUT;
351 to->to_increment = to->to_initval; 435 to->to_increment = to->to_initval;
352 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); 436 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
437 if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
438 to->to_maxval = NFS_MAX_TCP_TIMEOUT;
439 if (to->to_maxval < to->to_initval)
440 to->to_maxval = to->to_initval;
353 to->to_exponential = 0; 441 to->to_exponential = 0;
354 break; 442 break;
355 case XPRT_TRANSPORT_UDP: 443 case XPRT_TRANSPORT_UDP:
@@ -367,19 +455,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
367/* 455/*
368 * Create an RPC client handle 456 * Create an RPC client handle
369 */ 457 */
370static int nfs_create_rpc_client(struct nfs_client *clp, int proto, 458static int nfs_create_rpc_client(struct nfs_client *clp,
371 unsigned int timeo, 459 const struct rpc_timeout *timeparms,
372 unsigned int retrans, 460 rpc_authflavor_t flavor,
373 rpc_authflavor_t flavor, 461 int flags)
374 int flags)
375{ 462{
376 struct rpc_timeout timeparms;
377 struct rpc_clnt *clnt = NULL; 463 struct rpc_clnt *clnt = NULL;
378 struct rpc_create_args args = { 464 struct rpc_create_args args = {
379 .protocol = proto, 465 .protocol = clp->cl_proto,
380 .address = (struct sockaddr *)&clp->cl_addr, 466 .address = (struct sockaddr *)&clp->cl_addr,
381 .addrsize = sizeof(clp->cl_addr), 467 .addrsize = clp->cl_addrlen,
382 .timeout = &timeparms, 468 .timeout = timeparms,
383 .servername = clp->cl_hostname, 469 .servername = clp->cl_hostname,
384 .program = &nfs_program, 470 .program = &nfs_program,
385 .version = clp->rpc_ops->version, 471 .version = clp->rpc_ops->version,
@@ -390,10 +476,6 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
390 if (!IS_ERR(clp->cl_rpcclient)) 476 if (!IS_ERR(clp->cl_rpcclient))
391 return 0; 477 return 0;
392 478
393 nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
394 clp->retrans_timeo = timeparms.to_initval;
395 clp->retrans_count = timeparms.to_retries;
396
397 clnt = rpc_create(&args); 479 clnt = rpc_create(&args);
398 if (IS_ERR(clnt)) { 480 if (IS_ERR(clnt)) {
399 dprintk("%s: cannot create RPC client. Error = %ld\n", 481 dprintk("%s: cannot create RPC client. Error = %ld\n",
@@ -410,11 +492,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
410 */ 492 */
411static void nfs_destroy_server(struct nfs_server *server) 493static void nfs_destroy_server(struct nfs_server *server)
412{ 494{
413 if (!IS_ERR(server->client_acl))
414 rpc_shutdown_client(server->client_acl);
415
416 if (!(server->flags & NFS_MOUNT_NONLM)) 495 if (!(server->flags & NFS_MOUNT_NONLM))
417 lockd_down(); /* release rpc.lockd */ 496 nlmclnt_done(server->nlm_host);
418} 497}
419 498
420/* 499/*
@@ -422,20 +501,29 @@ static void nfs_destroy_server(struct nfs_server *server)
422 */ 501 */
423static int nfs_start_lockd(struct nfs_server *server) 502static int nfs_start_lockd(struct nfs_server *server)
424{ 503{
425 int error = 0; 504 struct nlm_host *host;
505 struct nfs_client *clp = server->nfs_client;
506 struct nlmclnt_initdata nlm_init = {
507 .hostname = clp->cl_hostname,
508 .address = (struct sockaddr *)&clp->cl_addr,
509 .addrlen = clp->cl_addrlen,
510 .protocol = server->flags & NFS_MOUNT_TCP ?
511 IPPROTO_TCP : IPPROTO_UDP,
512 .nfs_version = clp->rpc_ops->version,
513 };
426 514
427 if (server->nfs_client->cl_nfsversion > 3) 515 if (nlm_init.nfs_version > 3)
428 goto out; 516 return 0;
429 if (server->flags & NFS_MOUNT_NONLM) 517 if (server->flags & NFS_MOUNT_NONLM)
430 goto out; 518 return 0;
431 error = lockd_up((server->flags & NFS_MOUNT_TCP) ? 519
432 IPPROTO_TCP : IPPROTO_UDP); 520 host = nlmclnt_init(&nlm_init);
433 if (error < 0) 521 if (IS_ERR(host))
434 server->flags |= NFS_MOUNT_NONLM; 522 return PTR_ERR(host);
435 else 523
436 server->destroy = nfs_destroy_server; 524 server->nlm_host = host;
437out: 525 server->destroy = nfs_destroy_server;
438 return error; 526 return 0;
439} 527}
440 528
441/* 529/*
@@ -444,7 +532,7 @@ out:
444#ifdef CONFIG_NFS_V3_ACL 532#ifdef CONFIG_NFS_V3_ACL
445static void nfs_init_server_aclclient(struct nfs_server *server) 533static void nfs_init_server_aclclient(struct nfs_server *server)
446{ 534{
447 if (server->nfs_client->cl_nfsversion != 3) 535 if (server->nfs_client->rpc_ops->version != 3)
448 goto out_noacl; 536 goto out_noacl;
449 if (server->flags & NFS_MOUNT_NOACL) 537 if (server->flags & NFS_MOUNT_NOACL)
450 goto out_noacl; 538 goto out_noacl;
@@ -471,7 +559,9 @@ static inline void nfs_init_server_aclclient(struct nfs_server *server)
471/* 559/*
472 * Create a general RPC client 560 * Create a general RPC client
473 */ 561 */
474static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour) 562static int nfs_init_server_rpcclient(struct nfs_server *server,
563 const struct rpc_timeout *timeo,
564 rpc_authflavor_t pseudoflavour)
475{ 565{
476 struct nfs_client *clp = server->nfs_client; 566 struct nfs_client *clp = server->nfs_client;
477 567
@@ -481,6 +571,11 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
481 return PTR_ERR(server->client); 571 return PTR_ERR(server->client);
482 } 572 }
483 573
574 memcpy(&server->client->cl_timeout_default,
575 timeo,
576 sizeof(server->client->cl_timeout_default));
577 server->client->cl_timeout = &server->client->cl_timeout_default;
578
484 if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { 579 if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
485 struct rpc_auth *auth; 580 struct rpc_auth *auth;
486 581
@@ -501,6 +596,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
501 * Initialise an NFS2 or NFS3 client 596 * Initialise an NFS2 or NFS3 client
502 */ 597 */
503static int nfs_init_client(struct nfs_client *clp, 598static int nfs_init_client(struct nfs_client *clp,
599 const struct rpc_timeout *timeparms,
504 const struct nfs_parsed_mount_data *data) 600 const struct nfs_parsed_mount_data *data)
505{ 601{
506 int error; 602 int error;
@@ -511,18 +607,11 @@ static int nfs_init_client(struct nfs_client *clp,
511 return 0; 607 return 0;
512 } 608 }
513 609
514 /* Check NFS protocol revision and initialize RPC op vector */
515 clp->rpc_ops = &nfs_v2_clientops;
516#ifdef CONFIG_NFS_V3
517 if (clp->cl_nfsversion == 3)
518 clp->rpc_ops = &nfs_v3_clientops;
519#endif
520 /* 610 /*
521 * Create a client RPC handle for doing FSSTAT with UNIX auth only 611 * Create a client RPC handle for doing FSSTAT with UNIX auth only
522 * - RFC 2623, sec 2.3.2 612 * - RFC 2623, sec 2.3.2
523 */ 613 */
524 error = nfs_create_rpc_client(clp, data->nfs_server.protocol, 614 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0);
525 data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
526 if (error < 0) 615 if (error < 0)
527 goto error; 616 goto error;
528 nfs_mark_client_ready(clp, NFS_CS_READY); 617 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -540,25 +629,34 @@ error:
540static int nfs_init_server(struct nfs_server *server, 629static int nfs_init_server(struct nfs_server *server,
541 const struct nfs_parsed_mount_data *data) 630 const struct nfs_parsed_mount_data *data)
542{ 631{
632 struct nfs_client_initdata cl_init = {
633 .hostname = data->nfs_server.hostname,
634 .addr = (const struct sockaddr *)&data->nfs_server.address,
635 .addrlen = data->nfs_server.addrlen,
636 .rpc_ops = &nfs_v2_clientops,
637 .proto = data->nfs_server.protocol,
638 };
639 struct rpc_timeout timeparms;
543 struct nfs_client *clp; 640 struct nfs_client *clp;
544 int error, nfsvers = 2; 641 int error;
545 642
546 dprintk("--> nfs_init_server()\n"); 643 dprintk("--> nfs_init_server()\n");
547 644
548#ifdef CONFIG_NFS_V3 645#ifdef CONFIG_NFS_V3
549 if (data->flags & NFS_MOUNT_VER3) 646 if (data->flags & NFS_MOUNT_VER3)
550 nfsvers = 3; 647 cl_init.rpc_ops = &nfs_v3_clientops;
551#endif 648#endif
552 649
553 /* Allocate or find a client reference we can use */ 650 /* Allocate or find a client reference we can use */
554 clp = nfs_get_client(data->nfs_server.hostname, 651 clp = nfs_get_client(&cl_init);
555 &data->nfs_server.address, nfsvers);
556 if (IS_ERR(clp)) { 652 if (IS_ERR(clp)) {
557 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 653 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
558 return PTR_ERR(clp); 654 return PTR_ERR(clp);
559 } 655 }
560 656
561 error = nfs_init_client(clp, data); 657 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
658 data->timeo, data->retrans);
659 error = nfs_init_client(clp, &timeparms, data);
562 if (error < 0) 660 if (error < 0)
563 goto error; 661 goto error;
564 662
@@ -582,7 +680,7 @@ static int nfs_init_server(struct nfs_server *server,
582 if (error < 0) 680 if (error < 0)
583 goto error; 681 goto error;
584 682
585 error = nfs_init_server_rpcclient(server, data->auth_flavors[0]); 683 error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
586 if (error < 0) 684 if (error < 0)
587 goto error; 685 goto error;
588 686
@@ -728,6 +826,9 @@ static struct nfs_server *nfs_alloc_server(void)
728 INIT_LIST_HEAD(&server->client_link); 826 INIT_LIST_HEAD(&server->client_link);
729 INIT_LIST_HEAD(&server->master_link); 827 INIT_LIST_HEAD(&server->master_link);
730 828
829 init_waitqueue_head(&server->active_wq);
830 atomic_set(&server->active, 0);
831
731 server->io_stats = nfs_alloc_iostats(); 832 server->io_stats = nfs_alloc_iostats();
732 if (!server->io_stats) { 833 if (!server->io_stats) {
733 kfree(server); 834 kfree(server);
@@ -751,6 +852,9 @@ void nfs_free_server(struct nfs_server *server)
751 852
752 if (server->destroy != NULL) 853 if (server->destroy != NULL)
753 server->destroy(server); 854 server->destroy(server);
855
856 if (!IS_ERR(server->client_acl))
857 rpc_shutdown_client(server->client_acl);
754 if (!IS_ERR(server->client)) 858 if (!IS_ERR(server->client))
755 rpc_shutdown_client(server->client); 859 rpc_shutdown_client(server->client);
756 860
@@ -836,7 +940,7 @@ error:
836 * Initialise an NFS4 client record 940 * Initialise an NFS4 client record
837 */ 941 */
838static int nfs4_init_client(struct nfs_client *clp, 942static int nfs4_init_client(struct nfs_client *clp,
839 int proto, int timeo, int retrans, 943 const struct rpc_timeout *timeparms,
840 const char *ip_addr, 944 const char *ip_addr,
841 rpc_authflavor_t authflavour) 945 rpc_authflavor_t authflavour)
842{ 946{
@@ -851,7 +955,7 @@ static int nfs4_init_client(struct nfs_client *clp,
851 /* Check NFS protocol revision and initialize RPC op vector */ 955 /* Check NFS protocol revision and initialize RPC op vector */
852 clp->rpc_ops = &nfs_v4_clientops; 956 clp->rpc_ops = &nfs_v4_clientops;
853 957
854 error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour, 958 error = nfs_create_rpc_client(clp, timeparms, authflavour,
855 RPC_CLNT_CREATE_DISCRTRY); 959 RPC_CLNT_CREATE_DISCRTRY);
856 if (error < 0) 960 if (error < 0)
857 goto error; 961 goto error;
@@ -878,23 +982,32 @@ error:
878 * Set up an NFS4 client 982 * Set up an NFS4 client
879 */ 983 */
880static int nfs4_set_client(struct nfs_server *server, 984static int nfs4_set_client(struct nfs_server *server,
881 const char *hostname, const struct sockaddr_in *addr, 985 const char *hostname,
986 const struct sockaddr *addr,
987 const size_t addrlen,
882 const char *ip_addr, 988 const char *ip_addr,
883 rpc_authflavor_t authflavour, 989 rpc_authflavor_t authflavour,
884 int proto, int timeo, int retrans) 990 int proto, const struct rpc_timeout *timeparms)
885{ 991{
992 struct nfs_client_initdata cl_init = {
993 .hostname = hostname,
994 .addr = addr,
995 .addrlen = addrlen,
996 .rpc_ops = &nfs_v4_clientops,
997 .proto = proto,
998 };
886 struct nfs_client *clp; 999 struct nfs_client *clp;
887 int error; 1000 int error;
888 1001
889 dprintk("--> nfs4_set_client()\n"); 1002 dprintk("--> nfs4_set_client()\n");
890 1003
891 /* Allocate or find a client reference we can use */ 1004 /* Allocate or find a client reference we can use */
892 clp = nfs_get_client(hostname, addr, 4); 1005 clp = nfs_get_client(&cl_init);
893 if (IS_ERR(clp)) { 1006 if (IS_ERR(clp)) {
894 error = PTR_ERR(clp); 1007 error = PTR_ERR(clp);
895 goto error; 1008 goto error;
896 } 1009 }
897 error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour); 1010 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour);
898 if (error < 0) 1011 if (error < 0)
899 goto error_put; 1012 goto error_put;
900 1013
@@ -915,10 +1028,26 @@ error:
915static int nfs4_init_server(struct nfs_server *server, 1028static int nfs4_init_server(struct nfs_server *server,
916 const struct nfs_parsed_mount_data *data) 1029 const struct nfs_parsed_mount_data *data)
917{ 1030{
1031 struct rpc_timeout timeparms;
918 int error; 1032 int error;
919 1033
920 dprintk("--> nfs4_init_server()\n"); 1034 dprintk("--> nfs4_init_server()\n");
921 1035
1036 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
1037 data->timeo, data->retrans);
1038
1039 /* Get a client record */
1040 error = nfs4_set_client(server,
1041 data->nfs_server.hostname,
1042 (const struct sockaddr *)&data->nfs_server.address,
1043 data->nfs_server.addrlen,
1044 data->client_address,
1045 data->auth_flavors[0],
1046 data->nfs_server.protocol,
1047 &timeparms);
1048 if (error < 0)
1049 goto error;
1050
922 /* Initialise the client representation from the mount data */ 1051 /* Initialise the client representation from the mount data */
923 server->flags = data->flags & NFS_MOUNT_FLAGMASK; 1052 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
924 server->caps |= NFS_CAP_ATOMIC_OPEN; 1053 server->caps |= NFS_CAP_ATOMIC_OPEN;
@@ -933,8 +1062,9 @@ static int nfs4_init_server(struct nfs_server *server,
933 server->acdirmin = data->acdirmin * HZ; 1062 server->acdirmin = data->acdirmin * HZ;
934 server->acdirmax = data->acdirmax * HZ; 1063 server->acdirmax = data->acdirmax * HZ;
935 1064
936 error = nfs_init_server_rpcclient(server, data->auth_flavors[0]); 1065 error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
937 1066
1067error:
938 /* Done */ 1068 /* Done */
939 dprintk("<-- nfs4_init_server() = %d\n", error); 1069 dprintk("<-- nfs4_init_server() = %d\n", error);
940 return error; 1070 return error;
@@ -957,17 +1087,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
957 if (!server) 1087 if (!server)
958 return ERR_PTR(-ENOMEM); 1088 return ERR_PTR(-ENOMEM);
959 1089
960 /* Get a client record */
961 error = nfs4_set_client(server,
962 data->nfs_server.hostname,
963 &data->nfs_server.address,
964 data->client_address,
965 data->auth_flavors[0],
966 data->nfs_server.protocol,
967 data->timeo, data->retrans);
968 if (error < 0)
969 goto error;
970
971 /* set up the general RPC client */ 1090 /* set up the general RPC client */
972 error = nfs4_init_server(server, data); 1091 error = nfs4_init_server(server, data);
973 if (error < 0) 1092 if (error < 0)
@@ -1035,12 +1154,13 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1035 1154
1036 /* Get a client representation. 1155 /* Get a client representation.
1037 * Note: NFSv4 always uses TCP, */ 1156 * Note: NFSv4 always uses TCP, */
1038 error = nfs4_set_client(server, data->hostname, data->addr, 1157 error = nfs4_set_client(server, data->hostname,
1039 parent_client->cl_ipaddr, 1158 data->addr,
1040 data->authflavor, 1159 data->addrlen,
1041 parent_server->client->cl_xprt->prot, 1160 parent_client->cl_ipaddr,
1042 parent_client->retrans_timeo, 1161 data->authflavor,
1043 parent_client->retrans_count); 1162 parent_server->client->cl_xprt->prot,
1163 parent_server->client->cl_timeout);
1044 if (error < 0) 1164 if (error < 0)
1045 goto error; 1165 goto error;
1046 1166
@@ -1048,7 +1168,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1048 nfs_server_copy_userdata(server, parent_server); 1168 nfs_server_copy_userdata(server, parent_server);
1049 server->caps |= NFS_CAP_ATOMIC_OPEN; 1169 server->caps |= NFS_CAP_ATOMIC_OPEN;
1050 1170
1051 error = nfs_init_server_rpcclient(server, data->authflavor); 1171 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
1052 if (error < 0) 1172 if (error < 0)
1053 goto error; 1173 goto error;
1054 1174
@@ -1117,7 +1237,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1117 1237
1118 server->fsid = fattr->fsid; 1238 server->fsid = fattr->fsid;
1119 1239
1120 error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor); 1240 error = nfs_init_server_rpcclient(server,
1241 source->client->cl_timeout,
1242 source->client->cl_auth->au_flavor);
1121 if (error < 0) 1243 if (error < 0)
1122 goto out_free_server; 1244 goto out_free_server;
1123 if (!IS_ERR(source->client_acl)) 1245 if (!IS_ERR(source->client_acl))
@@ -1259,10 +1381,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1259 /* display one transport per line on subsequent lines */ 1381 /* display one transport per line on subsequent lines */
1260 clp = list_entry(v, struct nfs_client, cl_share_link); 1382 clp = list_entry(v, struct nfs_client, cl_share_link);
1261 1383
1262 seq_printf(m, "v%d %02x%02x%02x%02x %4hx %3d %s\n", 1384 seq_printf(m, "v%u %s %s %3d %s\n",
1263 clp->cl_nfsversion, 1385 clp->rpc_ops->version,
1264 NIPQUAD(clp->cl_addr.sin_addr), 1386 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1265 ntohs(clp->cl_addr.sin_port), 1387 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1266 atomic_read(&clp->cl_count), 1388 atomic_read(&clp->cl_count),
1267 clp->cl_hostname); 1389 clp->cl_hostname);
1268 1390
@@ -1338,10 +1460,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1338 (unsigned long long) server->fsid.major, 1460 (unsigned long long) server->fsid.major,
1339 (unsigned long long) server->fsid.minor); 1461 (unsigned long long) server->fsid.minor);
1340 1462
1341 seq_printf(m, "v%d %02x%02x%02x%02x %4hx %-7s %-17s\n", 1463 seq_printf(m, "v%u %s %s %-7s %-17s\n",
1342 clp->cl_nfsversion, 1464 clp->rpc_ops->version,
1343 NIPQUAD(clp->cl_addr.sin_addr), 1465 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1344 ntohs(clp->cl_addr.sin_port), 1466 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1345 dev, 1467 dev,
1346 fsid); 1468 fsid);
1347 1469
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 11833f4caeaa..b9eadd18ba70 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,32 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
125 put_rpccred(oldcred); 125 put_rpccred(oldcred);
126} 126}
127 127
128static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
129{
130 int res = 0;
131
132 res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
133 nfs_free_delegation(delegation);
134 return res;
135}
136
137static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
138{
139 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
140
141 if (delegation == NULL)
142 goto nomatch;
143 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
144 sizeof(delegation->stateid.data)) != 0)
145 goto nomatch;
146 list_del_rcu(&delegation->super_list);
147 nfsi->delegation_state = 0;
148 rcu_assign_pointer(nfsi->delegation, NULL);
149 return delegation;
150nomatch:
151 return NULL;
152}
153
128/* 154/*
129 * Set up a delegation on an inode 155 * Set up a delegation on an inode
130 */ 156 */
@@ -133,6 +159,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
133 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 159 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
134 struct nfs_inode *nfsi = NFS_I(inode); 160 struct nfs_inode *nfsi = NFS_I(inode);
135 struct nfs_delegation *delegation; 161 struct nfs_delegation *delegation;
162 struct nfs_delegation *freeme = NULL;
136 int status = 0; 163 int status = 0;
137 164
138 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 165 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
@@ -147,41 +174,45 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
147 delegation->inode = inode; 174 delegation->inode = inode;
148 175
149 spin_lock(&clp->cl_lock); 176 spin_lock(&clp->cl_lock);
150 if (rcu_dereference(nfsi->delegation) == NULL) { 177 if (rcu_dereference(nfsi->delegation) != NULL) {
151 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
152 nfsi->delegation_state = delegation->type;
153 rcu_assign_pointer(nfsi->delegation, delegation);
154 delegation = NULL;
155 } else {
156 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 178 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
157 sizeof(delegation->stateid)) != 0 || 179 sizeof(delegation->stateid)) == 0 &&
158 delegation->type != nfsi->delegation->type) { 180 delegation->type == nfsi->delegation->type) {
159 printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n", 181 goto out;
160 __FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr)); 182 }
161 status = -EIO; 183 /*
184 * Deal with broken servers that hand out two
185 * delegations for the same file.
186 */
187 dfprintk(FILE, "%s: server %s handed out "
188 "a duplicate delegation!\n",
189 __FUNCTION__, clp->cl_hostname);
190 if (delegation->type <= nfsi->delegation->type) {
191 freeme = delegation;
192 delegation = NULL;
193 goto out;
162 } 194 }
195 freeme = nfs_detach_delegation_locked(nfsi, NULL);
163 } 196 }
197 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
198 nfsi->delegation_state = delegation->type;
199 rcu_assign_pointer(nfsi->delegation, delegation);
200 delegation = NULL;
164 201
165 /* Ensure we revalidate the attributes and page cache! */ 202 /* Ensure we revalidate the attributes and page cache! */
166 spin_lock(&inode->i_lock); 203 spin_lock(&inode->i_lock);
167 nfsi->cache_validity |= NFS_INO_REVAL_FORCED; 204 nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
168 spin_unlock(&inode->i_lock); 205 spin_unlock(&inode->i_lock);
169 206
207out:
170 spin_unlock(&clp->cl_lock); 208 spin_unlock(&clp->cl_lock);
171 if (delegation != NULL) 209 if (delegation != NULL)
172 nfs_free_delegation(delegation); 210 nfs_free_delegation(delegation);
211 if (freeme != NULL)
212 nfs_do_return_delegation(inode, freeme, 0);
173 return status; 213 return status;
174} 214}
175 215
176static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
177{
178 int res = 0;
179
180 res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
181 nfs_free_delegation(delegation);
182 return res;
183}
184
185/* Sync all data to disk upon delegation return */ 216/* Sync all data to disk upon delegation return */
186static void nfs_msync_inode(struct inode *inode) 217static void nfs_msync_inode(struct inode *inode)
187{ 218{
@@ -207,24 +238,28 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
207 up_read(&clp->cl_sem); 238 up_read(&clp->cl_sem);
208 nfs_msync_inode(inode); 239 nfs_msync_inode(inode);
209 240
210 return nfs_do_return_delegation(inode, delegation); 241 return nfs_do_return_delegation(inode, delegation, 1);
211} 242}
212 243
213static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 244/*
245 * This function returns the delegation without reclaiming opens
246 * or protecting against delegation reclaims.
247 * It is therefore really only safe to be called from
248 * nfs4_clear_inode()
249 */
250void nfs_inode_return_delegation_noreclaim(struct inode *inode)
214{ 251{
215 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 252 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
253 struct nfs_inode *nfsi = NFS_I(inode);
254 struct nfs_delegation *delegation;
216 255
217 if (delegation == NULL) 256 if (rcu_dereference(nfsi->delegation) != NULL) {
218 goto nomatch; 257 spin_lock(&clp->cl_lock);
219 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, 258 delegation = nfs_detach_delegation_locked(nfsi, NULL);
220 sizeof(delegation->stateid.data)) != 0) 259 spin_unlock(&clp->cl_lock);
221 goto nomatch; 260 if (delegation != NULL)
222 list_del_rcu(&delegation->super_list); 261 nfs_do_return_delegation(inode, delegation, 0);
223 nfsi->delegation_state = 0; 262 }
224 rcu_assign_pointer(nfsi->delegation, NULL);
225 return delegation;
226nomatch:
227 return NULL;
228} 263}
229 264
230int nfs_inode_return_delegation(struct inode *inode) 265int nfs_inode_return_delegation(struct inode *inode)
@@ -314,8 +349,9 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
314 __module_get(THIS_MODULE); 349 __module_get(THIS_MODULE);
315 atomic_inc(&clp->cl_count); 350 atomic_inc(&clp->cl_count);
316 task = kthread_run(nfs_do_expire_all_delegations, clp, 351 task = kthread_run(nfs_do_expire_all_delegations, clp,
317 "%u.%u.%u.%u-delegreturn", 352 "%s-delegreturn",
318 NIPQUAD(clp->cl_addr.sin_addr)); 353 rpc_peeraddr2str(clp->cl_rpcclient,
354 RPC_DISPLAY_ADDR));
319 if (!IS_ERR(task)) 355 if (!IS_ERR(task))
320 return; 356 return;
321 nfs_put_client(clp); 357 nfs_put_client(clp);
@@ -386,7 +422,7 @@ static int recall_thread(void *data)
386 nfs_msync_inode(inode); 422 nfs_msync_inode(inode);
387 423
388 if (delegation != NULL) 424 if (delegation != NULL)
389 nfs_do_return_delegation(inode, delegation); 425 nfs_do_return_delegation(inode, delegation, 1);
390 iput(inode); 426 iput(inode);
391 module_put_and_exit(0); 427 module_put_and_exit(0);
392} 428}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5874ce7fdbae..f1c5e2a5d88e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -29,6 +29,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
29void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 29void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
30int nfs_inode_return_delegation(struct inode *inode); 30int nfs_inode_return_delegation(struct inode *inode);
31int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); 31int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
32void nfs_inode_return_delegation_noreclaim(struct inode *inode);
32 33
33struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 34struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
34void nfs_return_all_delegations(struct super_block *sb); 35void nfs_return_all_delegations(struct super_block *sb);
@@ -39,7 +40,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp);
39void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 40void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
40 41
41/* NFSv4 delegation-related procedures */ 42/* NFSv4 delegation-related procedures */
42int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); 43int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
43int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); 44int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
44int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); 45int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
45int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); 46int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f697b5c74b7c..476cb0f837fd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -192,7 +192,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
192 /* We requested READDIRPLUS, but the server doesn't grok it */ 192 /* We requested READDIRPLUS, but the server doesn't grok it */
193 if (error == -ENOTSUPP && desc->plus) { 193 if (error == -ENOTSUPP && desc->plus) {
194 NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; 194 NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
195 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 195 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
196 desc->plus = 0; 196 desc->plus = 0;
197 goto again; 197 goto again;
198 } 198 }
@@ -537,12 +537,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
537 537
538 lock_kernel(); 538 lock_kernel();
539 539
540 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
541 if (res < 0) {
542 unlock_kernel();
543 return res;
544 }
545
546 /* 540 /*
547 * filp->f_pos points to the dirent entry number. 541 * filp->f_pos points to the dirent entry number.
548 * *desc->dir_cookie has the cookie for the next entry. We have 542 * *desc->dir_cookie has the cookie for the next entry. We have
@@ -564,6 +558,10 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
564 desc->entry = &my_entry; 558 desc->entry = &my_entry;
565 559
566 nfs_block_sillyrename(dentry); 560 nfs_block_sillyrename(dentry);
561 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
562 if (res < 0)
563 goto out;
564
567 while(!desc->entry->eof) { 565 while(!desc->entry->eof) {
568 res = readdir_search_pagecache(desc); 566 res = readdir_search_pagecache(desc);
569 567
@@ -579,7 +577,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
579 break; 577 break;
580 } 578 }
581 if (res == -ETOOSMALL && desc->plus) { 579 if (res == -ETOOSMALL && desc->plus) {
582 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 580 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
583 nfs_zap_caches(inode); 581 nfs_zap_caches(inode);
584 desc->plus = 0; 582 desc->plus = 0;
585 desc->entry->eof = 0; 583 desc->entry->eof = 0;
@@ -594,6 +592,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
594 break; 592 break;
595 } 593 }
596 } 594 }
595out:
597 nfs_unblock_sillyrename(dentry); 596 nfs_unblock_sillyrename(dentry);
598 unlock_kernel(); 597 unlock_kernel();
599 if (res > 0) 598 if (res > 0)
@@ -639,6 +638,21 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
639 return 0; 638 return 0;
640} 639}
641 640
641/**
642 * nfs_force_lookup_revalidate - Mark the directory as having changed
643 * @dir - pointer to directory inode
644 *
645 * This forces the revalidation code in nfs_lookup_revalidate() to do a
646 * full lookup on all child dentries of 'dir' whenever a change occurs
647 * on the server that might have invalidated our dcache.
648 *
649 * The caller should be holding dir->i_lock
650 */
651void nfs_force_lookup_revalidate(struct inode *dir)
652{
653 NFS_I(dir)->cache_change_attribute = jiffies;
654}
655
642/* 656/*
643 * A check for whether or not the parent directory has changed. 657 * A check for whether or not the parent directory has changed.
644 * In the case it has, we assume that the dentries are untrustworthy 658 * In the case it has, we assume that the dentries are untrustworthy
@@ -827,6 +841,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
827 dentry->d_parent->d_name.name, dentry->d_name.name, 841 dentry->d_parent->d_name.name, dentry->d_name.name,
828 dentry->d_flags); 842 dentry->d_flags);
829 843
844 /* Unhash any dentry with a stale inode */
845 if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
846 return 1;
847
830 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 848 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
831 /* Unhash it, so that ->d_iput() would be called */ 849 /* Unhash it, so that ->d_iput() would be called */
832 return 1; 850 return 1;
@@ -846,7 +864,6 @@ static int nfs_dentry_delete(struct dentry *dentry)
846 */ 864 */
847static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) 865static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
848{ 866{
849 nfs_inode_return_delegation(inode);
850 if (S_ISDIR(inode->i_mode)) 867 if (S_ISDIR(inode->i_mode))
851 /* drop any readdir cache as it could easily be old */ 868 /* drop any readdir cache as it could easily be old */
852 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 869 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
@@ -1268,6 +1285,12 @@ out_err:
1268 return error; 1285 return error;
1269} 1286}
1270 1287
1288static void nfs_dentry_handle_enoent(struct dentry *dentry)
1289{
1290 if (dentry->d_inode != NULL && !d_unhashed(dentry))
1291 d_delete(dentry);
1292}
1293
1271static int nfs_rmdir(struct inode *dir, struct dentry *dentry) 1294static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1272{ 1295{
1273 int error; 1296 int error;
@@ -1280,6 +1303,8 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1280 /* Ensure the VFS deletes this inode */ 1303 /* Ensure the VFS deletes this inode */
1281 if (error == 0 && dentry->d_inode != NULL) 1304 if (error == 0 && dentry->d_inode != NULL)
1282 clear_nlink(dentry->d_inode); 1305 clear_nlink(dentry->d_inode);
1306 else if (error == -ENOENT)
1307 nfs_dentry_handle_enoent(dentry);
1283 unlock_kernel(); 1308 unlock_kernel();
1284 1309
1285 return error; 1310 return error;
@@ -1386,6 +1411,8 @@ static int nfs_safe_remove(struct dentry *dentry)
1386 nfs_mark_for_revalidate(inode); 1411 nfs_mark_for_revalidate(inode);
1387 } else 1412 } else
1388 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1413 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1414 if (error == -ENOENT)
1415 nfs_dentry_handle_enoent(dentry);
1389out: 1416out:
1390 return error; 1417 return error;
1391} 1418}
@@ -1422,7 +1449,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1422 spin_unlock(&dentry->d_lock); 1449 spin_unlock(&dentry->d_lock);
1423 spin_unlock(&dcache_lock); 1450 spin_unlock(&dcache_lock);
1424 error = nfs_safe_remove(dentry); 1451 error = nfs_safe_remove(dentry);
1425 if (!error) { 1452 if (!error || error == -ENOENT) {
1426 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1427 } else if (need_rehash) 1454 } else if (need_rehash)
1428 d_rehash(dentry); 1455 d_rehash(dentry);
@@ -1635,7 +1662,8 @@ out:
1635 d_move(old_dentry, new_dentry); 1662 d_move(old_dentry, new_dentry);
1636 nfs_set_verifier(new_dentry, 1663 nfs_set_verifier(new_dentry,
1637 nfs_save_change_attribute(new_dir)); 1664 nfs_save_change_attribute(new_dir));
1638 } 1665 } else if (error == -ENOENT)
1666 nfs_dentry_handle_enoent(old_dentry);
1639 1667
1640 /* new dentry created? */ 1668 /* new dentry created? */
1641 if (dentry) 1669 if (dentry)
@@ -1666,13 +1694,19 @@ int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1666restart: 1694restart:
1667 spin_lock(&nfs_access_lru_lock); 1695 spin_lock(&nfs_access_lru_lock);
1668 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1696 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1697 struct rw_semaphore *s_umount;
1669 struct inode *inode; 1698 struct inode *inode;
1670 1699
1671 if (nr_to_scan-- == 0) 1700 if (nr_to_scan-- == 0)
1672 break; 1701 break;
1702 s_umount = &nfsi->vfs_inode.i_sb->s_umount;
1703 if (!down_read_trylock(s_umount))
1704 continue;
1673 inode = igrab(&nfsi->vfs_inode); 1705 inode = igrab(&nfsi->vfs_inode);
1674 if (inode == NULL) 1706 if (inode == NULL) {
1707 up_read(s_umount);
1675 continue; 1708 continue;
1709 }
1676 spin_lock(&inode->i_lock); 1710 spin_lock(&inode->i_lock);
1677 if (list_empty(&nfsi->access_cache_entry_lru)) 1711 if (list_empty(&nfsi->access_cache_entry_lru))
1678 goto remove_lru_entry; 1712 goto remove_lru_entry;
@@ -1691,6 +1725,7 @@ remove_lru_entry:
1691 spin_unlock(&inode->i_lock); 1725 spin_unlock(&inode->i_lock);
1692 spin_unlock(&nfs_access_lru_lock); 1726 spin_unlock(&nfs_access_lru_lock);
1693 iput(inode); 1727 iput(inode);
1728 up_read(s_umount);
1694 goto restart; 1729 goto restart;
1695 } 1730 }
1696 spin_unlock(&nfs_access_lru_lock); 1731 spin_unlock(&nfs_access_lru_lock);
@@ -1731,7 +1766,7 @@ static void __nfs_access_zap_cache(struct inode *inode)
1731void nfs_access_zap_cache(struct inode *inode) 1766void nfs_access_zap_cache(struct inode *inode)
1732{ 1767{
1733 /* Remove from global LRU init */ 1768 /* Remove from global LRU init */
1734 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) { 1769 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1735 spin_lock(&nfs_access_lru_lock); 1770 spin_lock(&nfs_access_lru_lock);
1736 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1771 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1737 spin_unlock(&nfs_access_lru_lock); 1772 spin_unlock(&nfs_access_lru_lock);
@@ -1845,7 +1880,7 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1845 smp_mb__after_atomic_inc(); 1880 smp_mb__after_atomic_inc();
1846 1881
1847 /* Add inode to global LRU list */ 1882 /* Add inode to global LRU list */
1848 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) { 1883 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1849 spin_lock(&nfs_access_lru_lock); 1884 spin_lock(&nfs_access_lru_lock);
1850 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1885 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
1851 spin_unlock(&nfs_access_lru_lock); 1886 spin_unlock(&nfs_access_lru_lock);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7b994b2fa593..16844f98f50e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,6 +272,16 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
272 unsigned long user_addr = (unsigned long)iov->iov_base; 272 unsigned long user_addr = (unsigned long)iov->iov_base;
273 size_t count = iov->iov_len; 273 size_t count = iov->iov_len;
274 size_t rsize = NFS_SERVER(inode)->rsize; 274 size_t rsize = NFS_SERVER(inode)->rsize;
275 struct rpc_task *task;
276 struct rpc_message msg = {
277 .rpc_cred = ctx->cred,
278 };
279 struct rpc_task_setup task_setup_data = {
280 .rpc_client = NFS_CLIENT(inode),
281 .rpc_message = &msg,
282 .callback_ops = &nfs_read_direct_ops,
283 .flags = RPC_TASK_ASYNC,
284 };
275 unsigned int pgbase; 285 unsigned int pgbase;
276 int result; 286 int result;
277 ssize_t started = 0; 287 ssize_t started = 0;
@@ -311,7 +321,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
311 321
312 data->req = (struct nfs_page *) dreq; 322 data->req = (struct nfs_page *) dreq;
313 data->inode = inode; 323 data->inode = inode;
314 data->cred = ctx->cred; 324 data->cred = msg.rpc_cred;
315 data->args.fh = NFS_FH(inode); 325 data->args.fh = NFS_FH(inode);
316 data->args.context = ctx; 326 data->args.context = ctx;
317 data->args.offset = pos; 327 data->args.offset = pos;
@@ -321,14 +331,16 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
321 data->res.fattr = &data->fattr; 331 data->res.fattr = &data->fattr;
322 data->res.eof = 0; 332 data->res.eof = 0;
323 data->res.count = bytes; 333 data->res.count = bytes;
334 msg.rpc_argp = &data->args;
335 msg.rpc_resp = &data->res;
324 336
325 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, 337 task_setup_data.task = &data->task;
326 &nfs_read_direct_ops, data); 338 task_setup_data.callback_data = data;
327 NFS_PROTO(inode)->read_setup(data); 339 NFS_PROTO(inode)->read_setup(data, &msg);
328 340
329 data->task.tk_cookie = (unsigned long) inode; 341 task = rpc_run_task(&task_setup_data);
330 342 if (!IS_ERR(task))
331 rpc_execute(&data->task); 343 rpc_put_task(task);
332 344
333 dprintk("NFS: %5u initiated direct read call " 345 dprintk("NFS: %5u initiated direct read call "
334 "(req %s/%Ld, %zu bytes @ offset %Lu)\n", 346 "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -427,6 +439,15 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
427 struct inode *inode = dreq->inode; 439 struct inode *inode = dreq->inode;
428 struct list_head *p; 440 struct list_head *p;
429 struct nfs_write_data *data; 441 struct nfs_write_data *data;
442 struct rpc_task *task;
443 struct rpc_message msg = {
444 .rpc_cred = dreq->ctx->cred,
445 };
446 struct rpc_task_setup task_setup_data = {
447 .rpc_client = NFS_CLIENT(inode),
448 .callback_ops = &nfs_write_direct_ops,
449 .flags = RPC_TASK_ASYNC,
450 };
430 451
431 dreq->count = 0; 452 dreq->count = 0;
432 get_dreq(dreq); 453 get_dreq(dreq);
@@ -436,6 +457,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
436 457
437 get_dreq(dreq); 458 get_dreq(dreq);
438 459
460 /* Use stable writes */
461 data->args.stable = NFS_FILE_SYNC;
462
439 /* 463 /*
440 * Reset data->res. 464 * Reset data->res.
441 */ 465 */
@@ -447,17 +471,18 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
447 * Reuse data->task; data->args should not have changed 471 * Reuse data->task; data->args should not have changed
448 * since the original request was sent. 472 * since the original request was sent.
449 */ 473 */
450 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, 474 task_setup_data.task = &data->task;
451 &nfs_write_direct_ops, data); 475 task_setup_data.callback_data = data;
452 NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE); 476 msg.rpc_argp = &data->args;
453 477 msg.rpc_resp = &data->res;
454 data->task.tk_priority = RPC_PRIORITY_NORMAL; 478 NFS_PROTO(inode)->write_setup(data, &msg);
455 data->task.tk_cookie = (unsigned long) inode;
456 479
457 /* 480 /*
458 * We're called via an RPC callback, so BKL is already held. 481 * We're called via an RPC callback, so BKL is already held.
459 */ 482 */
460 rpc_execute(&data->task); 483 task = rpc_run_task(&task_setup_data);
484 if (!IS_ERR(task))
485 rpc_put_task(task);
461 486
462 dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", 487 dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
463 data->task.tk_pid, 488 data->task.tk_pid,
@@ -500,9 +525,23 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
500static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) 525static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
501{ 526{
502 struct nfs_write_data *data = dreq->commit_data; 527 struct nfs_write_data *data = dreq->commit_data;
528 struct rpc_task *task;
529 struct rpc_message msg = {
530 .rpc_argp = &data->args,
531 .rpc_resp = &data->res,
532 .rpc_cred = dreq->ctx->cred,
533 };
534 struct rpc_task_setup task_setup_data = {
535 .task = &data->task,
536 .rpc_client = NFS_CLIENT(dreq->inode),
537 .rpc_message = &msg,
538 .callback_ops = &nfs_commit_direct_ops,
539 .callback_data = data,
540 .flags = RPC_TASK_ASYNC,
541 };
503 542
504 data->inode = dreq->inode; 543 data->inode = dreq->inode;
505 data->cred = dreq->ctx->cred; 544 data->cred = msg.rpc_cred;
506 545
507 data->args.fh = NFS_FH(data->inode); 546 data->args.fh = NFS_FH(data->inode);
508 data->args.offset = 0; 547 data->args.offset = 0;
@@ -511,18 +550,16 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
511 data->res.fattr = &data->fattr; 550 data->res.fattr = &data->fattr;
512 data->res.verf = &data->verf; 551 data->res.verf = &data->verf;
513 552
514 rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC, 553 NFS_PROTO(data->inode)->commit_setup(data, &msg);
515 &nfs_commit_direct_ops, data);
516 NFS_PROTO(data->inode)->commit_setup(data, 0);
517 554
518 data->task.tk_priority = RPC_PRIORITY_NORMAL;
519 data->task.tk_cookie = (unsigned long)data->inode;
520 /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ 555 /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
521 dreq->commit_data = NULL; 556 dreq->commit_data = NULL;
522 557
523 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 558 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
524 559
525 rpc_execute(&data->task); 560 task = rpc_run_task(&task_setup_data);
561 if (!IS_ERR(task))
562 rpc_put_task(task);
526} 563}
527 564
528static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) 565static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@ -637,6 +674,16 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
637 struct inode *inode = ctx->path.dentry->d_inode; 674 struct inode *inode = ctx->path.dentry->d_inode;
638 unsigned long user_addr = (unsigned long)iov->iov_base; 675 unsigned long user_addr = (unsigned long)iov->iov_base;
639 size_t count = iov->iov_len; 676 size_t count = iov->iov_len;
677 struct rpc_task *task;
678 struct rpc_message msg = {
679 .rpc_cred = ctx->cred,
680 };
681 struct rpc_task_setup task_setup_data = {
682 .rpc_client = NFS_CLIENT(inode),
683 .rpc_message = &msg,
684 .callback_ops = &nfs_write_direct_ops,
685 .flags = RPC_TASK_ASYNC,
686 };
640 size_t wsize = NFS_SERVER(inode)->wsize; 687 size_t wsize = NFS_SERVER(inode)->wsize;
641 unsigned int pgbase; 688 unsigned int pgbase;
642 int result; 689 int result;
@@ -679,25 +726,27 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
679 726
680 data->req = (struct nfs_page *) dreq; 727 data->req = (struct nfs_page *) dreq;
681 data->inode = inode; 728 data->inode = inode;
682 data->cred = ctx->cred; 729 data->cred = msg.rpc_cred;
683 data->args.fh = NFS_FH(inode); 730 data->args.fh = NFS_FH(inode);
684 data->args.context = ctx; 731 data->args.context = ctx;
685 data->args.offset = pos; 732 data->args.offset = pos;
686 data->args.pgbase = pgbase; 733 data->args.pgbase = pgbase;
687 data->args.pages = data->pagevec; 734 data->args.pages = data->pagevec;
688 data->args.count = bytes; 735 data->args.count = bytes;
736 data->args.stable = sync;
689 data->res.fattr = &data->fattr; 737 data->res.fattr = &data->fattr;
690 data->res.count = bytes; 738 data->res.count = bytes;
691 data->res.verf = &data->verf; 739 data->res.verf = &data->verf;
692 740
693 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, 741 task_setup_data.task = &data->task;
694 &nfs_write_direct_ops, data); 742 task_setup_data.callback_data = data;
695 NFS_PROTO(inode)->write_setup(data, sync); 743 msg.rpc_argp = &data->args;
744 msg.rpc_resp = &data->res;
745 NFS_PROTO(inode)->write_setup(data, &msg);
696 746
697 data->task.tk_priority = RPC_PRIORITY_NORMAL; 747 task = rpc_run_task(&task_setup_data);
698 data->task.tk_cookie = (unsigned long) inode; 748 if (!IS_ERR(task))
699 749 rpc_put_task(task);
700 rpc_execute(&data->task);
701 750
702 dprintk("NFS: %5u initiated direct write call " 751 dprintk("NFS: %5u initiated direct write call "
703 "(req %s/%Ld, %zu bytes @ offset %Lu)\n", 752 "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -766,7 +815,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
766 struct inode *inode = iocb->ki_filp->f_mapping->host; 815 struct inode *inode = iocb->ki_filp->f_mapping->host;
767 struct nfs_direct_req *dreq; 816 struct nfs_direct_req *dreq;
768 size_t wsize = NFS_SERVER(inode)->wsize; 817 size_t wsize = NFS_SERVER(inode)->wsize;
769 int sync = 0; 818 int sync = NFS_UNSTABLE;
770 819
771 dreq = nfs_direct_req_alloc(); 820 dreq = nfs_direct_req_alloc();
772 if (!dreq) 821 if (!dreq)
@@ -774,7 +823,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
774 nfs_alloc_commit_data(dreq); 823 nfs_alloc_commit_data(dreq);
775 824
776 if (dreq->commit_data == NULL || count < wsize) 825 if (dreq->commit_data == NULL || count < wsize)
777 sync = FLUSH_STABLE; 826 sync = NFS_FILE_SYNC;
778 827
779 dreq->inode = inode; 828 dreq->inode = inode;
780 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 829 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
@@ -886,8 +935,6 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
886 retval = generic_write_checks(file, &pos, &count, 0); 935 retval = generic_write_checks(file, &pos, &count, 0);
887 if (retval) 936 if (retval)
888 goto out; 937 goto out;
889 if (!count)
890 goto out; /* return 0 */
891 938
892 retval = -EINVAL; 939 retval = -EINVAL;
893 if ((ssize_t) count < 0) 940 if ((ssize_t) count < 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index b3bb89f7d5d2..ef57a5ae5904 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -349,7 +349,9 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
349 unlock_page(page); 349 unlock_page(page);
350 page_cache_release(page); 350 page_cache_release(page);
351 351
352 return status < 0 ? status : copied; 352 if (status < 0)
353 return status;
354 return copied;
353} 355}
354 356
355static void nfs_invalidate_page(struct page *page, unsigned long offset) 357static void nfs_invalidate_page(struct page *page, unsigned long offset)
@@ -392,35 +394,27 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
392 struct file *filp = vma->vm_file; 394 struct file *filp = vma->vm_file;
393 unsigned pagelen; 395 unsigned pagelen;
394 int ret = -EINVAL; 396 int ret = -EINVAL;
395 void *fsdata;
396 struct address_space *mapping; 397 struct address_space *mapping;
397 loff_t offset;
398 398
399 lock_page(page); 399 lock_page(page);
400 mapping = page->mapping; 400 mapping = page->mapping;
401 if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) { 401 if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
402 unlock_page(page); 402 goto out_unlock;
403 return -EINVAL; 403
404 } 404 ret = 0;
405 pagelen = nfs_page_length(page); 405 pagelen = nfs_page_length(page);
406 offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 406 if (pagelen == 0)
407 unlock_page(page); 407 goto out_unlock;
408 408
409 /* 409 ret = nfs_flush_incompatible(filp, page);
410 * we can use mapping after releasing the page lock, because: 410 if (ret != 0)
411 * we hold mmap_sem on the fault path, which should pin the vma 411 goto out_unlock;
412 * which should pin the file, which pins the dentry which should
413 * hold a reference on inode.
414 */
415 412
416 if (pagelen) { 413 ret = nfs_updatepage(filp, page, 0, pagelen);
417 struct page *page2 = NULL; 414 if (ret == 0)
418 ret = nfs_write_begin(filp, mapping, offset, pagelen, 415 ret = pagelen;
419 0, &page2, &fsdata); 416out_unlock:
420 if (!ret) 417 unlock_page(page);
421 ret = nfs_write_end(filp, mapping, offset, pagelen,
422 pagelen, page2, fsdata);
423 }
424 return ret; 418 return ret;
425} 419}
426 420
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 0ee43843f4ec..e6242cdbaf91 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -57,6 +57,17 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
57 } 57 }
58 /* Circumvent igrab(): we know the inode is not being freed */ 58 /* Circumvent igrab(): we know the inode is not being freed */
59 atomic_inc(&inode->i_count); 59 atomic_inc(&inode->i_count);
60 /*
61 * Ensure that this dentry is invisible to d_find_alias().
62 * Otherwise, it may be spliced into the tree by
63 * d_materialise_unique if a parent directory from the same
64 * filesystem gets mounted at a later time.
65 * This again causes shrink_dcache_for_umount_subtree() to
66 * Oops, since the test for IS_ROOT() will fail.
67 */
68 spin_lock(&dcache_lock);
69 list_del_init(&sb->s_root->d_alias);
70 spin_unlock(&dcache_lock);
60 } 71 }
61 return 0; 72 return 0;
62} 73}
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index d11eb055265c..8ae5dba2d4e5 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -72,39 +72,39 @@ module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
72 &nfs_idmap_cache_timeout, 0644); 72 &nfs_idmap_cache_timeout, 0644);
73 73
74struct idmap_hashent { 74struct idmap_hashent {
75 unsigned long ih_expires; 75 unsigned long ih_expires;
76 __u32 ih_id; 76 __u32 ih_id;
77 int ih_namelen; 77 size_t ih_namelen;
78 char ih_name[IDMAP_NAMESZ]; 78 char ih_name[IDMAP_NAMESZ];
79}; 79};
80 80
81struct idmap_hashtable { 81struct idmap_hashtable {
82 __u8 h_type; 82 __u8 h_type;
83 struct idmap_hashent h_entries[IDMAP_HASH_SZ]; 83 struct idmap_hashent h_entries[IDMAP_HASH_SZ];
84}; 84};
85 85
86struct idmap { 86struct idmap {
87 struct dentry *idmap_dentry; 87 struct dentry *idmap_dentry;
88 wait_queue_head_t idmap_wq; 88 wait_queue_head_t idmap_wq;
89 struct idmap_msg idmap_im; 89 struct idmap_msg idmap_im;
90 struct mutex idmap_lock; /* Serializes upcalls */ 90 struct mutex idmap_lock; /* Serializes upcalls */
91 struct mutex idmap_im_lock; /* Protects the hashtable */ 91 struct mutex idmap_im_lock; /* Protects the hashtable */
92 struct idmap_hashtable idmap_user_hash; 92 struct idmap_hashtable idmap_user_hash;
93 struct idmap_hashtable idmap_group_hash; 93 struct idmap_hashtable idmap_group_hash;
94}; 94};
95 95
96static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, 96static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
97 char __user *, size_t); 97 char __user *, size_t);
98static ssize_t idmap_pipe_downcall(struct file *, const char __user *, 98static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
99 size_t); 99 size_t);
100static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); 100static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
101 101
102static unsigned int fnvhash32(const void *, size_t); 102static unsigned int fnvhash32(const void *, size_t);
103 103
104static struct rpc_pipe_ops idmap_upcall_ops = { 104static struct rpc_pipe_ops idmap_upcall_ops = {
105 .upcall = idmap_pipe_upcall, 105 .upcall = idmap_pipe_upcall,
106 .downcall = idmap_pipe_downcall, 106 .downcall = idmap_pipe_downcall,
107 .destroy_msg = idmap_pipe_destroy_msg, 107 .destroy_msg = idmap_pipe_destroy_msg,
108}; 108};
109 109
110int 110int
@@ -115,19 +115,20 @@ nfs_idmap_new(struct nfs_client *clp)
115 115
116 BUG_ON(clp->cl_idmap != NULL); 116 BUG_ON(clp->cl_idmap != NULL);
117 117
118 if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL) 118 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
119 return -ENOMEM; 119 if (idmap == NULL)
120 return -ENOMEM;
120 121
121 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", 122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
122 idmap, &idmap_upcall_ops, 0); 123 idmap, &idmap_upcall_ops, 0);
123 if (IS_ERR(idmap->idmap_dentry)) { 124 if (IS_ERR(idmap->idmap_dentry)) {
124 error = PTR_ERR(idmap->idmap_dentry); 125 error = PTR_ERR(idmap->idmap_dentry);
125 kfree(idmap); 126 kfree(idmap);
126 return error; 127 return error;
127 } 128 }
128 129
129 mutex_init(&idmap->idmap_lock); 130 mutex_init(&idmap->idmap_lock);
130 mutex_init(&idmap->idmap_im_lock); 131 mutex_init(&idmap->idmap_im_lock);
131 init_waitqueue_head(&idmap->idmap_wq); 132 init_waitqueue_head(&idmap->idmap_wq);
132 idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; 133 idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
133 idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; 134 idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -192,7 +193,7 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
192 * pretty trivial. 193 * pretty trivial.
193 */ 194 */
194static inline struct idmap_hashent * 195static inline struct idmap_hashent *
195idmap_alloc_name(struct idmap_hashtable *h, char *name, unsigned len) 196idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
196{ 197{
197 return idmap_name_hash(h, name, len); 198 return idmap_name_hash(h, name, len);
198} 199}
@@ -285,7 +286,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
285 memset(im, 0, sizeof(*im)); 286 memset(im, 0, sizeof(*im));
286 mutex_unlock(&idmap->idmap_im_lock); 287 mutex_unlock(&idmap->idmap_im_lock);
287 mutex_unlock(&idmap->idmap_lock); 288 mutex_unlock(&idmap->idmap_lock);
288 return (ret); 289 return ret;
289} 290}
290 291
291/* 292/*
@@ -354,42 +355,40 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
354/* RPC pipefs upcall/downcall routines */ 355/* RPC pipefs upcall/downcall routines */
355static ssize_t 356static ssize_t
356idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, 357idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
357 char __user *dst, size_t buflen) 358 char __user *dst, size_t buflen)
358{ 359{
359 char *data = (char *)msg->data + msg->copied; 360 char *data = (char *)msg->data + msg->copied;
360 ssize_t mlen = msg->len - msg->copied; 361 size_t mlen = min(msg->len, buflen);
361 ssize_t left; 362 unsigned long left;
362 363
363 if (mlen > buflen) 364 left = copy_to_user(dst, data, mlen);
364 mlen = buflen; 365 if (left == mlen) {
365 366 msg->errno = -EFAULT;
366 left = copy_to_user(dst, data, mlen); 367 return -EFAULT;
367 if (left < 0) {
368 msg->errno = left;
369 return left;
370 } 368 }
369
371 mlen -= left; 370 mlen -= left;
372 msg->copied += mlen; 371 msg->copied += mlen;
373 msg->errno = 0; 372 msg->errno = 0;
374 return mlen; 373 return mlen;
375} 374}
376 375
377static ssize_t 376static ssize_t
378idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) 377idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
379{ 378{
380 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); 379 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
381 struct idmap *idmap = (struct idmap *)rpci->private; 380 struct idmap *idmap = (struct idmap *)rpci->private;
382 struct idmap_msg im_in, *im = &idmap->idmap_im; 381 struct idmap_msg im_in, *im = &idmap->idmap_im;
383 struct idmap_hashtable *h; 382 struct idmap_hashtable *h;
384 struct idmap_hashent *he = NULL; 383 struct idmap_hashent *he = NULL;
385 int namelen_in; 384 size_t namelen_in;
386 int ret; 385 int ret;
387 386
388 if (mlen != sizeof(im_in)) 387 if (mlen != sizeof(im_in))
389 return (-ENOSPC); 388 return -ENOSPC;
390 389
391 if (copy_from_user(&im_in, src, mlen) != 0) 390 if (copy_from_user(&im_in, src, mlen) != 0)
392 return (-EFAULT); 391 return -EFAULT;
393 392
394 mutex_lock(&idmap->idmap_im_lock); 393 mutex_lock(&idmap->idmap_im_lock);
395 394
@@ -487,7 +486,7 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
487 hash ^= (unsigned int)*p; 486 hash ^= (unsigned int)*p;
488 } 487 }
489 488
490 return (hash); 489 return hash;
491} 490}
492 491
493int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 492int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f68c22215b14..966a8850aa30 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -192,7 +192,7 @@ void nfs_invalidate_atime(struct inode *inode)
192 */ 192 */
193static void nfs_invalidate_inode(struct inode *inode) 193static void nfs_invalidate_inode(struct inode *inode)
194{ 194{
195 set_bit(NFS_INO_STALE, &NFS_FLAGS(inode)); 195 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
196 nfs_zap_caches_locked(inode); 196 nfs_zap_caches_locked(inode);
197} 197}
198 198
@@ -229,7 +229,7 @@ nfs_init_locked(struct inode *inode, void *opaque)
229 struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; 229 struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque;
230 struct nfs_fattr *fattr = desc->fattr; 230 struct nfs_fattr *fattr = desc->fattr;
231 231
232 NFS_FILEID(inode) = fattr->fileid; 232 set_nfs_fileid(inode, fattr->fileid);
233 nfs_copy_fh(NFS_FH(inode), desc->fh); 233 nfs_copy_fh(NFS_FH(inode), desc->fh);
234 return 0; 234 return 0;
235} 235}
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
293 && fattr->size <= NFS_LIMIT_READDIRPLUS) 293 && fattr->size <= NFS_LIMIT_READDIRPLUS)
294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
295 /* Deal with crossing mountpoints */ 295 /* Deal with crossing mountpoints */
296 if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { 296 if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
297 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 297 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
@@ -457,9 +457,18 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
457 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 457 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
458 int err; 458 int err;
459 459
460 /* Flush out writes to the server in order to update c/mtime */ 460 /*
461 if (S_ISREG(inode->i_mode)) 461 * Flush out writes to the server in order to update c/mtime.
462 *
463 * Hold the i_mutex to suspend application writes temporarily;
464 * this prevents long-running writing applications from blocking
465 * nfs_wb_nocommit.
466 */
467 if (S_ISREG(inode->i_mode)) {
468 mutex_lock(&inode->i_mutex);
462 nfs_wb_nocommit(inode); 469 nfs_wb_nocommit(inode);
470 mutex_unlock(&inode->i_mutex);
471 }
463 472
464 /* 473 /*
465 * We may force a getattr if the user cares about atime. 474 * We may force a getattr if the user cares about atime.
@@ -655,7 +664,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
655 if (status == -ESTALE) { 664 if (status == -ESTALE) {
656 nfs_zap_caches(inode); 665 nfs_zap_caches(inode);
657 if (!S_ISDIR(inode->i_mode)) 666 if (!S_ISDIR(inode->i_mode))
658 set_bit(NFS_INO_STALE, &NFS_FLAGS(inode)); 667 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
659 } 668 }
660 goto out; 669 goto out;
661 } 670 }
@@ -810,8 +819,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
810 if (S_ISDIR(inode->i_mode)) 819 if (S_ISDIR(inode->i_mode))
811 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 820 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
812 } 821 }
813 if (inode->i_size == fattr->pre_size && nfsi->npages == 0) 822 if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
814 inode->i_size = fattr->size; 823 nfsi->npages == 0)
824 inode->i_size = nfs_size_to_loff_t(fattr->size);
815 } 825 }
816} 826}
817 827
@@ -1015,7 +1025,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1015 dprintk("NFS: mtime change on server for file %s/%ld\n", 1025 dprintk("NFS: mtime change on server for file %s/%ld\n",
1016 inode->i_sb->s_id, inode->i_ino); 1026 inode->i_sb->s_id, inode->i_ino);
1017 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1027 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1018 nfsi->cache_change_attribute = now; 1028 if (S_ISDIR(inode->i_mode))
1029 nfs_force_lookup_revalidate(inode);
1019 } 1030 }
1020 /* If ctime has changed we should definitely clear access+acl caches */ 1031 /* If ctime has changed we should definitely clear access+acl caches */
1021 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1032 if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
@@ -1024,7 +1035,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1024 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1035 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1025 inode->i_sb->s_id, inode->i_ino); 1036 inode->i_sb->s_id, inode->i_ino);
1026 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1037 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1027 nfsi->cache_change_attribute = now; 1038 if (S_ISDIR(inode->i_mode))
1039 nfs_force_lookup_revalidate(inode);
1028 } 1040 }
1029 1041
1030 /* Check if our cached file size is stale */ 1042 /* Check if our cached file size is stale */
@@ -1129,7 +1141,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1129void nfs4_clear_inode(struct inode *inode) 1141void nfs4_clear_inode(struct inode *inode)
1130{ 1142{
1131 /* If we are holding a delegation, return it! */ 1143 /* If we are holding a delegation, return it! */
1132 nfs_inode_return_delegation(inode); 1144 nfs_inode_return_delegation_noreclaim(inode);
1133 /* First call standard NFS clear_inode() code */ 1145 /* First call standard NFS clear_inode() code */
1134 nfs_clear_inode(inode); 1146 nfs_clear_inode(inode);
1135} 1147}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f3acf48412be..0f5619611b8d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -21,7 +21,8 @@ struct nfs_clone_mount {
21 struct nfs_fattr *fattr; 21 struct nfs_fattr *fattr;
22 char *hostname; 22 char *hostname;
23 char *mnt_path; 23 char *mnt_path;
24 struct sockaddr_in *addr; 24 struct sockaddr *addr;
25 size_t addrlen;
25 rpc_authflavor_t authflavor; 26 rpc_authflavor_t authflavor;
26}; 27};
27 28
@@ -41,19 +42,19 @@ struct nfs_parsed_mount_data {
41 char *client_address; 42 char *client_address;
42 43
43 struct { 44 struct {
44 struct sockaddr_in address; 45 struct sockaddr_storage address;
46 size_t addrlen;
45 char *hostname; 47 char *hostname;
46 unsigned int program;
47 unsigned int version; 48 unsigned int version;
48 unsigned short port; 49 unsigned short port;
49 int protocol; 50 int protocol;
50 } mount_server; 51 } mount_server;
51 52
52 struct { 53 struct {
53 struct sockaddr_in address; 54 struct sockaddr_storage address;
55 size_t addrlen;
54 char *hostname; 56 char *hostname;
55 char *export_path; 57 char *export_path;
56 unsigned int program;
57 int protocol; 58 int protocol;
58 } nfs_server; 59 } nfs_server;
59}; 60};
@@ -62,7 +63,8 @@ struct nfs_parsed_mount_data {
62extern struct rpc_program nfs_program; 63extern struct rpc_program nfs_program;
63 64
64extern void nfs_put_client(struct nfs_client *); 65extern void nfs_put_client(struct nfs_client *);
65extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int); 66extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
67extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
66extern struct nfs_server *nfs_create_server( 68extern struct nfs_server *nfs_create_server(
67 const struct nfs_parsed_mount_data *, 69 const struct nfs_parsed_mount_data *,
68 struct nfs_fh *); 70 struct nfs_fh *);
@@ -160,6 +162,8 @@ extern struct rpc_stat nfs_rpcstat;
160 162
161extern int __init register_nfs_fs(void); 163extern int __init register_nfs_fs(void);
162extern void __exit unregister_nfs_fs(void); 164extern void __exit unregister_nfs_fs(void);
165extern void nfs_sb_active(struct nfs_server *server);
166extern void nfs_sb_deactive(struct nfs_server *server);
163 167
164/* namespace.c */ 168/* namespace.c */
165extern char *nfs_path(const char *base, 169extern char *nfs_path(const char *base,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index acfc56f9edc0..be4ce1c3a3d8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -188,7 +188,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
188{ 188{
189#ifdef CONFIG_NFS_V4 189#ifdef CONFIG_NFS_V4
190 struct vfsmount *mnt = NULL; 190 struct vfsmount *mnt = NULL;
191 switch (server->nfs_client->cl_nfsversion) { 191 switch (server->nfs_client->rpc_ops->version) {
192 case 2: 192 case 2:
193 case 3: 193 case 3:
194 mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); 194 mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 668ab96c7b59..1f7ea675e0c5 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -262,7 +262,9 @@ static int
262nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 262nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
263{ 263{
264 struct kvec *iov = req->rq_rcv_buf.head; 264 struct kvec *iov = req->rq_rcv_buf.head;
265 int status, count, recvd, hdrlen; 265 size_t hdrlen;
266 u32 count, recvd;
267 int status;
266 268
267 if ((status = ntohl(*p++))) 269 if ((status = ntohl(*p++)))
268 return -nfs_stat_to_errno(status); 270 return -nfs_stat_to_errno(status);
@@ -273,7 +275,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
273 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 275 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
274 if (iov->iov_len < hdrlen) { 276 if (iov->iov_len < hdrlen) {
275 dprintk("NFS: READ reply header overflowed:" 277 dprintk("NFS: READ reply header overflowed:"
276 "length %d > %Zu\n", hdrlen, iov->iov_len); 278 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
277 return -errno_NFSERR_IO; 279 return -errno_NFSERR_IO;
278 } else if (iov->iov_len != hdrlen) { 280 } else if (iov->iov_len != hdrlen) {
279 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 281 dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -283,11 +285,11 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
283 recvd = req->rq_rcv_buf.len - hdrlen; 285 recvd = req->rq_rcv_buf.len - hdrlen;
284 if (count > recvd) { 286 if (count > recvd) {
285 dprintk("NFS: server cheating in read reply: " 287 dprintk("NFS: server cheating in read reply: "
286 "count %d > recvd %d\n", count, recvd); 288 "count %u > recvd %u\n", count, recvd);
287 count = recvd; 289 count = recvd;
288 } 290 }
289 291
290 dprintk("RPC: readres OK count %d\n", count); 292 dprintk("RPC: readres OK count %u\n", count);
291 if (count < res->count) 293 if (count < res->count)
292 res->count = count; 294 res->count = count;
293 295
@@ -423,9 +425,10 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
423 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 425 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
424 struct kvec *iov = rcvbuf->head; 426 struct kvec *iov = rcvbuf->head;
425 struct page **page; 427 struct page **page;
426 int hdrlen, recvd; 428 size_t hdrlen;
429 unsigned int pglen, recvd;
430 u32 len;
427 int status, nr; 431 int status, nr;
428 unsigned int len, pglen;
429 __be32 *end, *entry, *kaddr; 432 __be32 *end, *entry, *kaddr;
430 433
431 if ((status = ntohl(*p++))) 434 if ((status = ntohl(*p++)))
@@ -434,7 +437,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
434 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 437 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
435 if (iov->iov_len < hdrlen) { 438 if (iov->iov_len < hdrlen) {
436 dprintk("NFS: READDIR reply header overflowed:" 439 dprintk("NFS: READDIR reply header overflowed:"
437 "length %d > %Zu\n", hdrlen, iov->iov_len); 440 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
438 return -errno_NFSERR_IO; 441 return -errno_NFSERR_IO;
439 } else if (iov->iov_len != hdrlen) { 442 } else if (iov->iov_len != hdrlen) {
440 dprintk("NFS: READDIR header is short. iovec will be shifted.\n"); 443 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -576,7 +579,8 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
576{ 579{
577 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 580 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
578 struct kvec *iov = rcvbuf->head; 581 struct kvec *iov = rcvbuf->head;
579 int hdrlen, len, recvd; 582 size_t hdrlen;
583 u32 len, recvd;
580 char *kaddr; 584 char *kaddr;
581 int status; 585 int status;
582 586
@@ -584,14 +588,14 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
584 return -nfs_stat_to_errno(status); 588 return -nfs_stat_to_errno(status);
585 /* Convert length of symlink */ 589 /* Convert length of symlink */
586 len = ntohl(*p++); 590 len = ntohl(*p++);
587 if (len >= rcvbuf->page_len || len <= 0) { 591 if (len >= rcvbuf->page_len) {
588 dprintk("nfs: server returned giant symlink!\n"); 592 dprintk("nfs: server returned giant symlink!\n");
589 return -ENAMETOOLONG; 593 return -ENAMETOOLONG;
590 } 594 }
591 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 595 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
592 if (iov->iov_len < hdrlen) { 596 if (iov->iov_len < hdrlen) {
593 dprintk("NFS: READLINK reply header overflowed:" 597 dprintk("NFS: READLINK reply header overflowed:"
594 "length %d > %Zu\n", hdrlen, iov->iov_len); 598 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
595 return -errno_NFSERR_IO; 599 return -errno_NFSERR_IO;
596 } else if (iov->iov_len != hdrlen) { 600 } else if (iov->iov_len != hdrlen) {
597 dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); 601 dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 5ae96340f2c2..549dbce714a4 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -729,16 +729,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
729 return 0; 729 return 0;
730} 730}
731 731
732static void nfs3_proc_read_setup(struct nfs_read_data *data) 732static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
733{ 733{
734 struct rpc_message msg = { 734 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
735 .rpc_proc = &nfs3_procedures[NFS3PROC_READ],
736 .rpc_argp = &data->args,
737 .rpc_resp = &data->res,
738 .rpc_cred = data->cred,
739 };
740
741 rpc_call_setup(&data->task, &msg, 0);
742} 735}
743 736
744static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) 737static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -750,24 +743,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
750 return 0; 743 return 0;
751} 744}
752 745
753static void nfs3_proc_write_setup(struct nfs_write_data *data, int how) 746static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
754{ 747{
755 struct rpc_message msg = { 748 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
756 .rpc_proc = &nfs3_procedures[NFS3PROC_WRITE],
757 .rpc_argp = &data->args,
758 .rpc_resp = &data->res,
759 .rpc_cred = data->cred,
760 };
761
762 data->args.stable = NFS_UNSTABLE;
763 if (how & FLUSH_STABLE) {
764 data->args.stable = NFS_FILE_SYNC;
765 if (NFS_I(data->inode)->ncommit)
766 data->args.stable = NFS_DATA_SYNC;
767 }
768
769 /* Finalize the task. */
770 rpc_call_setup(&data->task, &msg, 0);
771} 749}
772 750
773static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) 751static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -778,22 +756,17 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
778 return 0; 756 return 0;
779} 757}
780 758
781static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how) 759static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
782{ 760{
783 struct rpc_message msg = { 761 msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
784 .rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT],
785 .rpc_argp = &data->args,
786 .rpc_resp = &data->res,
787 .rpc_cred = data->cred,
788 };
789
790 rpc_call_setup(&data->task, &msg, 0);
791} 762}
792 763
793static int 764static int
794nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) 765nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
795{ 766{
796 return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl); 767 struct inode *inode = filp->f_path.dentry->d_inode;
768
769 return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
797} 770}
798 771
799const struct nfs_rpc_ops nfs_v3_clientops = { 772const struct nfs_rpc_ops nfs_v3_clientops = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 616d3267b7e7..3917e2fa4e40 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -506,9 +506,9 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
506 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 506 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
507 struct kvec *iov = rcvbuf->head; 507 struct kvec *iov = rcvbuf->head;
508 struct page **page; 508 struct page **page;
509 int hdrlen, recvd; 509 size_t hdrlen;
510 u32 len, recvd, pglen;
510 int status, nr; 511 int status, nr;
511 unsigned int len, pglen;
512 __be32 *entry, *end, *kaddr; 512 __be32 *entry, *end, *kaddr;
513 513
514 status = ntohl(*p++); 514 status = ntohl(*p++);
@@ -527,7 +527,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
527 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 527 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
528 if (iov->iov_len < hdrlen) { 528 if (iov->iov_len < hdrlen) {
529 dprintk("NFS: READDIR reply header overflowed:" 529 dprintk("NFS: READDIR reply header overflowed:"
530 "length %d > %Zu\n", hdrlen, iov->iov_len); 530 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
531 return -errno_NFSERR_IO; 531 return -errno_NFSERR_IO;
532 } else if (iov->iov_len != hdrlen) { 532 } else if (iov->iov_len != hdrlen) {
533 dprintk("NFS: READDIR header is short. iovec will be shifted.\n"); 533 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -549,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
549 len = ntohl(*p++); /* string length */ 549 len = ntohl(*p++); /* string length */
550 p += XDR_QUADLEN(len) + 2; /* name + cookie */ 550 p += XDR_QUADLEN(len) + 2; /* name + cookie */
551 if (len > NFS3_MAXNAMLEN) { 551 if (len > NFS3_MAXNAMLEN) {
552 dprintk("NFS: giant filename in readdir (len %x)!\n", 552 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
553 len); 553 len);
554 goto err_unmap; 554 goto err_unmap;
555 } 555 }
@@ -570,7 +570,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
570 len = ntohl(*p++); 570 len = ntohl(*p++);
571 if (len > NFS3_FHSIZE) { 571 if (len > NFS3_FHSIZE) {
572 dprintk("NFS: giant filehandle in " 572 dprintk("NFS: giant filehandle in "
573 "readdir (len %x)!\n", len); 573 "readdir (len 0x%x)!\n", len);
574 goto err_unmap; 574 goto err_unmap;
575 } 575 }
576 p += XDR_QUADLEN(len); 576 p += XDR_QUADLEN(len);
@@ -815,7 +815,8 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
815{ 815{
816 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 816 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
817 struct kvec *iov = rcvbuf->head; 817 struct kvec *iov = rcvbuf->head;
818 int hdrlen, len, recvd; 818 size_t hdrlen;
819 u32 len, recvd;
819 char *kaddr; 820 char *kaddr;
820 int status; 821 int status;
821 822
@@ -827,7 +828,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
827 828
828 /* Convert length of symlink */ 829 /* Convert length of symlink */
829 len = ntohl(*p++); 830 len = ntohl(*p++);
830 if (len >= rcvbuf->page_len || len <= 0) { 831 if (len >= rcvbuf->page_len) {
831 dprintk("nfs: server returned giant symlink!\n"); 832 dprintk("nfs: server returned giant symlink!\n");
832 return -ENAMETOOLONG; 833 return -ENAMETOOLONG;
833 } 834 }
@@ -835,7 +836,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
835 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 836 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
836 if (iov->iov_len < hdrlen) { 837 if (iov->iov_len < hdrlen) {
837 dprintk("NFS: READLINK reply header overflowed:" 838 dprintk("NFS: READLINK reply header overflowed:"
838 "length %d > %Zu\n", hdrlen, iov->iov_len); 839 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
839 return -errno_NFSERR_IO; 840 return -errno_NFSERR_IO;
840 } else if (iov->iov_len != hdrlen) { 841 } else if (iov->iov_len != hdrlen) {
841 dprintk("NFS: READLINK header is short. " 842 dprintk("NFS: READLINK header is short. "
@@ -863,7 +864,9 @@ static int
863nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 864nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
864{ 865{
865 struct kvec *iov = req->rq_rcv_buf.head; 866 struct kvec *iov = req->rq_rcv_buf.head;
866 int status, count, ocount, recvd, hdrlen; 867 size_t hdrlen;
868 u32 count, ocount, recvd;
869 int status;
867 870
868 status = ntohl(*p++); 871 status = ntohl(*p++);
869 p = xdr_decode_post_op_attr(p, res->fattr); 872 p = xdr_decode_post_op_attr(p, res->fattr);
@@ -871,7 +874,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
871 if (status != 0) 874 if (status != 0)
872 return -nfs_stat_to_errno(status); 875 return -nfs_stat_to_errno(status);
873 876
874 /* Decode reply could and EOF flag. NFSv3 is somewhat redundant 877 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant
875 * in that it puts the count both in the res struct and in the 878 * in that it puts the count both in the res struct and in the
876 * opaque data count. */ 879 * opaque data count. */
877 count = ntohl(*p++); 880 count = ntohl(*p++);
@@ -886,7 +889,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
886 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 889 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
887 if (iov->iov_len < hdrlen) { 890 if (iov->iov_len < hdrlen) {
888 dprintk("NFS: READ reply header overflowed:" 891 dprintk("NFS: READ reply header overflowed:"
889 "length %d > %Zu\n", hdrlen, iov->iov_len); 892 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
890 return -errno_NFSERR_IO; 893 return -errno_NFSERR_IO;
891 } else if (iov->iov_len != hdrlen) { 894 } else if (iov->iov_len != hdrlen) {
892 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 895 dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -896,7 +899,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
896 recvd = req->rq_rcv_buf.len - hdrlen; 899 recvd = req->rq_rcv_buf.len - hdrlen;
897 if (count > recvd) { 900 if (count > recvd) {
898 dprintk("NFS: server cheating in read reply: " 901 dprintk("NFS: server cheating in read reply: "
899 "count %d > recvd %d\n", count, recvd); 902 "count %u > recvd %u\n", count, recvd);
900 count = recvd; 903 count = recvd;
901 res->eof = 0; 904 res->eof = 0;
902 } 905 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b35069a2aa9e..bd1b9d663fb9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -115,6 +115,7 @@ struct nfs4_lock_state {
115#define NFS_LOCK_INITIALIZED 1 115#define NFS_LOCK_INITIALIZED 1
116 int ls_flags; 116 int ls_flags;
117 struct nfs_seqid_counter ls_seqid; 117 struct nfs_seqid_counter ls_seqid;
118 struct rpc_sequence ls_sequence;
118 struct nfs_unique_id ls_id; 119 struct nfs_unique_id ls_id;
119 nfs4_stateid ls_stateid; 120 nfs4_stateid ls_stateid;
120 atomic_t ls_count; 121 atomic_t ls_count;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index dd5fef20c702..5f9ba41ed5bf 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -114,10 +114,7 @@ static inline int valid_ipaddr4(const char *buf)
114 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 114 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
115 * @mnt_parent - mountpoint of parent directory 115 * @mnt_parent - mountpoint of parent directory
116 * @dentry - parent directory 116 * @dentry - parent directory
117 * @fspath - fs path returned in fs_locations 117 * @locations - array of NFSv4 server location information
118 * @mntpath - mount path to new server
119 * @hostname - hostname of new server
120 * @addr - host addr of new server
121 * 118 *
122 */ 119 */
123static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 120static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
@@ -131,7 +128,8 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
131 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 128 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
132 }; 129 };
133 char *page = NULL, *page2 = NULL; 130 char *page = NULL, *page2 = NULL;
134 int loc, s, error; 131 unsigned int s;
132 int loc, error;
135 133
136 if (locations == NULL || locations->nlocations <= 0) 134 if (locations == NULL || locations->nlocations <= 0)
137 goto out; 135 goto out;
@@ -174,7 +172,10 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
174 172
175 s = 0; 173 s = 0;
176 while (s < location->nservers) { 174 while (s < location->nservers) {
177 struct sockaddr_in addr = {}; 175 struct sockaddr_in addr = {
176 .sin_family = AF_INET,
177 .sin_port = htons(NFS_PORT),
178 };
178 179
179 if (location->servers[s].len <= 0 || 180 if (location->servers[s].len <= 0 ||
180 valid_ipaddr4(location->servers[s].data) < 0) { 181 valid_ipaddr4(location->servers[s].data) < 0) {
@@ -183,10 +184,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
183 } 184 }
184 185
185 mountdata.hostname = location->servers[s].data; 186 mountdata.hostname = location->servers[s].data;
186 addr.sin_addr.s_addr = in_aton(mountdata.hostname); 187 addr.sin_addr.s_addr = in_aton(mountdata.hostname),
187 addr.sin_family = AF_INET; 188 mountdata.addr = (struct sockaddr *)&addr;
188 addr.sin_port = htons(NFS_PORT); 189 mountdata.addrlen = sizeof(addr);
189 mountdata.addr = &addr;
190 190
191 snprintf(page, PAGE_SIZE, "%s:%s", 191 snprintf(page, PAGE_SIZE, "%s:%s",
192 mountdata.hostname, 192 mountdata.hostname,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c4faa43b36de..027e1095256e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -210,7 +210,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
210 spin_lock(&dir->i_lock); 210 spin_lock(&dir->i_lock);
211 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; 211 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
212 if (!cinfo->atomic || cinfo->before != nfsi->change_attr) 212 if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
213 nfsi->cache_change_attribute = jiffies; 213 nfs_force_lookup_revalidate(dir);
214 nfsi->change_attr = cinfo->after; 214 nfsi->change_attr = cinfo->after;
215 spin_unlock(&dir->i_lock); 215 spin_unlock(&dir->i_lock);
216} 216}
@@ -715,19 +715,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
715 return err; 715 return err;
716} 716}
717 717
718static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
719{
720 struct nfs4_opendata *data = calldata;
721 struct rpc_message msg = {
722 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
723 .rpc_argp = &data->c_arg,
724 .rpc_resp = &data->c_res,
725 .rpc_cred = data->owner->so_cred,
726 };
727 data->timestamp = jiffies;
728 rpc_call_setup(task, &msg, 0);
729}
730
731static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 718static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
732{ 719{
733 struct nfs4_opendata *data = calldata; 720 struct nfs4_opendata *data = calldata;
@@ -738,10 +725,10 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
738 if (data->rpc_status == 0) { 725 if (data->rpc_status == 0) {
739 memcpy(data->o_res.stateid.data, data->c_res.stateid.data, 726 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
740 sizeof(data->o_res.stateid.data)); 727 sizeof(data->o_res.stateid.data));
728 nfs_confirm_seqid(&data->owner->so_seqid, 0);
741 renew_lease(data->o_res.server, data->timestamp); 729 renew_lease(data->o_res.server, data->timestamp);
742 data->rpc_done = 1; 730 data->rpc_done = 1;
743 } 731 }
744 nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status);
745 nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); 732 nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
746} 733}
747 734
@@ -756,7 +743,6 @@ static void nfs4_open_confirm_release(void *calldata)
756 /* In case of error, no cleanup! */ 743 /* In case of error, no cleanup! */
757 if (!data->rpc_done) 744 if (!data->rpc_done)
758 goto out_free; 745 goto out_free;
759 nfs_confirm_seqid(&data->owner->so_seqid, 0);
760 state = nfs4_opendata_to_nfs4_state(data); 746 state = nfs4_opendata_to_nfs4_state(data);
761 if (!IS_ERR(state)) 747 if (!IS_ERR(state))
762 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 748 nfs4_close_state(&data->path, state, data->o_arg.open_flags);
@@ -765,7 +751,6 @@ out_free:
765} 751}
766 752
767static const struct rpc_call_ops nfs4_open_confirm_ops = { 753static const struct rpc_call_ops nfs4_open_confirm_ops = {
768 .rpc_call_prepare = nfs4_open_confirm_prepare,
769 .rpc_call_done = nfs4_open_confirm_done, 754 .rpc_call_done = nfs4_open_confirm_done,
770 .rpc_release = nfs4_open_confirm_release, 755 .rpc_release = nfs4_open_confirm_release,
771}; 756};
@@ -777,12 +762,26 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
777{ 762{
778 struct nfs_server *server = NFS_SERVER(data->dir->d_inode); 763 struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
779 struct rpc_task *task; 764 struct rpc_task *task;
765 struct rpc_message msg = {
766 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
767 .rpc_argp = &data->c_arg,
768 .rpc_resp = &data->c_res,
769 .rpc_cred = data->owner->so_cred,
770 };
771 struct rpc_task_setup task_setup_data = {
772 .rpc_client = server->client,
773 .rpc_message = &msg,
774 .callback_ops = &nfs4_open_confirm_ops,
775 .callback_data = data,
776 .flags = RPC_TASK_ASYNC,
777 };
780 int status; 778 int status;
781 779
782 kref_get(&data->kref); 780 kref_get(&data->kref);
783 data->rpc_done = 0; 781 data->rpc_done = 0;
784 data->rpc_status = 0; 782 data->rpc_status = 0;
785 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data); 783 data->timestamp = jiffies;
784 task = rpc_run_task(&task_setup_data);
786 if (IS_ERR(task)) 785 if (IS_ERR(task))
787 return PTR_ERR(task); 786 return PTR_ERR(task);
788 status = nfs4_wait_for_completion_rpc_task(task); 787 status = nfs4_wait_for_completion_rpc_task(task);
@@ -799,13 +798,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
799{ 798{
800 struct nfs4_opendata *data = calldata; 799 struct nfs4_opendata *data = calldata;
801 struct nfs4_state_owner *sp = data->owner; 800 struct nfs4_state_owner *sp = data->owner;
802 struct rpc_message msg = { 801
803 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
804 .rpc_argp = &data->o_arg,
805 .rpc_resp = &data->o_res,
806 .rpc_cred = sp->so_cred,
807 };
808
809 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) 802 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
810 return; 803 return;
811 /* 804 /*
@@ -830,11 +823,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
830 data->o_arg.id = sp->so_owner_id.id; 823 data->o_arg.id = sp->so_owner_id.id;
831 data->o_arg.clientid = sp->so_client->cl_clientid; 824 data->o_arg.clientid = sp->so_client->cl_clientid;
832 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { 825 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
833 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 826 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
834 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 827 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
835 } 828 }
836 data->timestamp = jiffies; 829 data->timestamp = jiffies;
837 rpc_call_setup(task, &msg, 0); 830 rpc_call_start(task);
838 return; 831 return;
839out_no_action: 832out_no_action:
840 task->tk_action = NULL; 833 task->tk_action = NULL;
@@ -883,7 +876,6 @@ static void nfs4_open_release(void *calldata)
883 /* In case we need an open_confirm, no cleanup! */ 876 /* In case we need an open_confirm, no cleanup! */
884 if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) 877 if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
885 goto out_free; 878 goto out_free;
886 nfs_confirm_seqid(&data->owner->so_seqid, 0);
887 state = nfs4_opendata_to_nfs4_state(data); 879 state = nfs4_opendata_to_nfs4_state(data);
888 if (!IS_ERR(state)) 880 if (!IS_ERR(state))
889 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 881 nfs4_close_state(&data->path, state, data->o_arg.open_flags);
@@ -907,13 +899,26 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
907 struct nfs_openargs *o_arg = &data->o_arg; 899 struct nfs_openargs *o_arg = &data->o_arg;
908 struct nfs_openres *o_res = &data->o_res; 900 struct nfs_openres *o_res = &data->o_res;
909 struct rpc_task *task; 901 struct rpc_task *task;
902 struct rpc_message msg = {
903 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
904 .rpc_argp = o_arg,
905 .rpc_resp = o_res,
906 .rpc_cred = data->owner->so_cred,
907 };
908 struct rpc_task_setup task_setup_data = {
909 .rpc_client = server->client,
910 .rpc_message = &msg,
911 .callback_ops = &nfs4_open_ops,
912 .callback_data = data,
913 .flags = RPC_TASK_ASYNC,
914 };
910 int status; 915 int status;
911 916
912 kref_get(&data->kref); 917 kref_get(&data->kref);
913 data->rpc_done = 0; 918 data->rpc_done = 0;
914 data->rpc_status = 0; 919 data->rpc_status = 0;
915 data->cancelled = 0; 920 data->cancelled = 0;
916 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data); 921 task = rpc_run_task(&task_setup_data);
917 if (IS_ERR(task)) 922 if (IS_ERR(task))
918 return PTR_ERR(task); 923 return PTR_ERR(task);
919 status = nfs4_wait_for_completion_rpc_task(task); 924 status = nfs4_wait_for_completion_rpc_task(task);
@@ -1243,12 +1248,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1243{ 1248{
1244 struct nfs4_closedata *calldata = data; 1249 struct nfs4_closedata *calldata = data;
1245 struct nfs4_state *state = calldata->state; 1250 struct nfs4_state *state = calldata->state;
1246 struct rpc_message msg = {
1247 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
1248 .rpc_argp = &calldata->arg,
1249 .rpc_resp = &calldata->res,
1250 .rpc_cred = state->owner->so_cred,
1251 };
1252 int clear_rd, clear_wr, clear_rdwr; 1251 int clear_rd, clear_wr, clear_rdwr;
1253 1252
1254 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 1253 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
@@ -1275,14 +1274,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1275 } 1274 }
1276 nfs_fattr_init(calldata->res.fattr); 1275 nfs_fattr_init(calldata->res.fattr);
1277 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { 1276 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
1278 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1277 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
1279 calldata->arg.open_flags = FMODE_READ; 1278 calldata->arg.open_flags = FMODE_READ;
1280 } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { 1279 } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
1281 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1280 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
1282 calldata->arg.open_flags = FMODE_WRITE; 1281 calldata->arg.open_flags = FMODE_WRITE;
1283 } 1282 }
1284 calldata->timestamp = jiffies; 1283 calldata->timestamp = jiffies;
1285 rpc_call_setup(task, &msg, 0); 1284 rpc_call_start(task);
1286} 1285}
1287 1286
1288static const struct rpc_call_ops nfs4_close_ops = { 1287static const struct rpc_call_ops nfs4_close_ops = {
@@ -1308,6 +1307,16 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1308 struct nfs4_closedata *calldata; 1307 struct nfs4_closedata *calldata;
1309 struct nfs4_state_owner *sp = state->owner; 1308 struct nfs4_state_owner *sp = state->owner;
1310 struct rpc_task *task; 1309 struct rpc_task *task;
1310 struct rpc_message msg = {
1311 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
1312 .rpc_cred = state->owner->so_cred,
1313 };
1314 struct rpc_task_setup task_setup_data = {
1315 .rpc_client = server->client,
1316 .rpc_message = &msg,
1317 .callback_ops = &nfs4_close_ops,
1318 .flags = RPC_TASK_ASYNC,
1319 };
1311 int status = -ENOMEM; 1320 int status = -ENOMEM;
1312 1321
1313 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); 1322 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
@@ -1327,7 +1336,10 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1327 calldata->path.mnt = mntget(path->mnt); 1336 calldata->path.mnt = mntget(path->mnt);
1328 calldata->path.dentry = dget(path->dentry); 1337 calldata->path.dentry = dget(path->dentry);
1329 1338
1330 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_close_ops, calldata); 1339 msg.rpc_argp = &calldata->arg,
1340 msg.rpc_resp = &calldata->res,
1341 task_setup_data.callback_data = calldata;
1342 task = rpc_run_task(&task_setup_data);
1331 if (IS_ERR(task)) 1343 if (IS_ERR(task))
1332 return PTR_ERR(task); 1344 return PTR_ERR(task);
1333 status = 0; 1345 status = 0;
@@ -2413,18 +2425,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
2413 return 0; 2425 return 0;
2414} 2426}
2415 2427
2416static void nfs4_proc_read_setup(struct nfs_read_data *data) 2428static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
2417{ 2429{
2418 struct rpc_message msg = {
2419 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ],
2420 .rpc_argp = &data->args,
2421 .rpc_resp = &data->res,
2422 .rpc_cred = data->cred,
2423 };
2424
2425 data->timestamp = jiffies; 2430 data->timestamp = jiffies;
2426 2431 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
2427 rpc_call_setup(&data->task, &msg, 0);
2428} 2432}
2429 2433
2430static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 2434static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2442,33 +2446,15 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
2442 return 0; 2446 return 0;
2443} 2447}
2444 2448
2445static void nfs4_proc_write_setup(struct nfs_write_data *data, int how) 2449static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
2446{ 2450{
2447 struct rpc_message msg = { 2451 struct nfs_server *server = NFS_SERVER(data->inode);
2448 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE], 2452
2449 .rpc_argp = &data->args,
2450 .rpc_resp = &data->res,
2451 .rpc_cred = data->cred,
2452 };
2453 struct inode *inode = data->inode;
2454 struct nfs_server *server = NFS_SERVER(inode);
2455 int stable;
2456
2457 if (how & FLUSH_STABLE) {
2458 if (!NFS_I(inode)->ncommit)
2459 stable = NFS_FILE_SYNC;
2460 else
2461 stable = NFS_DATA_SYNC;
2462 } else
2463 stable = NFS_UNSTABLE;
2464 data->args.stable = stable;
2465 data->args.bitmask = server->attr_bitmask; 2453 data->args.bitmask = server->attr_bitmask;
2466 data->res.server = server; 2454 data->res.server = server;
2467
2468 data->timestamp = jiffies; 2455 data->timestamp = jiffies;
2469 2456
2470 /* Finalize the task. */ 2457 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
2471 rpc_call_setup(&data->task, &msg, 0);
2472} 2458}
2473 2459
2474static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) 2460static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2483,20 +2469,13 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
2483 return 0; 2469 return 0;
2484} 2470}
2485 2471
2486static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how) 2472static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
2487{ 2473{
2488 struct rpc_message msg = {
2489 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
2490 .rpc_argp = &data->args,
2491 .rpc_resp = &data->res,
2492 .rpc_cred = data->cred,
2493 };
2494 struct nfs_server *server = NFS_SERVER(data->inode); 2474 struct nfs_server *server = NFS_SERVER(data->inode);
2495 2475
2496 data->args.bitmask = server->attr_bitmask; 2476 data->args.bitmask = server->attr_bitmask;
2497 data->res.server = server; 2477 data->res.server = server;
2498 2478 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
2499 rpc_call_setup(&data->task, &msg, 0);
2500} 2479}
2501 2480
2502/* 2481/*
@@ -2899,14 +2878,20 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
2899 2878
2900 for(;;) { 2879 for(;;) {
2901 setclientid.sc_name_len = scnprintf(setclientid.sc_name, 2880 setclientid.sc_name_len = scnprintf(setclientid.sc_name,
2902 sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", 2881 sizeof(setclientid.sc_name), "%s/%s %s %s %u",
2903 clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr), 2882 clp->cl_ipaddr,
2883 rpc_peeraddr2str(clp->cl_rpcclient,
2884 RPC_DISPLAY_ADDR),
2885 rpc_peeraddr2str(clp->cl_rpcclient,
2886 RPC_DISPLAY_PROTO),
2904 cred->cr_ops->cr_name, 2887 cred->cr_ops->cr_name,
2905 clp->cl_id_uniquifier); 2888 clp->cl_id_uniquifier);
2906 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 2889 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
2907 sizeof(setclientid.sc_netid), "tcp"); 2890 sizeof(setclientid.sc_netid),
2891 rpc_peeraddr2str(clp->cl_rpcclient,
2892 RPC_DISPLAY_NETID));
2908 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, 2893 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
2909 sizeof(setclientid.sc_uaddr), "%s.%d.%d", 2894 sizeof(setclientid.sc_uaddr), "%s.%u.%u",
2910 clp->cl_ipaddr, port >> 8, port & 255); 2895 clp->cl_ipaddr, port >> 8, port & 255);
2911 2896
2912 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 2897 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
@@ -2970,25 +2955,11 @@ struct nfs4_delegreturndata {
2970 struct nfs4_delegreturnres res; 2955 struct nfs4_delegreturnres res;
2971 struct nfs_fh fh; 2956 struct nfs_fh fh;
2972 nfs4_stateid stateid; 2957 nfs4_stateid stateid;
2973 struct rpc_cred *cred;
2974 unsigned long timestamp; 2958 unsigned long timestamp;
2975 struct nfs_fattr fattr; 2959 struct nfs_fattr fattr;
2976 int rpc_status; 2960 int rpc_status;
2977}; 2961};
2978 2962
2979static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata)
2980{
2981 struct nfs4_delegreturndata *data = calldata;
2982 struct rpc_message msg = {
2983 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
2984 .rpc_argp = &data->args,
2985 .rpc_resp = &data->res,
2986 .rpc_cred = data->cred,
2987 };
2988 nfs_fattr_init(data->res.fattr);
2989 rpc_call_setup(task, &msg, 0);
2990}
2991
2992static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) 2963static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
2993{ 2964{
2994 struct nfs4_delegreturndata *data = calldata; 2965 struct nfs4_delegreturndata *data = calldata;
@@ -2999,24 +2970,30 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
2999 2970
3000static void nfs4_delegreturn_release(void *calldata) 2971static void nfs4_delegreturn_release(void *calldata)
3001{ 2972{
3002 struct nfs4_delegreturndata *data = calldata;
3003
3004 put_rpccred(data->cred);
3005 kfree(calldata); 2973 kfree(calldata);
3006} 2974}
3007 2975
3008static const struct rpc_call_ops nfs4_delegreturn_ops = { 2976static const struct rpc_call_ops nfs4_delegreturn_ops = {
3009 .rpc_call_prepare = nfs4_delegreturn_prepare,
3010 .rpc_call_done = nfs4_delegreturn_done, 2977 .rpc_call_done = nfs4_delegreturn_done,
3011 .rpc_release = nfs4_delegreturn_release, 2978 .rpc_release = nfs4_delegreturn_release,
3012}; 2979};
3013 2980
3014static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) 2981static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
3015{ 2982{
3016 struct nfs4_delegreturndata *data; 2983 struct nfs4_delegreturndata *data;
3017 struct nfs_server *server = NFS_SERVER(inode); 2984 struct nfs_server *server = NFS_SERVER(inode);
3018 struct rpc_task *task; 2985 struct rpc_task *task;
3019 int status; 2986 struct rpc_message msg = {
2987 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
2988 .rpc_cred = cred,
2989 };
2990 struct rpc_task_setup task_setup_data = {
2991 .rpc_client = server->client,
2992 .rpc_message = &msg,
2993 .callback_ops = &nfs4_delegreturn_ops,
2994 .flags = RPC_TASK_ASYNC,
2995 };
2996 int status = 0;
3020 2997
3021 data = kmalloc(sizeof(*data), GFP_KERNEL); 2998 data = kmalloc(sizeof(*data), GFP_KERNEL);
3022 if (data == NULL) 2999 if (data == NULL)
@@ -3028,30 +3005,37 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3028 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3005 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3029 data->res.fattr = &data->fattr; 3006 data->res.fattr = &data->fattr;
3030 data->res.server = server; 3007 data->res.server = server;
3031 data->cred = get_rpccred(cred); 3008 nfs_fattr_init(data->res.fattr);
3032 data->timestamp = jiffies; 3009 data->timestamp = jiffies;
3033 data->rpc_status = 0; 3010 data->rpc_status = 0;
3034 3011
3035 task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data); 3012 task_setup_data.callback_data = data;
3013 msg.rpc_argp = &data->args,
3014 msg.rpc_resp = &data->res,
3015 task = rpc_run_task(&task_setup_data);
3036 if (IS_ERR(task)) 3016 if (IS_ERR(task))
3037 return PTR_ERR(task); 3017 return PTR_ERR(task);
3018 if (!issync)
3019 goto out;
3038 status = nfs4_wait_for_completion_rpc_task(task); 3020 status = nfs4_wait_for_completion_rpc_task(task);
3039 if (status == 0) { 3021 if (status != 0)
3040 status = data->rpc_status; 3022 goto out;
3041 if (status == 0) 3023 status = data->rpc_status;
3042 nfs_refresh_inode(inode, &data->fattr); 3024 if (status != 0)
3043 } 3025 goto out;
3026 nfs_refresh_inode(inode, &data->fattr);
3027out:
3044 rpc_put_task(task); 3028 rpc_put_task(task);
3045 return status; 3029 return status;
3046} 3030}
3047 3031
3048int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) 3032int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
3049{ 3033{
3050 struct nfs_server *server = NFS_SERVER(inode); 3034 struct nfs_server *server = NFS_SERVER(inode);
3051 struct nfs4_exception exception = { }; 3035 struct nfs4_exception exception = { };
3052 int err; 3036 int err;
3053 do { 3037 do {
3054 err = _nfs4_proc_delegreturn(inode, cred, stateid); 3038 err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
3055 switch (err) { 3039 switch (err) {
3056 case -NFS4ERR_STALE_STATEID: 3040 case -NFS4ERR_STALE_STATEID:
3057 case -NFS4ERR_EXPIRED: 3041 case -NFS4ERR_EXPIRED:
@@ -3219,12 +3203,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3219static void nfs4_locku_prepare(struct rpc_task *task, void *data) 3203static void nfs4_locku_prepare(struct rpc_task *task, void *data)
3220{ 3204{
3221 struct nfs4_unlockdata *calldata = data; 3205 struct nfs4_unlockdata *calldata = data;
3222 struct rpc_message msg = {
3223 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
3224 .rpc_argp = &calldata->arg,
3225 .rpc_resp = &calldata->res,
3226 .rpc_cred = calldata->lsp->ls_state->owner->so_cred,
3227 };
3228 3206
3229 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 3207 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
3230 return; 3208 return;
@@ -3234,7 +3212,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
3234 return; 3212 return;
3235 } 3213 }
3236 calldata->timestamp = jiffies; 3214 calldata->timestamp = jiffies;
3237 rpc_call_setup(task, &msg, 0); 3215 rpc_call_start(task);
3238} 3216}
3239 3217
3240static const struct rpc_call_ops nfs4_locku_ops = { 3218static const struct rpc_call_ops nfs4_locku_ops = {
@@ -3249,6 +3227,16 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3249 struct nfs_seqid *seqid) 3227 struct nfs_seqid *seqid)
3250{ 3228{
3251 struct nfs4_unlockdata *data; 3229 struct nfs4_unlockdata *data;
3230 struct rpc_message msg = {
3231 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
3232 .rpc_cred = ctx->cred,
3233 };
3234 struct rpc_task_setup task_setup_data = {
3235 .rpc_client = NFS_CLIENT(lsp->ls_state->inode),
3236 .rpc_message = &msg,
3237 .callback_ops = &nfs4_locku_ops,
3238 .flags = RPC_TASK_ASYNC,
3239 };
3252 3240
3253 /* Ensure this is an unlock - when canceling a lock, the 3241 /* Ensure this is an unlock - when canceling a lock, the
3254 * canceled lock is passed in, and it won't be an unlock. 3242 * canceled lock is passed in, and it won't be an unlock.
@@ -3261,7 +3249,10 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3261 return ERR_PTR(-ENOMEM); 3249 return ERR_PTR(-ENOMEM);
3262 } 3250 }
3263 3251
3264 return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data); 3252 msg.rpc_argp = &data->arg,
3253 msg.rpc_resp = &data->res,
3254 task_setup_data.callback_data = data;
3255 return rpc_run_task(&task_setup_data);
3265} 3256}
3266 3257
3267static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) 3258static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
@@ -3320,9 +3311,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3320 3311
3321 p->arg.fh = NFS_FH(inode); 3312 p->arg.fh = NFS_FH(inode);
3322 p->arg.fl = &p->fl; 3313 p->arg.fl = &p->fl;
3314 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
3315 if (p->arg.open_seqid == NULL)
3316 goto out_free;
3323 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); 3317 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
3324 if (p->arg.lock_seqid == NULL) 3318 if (p->arg.lock_seqid == NULL)
3325 goto out_free; 3319 goto out_free_seqid;
3326 p->arg.lock_stateid = &lsp->ls_stateid; 3320 p->arg.lock_stateid = &lsp->ls_stateid;
3327 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3321 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3328 p->arg.lock_owner.id = lsp->ls_id.id; 3322 p->arg.lock_owner.id = lsp->ls_id.id;
@@ -3331,6 +3325,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3331 p->ctx = get_nfs_open_context(ctx); 3325 p->ctx = get_nfs_open_context(ctx);
3332 memcpy(&p->fl, fl, sizeof(p->fl)); 3326 memcpy(&p->fl, fl, sizeof(p->fl));
3333 return p; 3327 return p;
3328out_free_seqid:
3329 nfs_free_seqid(p->arg.open_seqid);
3334out_free: 3330out_free:
3335 kfree(p); 3331 kfree(p);
3336 return NULL; 3332 return NULL;
@@ -3340,31 +3336,20 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
3340{ 3336{
3341 struct nfs4_lockdata *data = calldata; 3337 struct nfs4_lockdata *data = calldata;
3342 struct nfs4_state *state = data->lsp->ls_state; 3338 struct nfs4_state *state = data->lsp->ls_state;
3343 struct nfs4_state_owner *sp = state->owner;
3344 struct rpc_message msg = {
3345 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
3346 .rpc_argp = &data->arg,
3347 .rpc_resp = &data->res,
3348 .rpc_cred = sp->so_cred,
3349 };
3350 3339
3340 dprintk("%s: begin!\n", __FUNCTION__);
3351 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) 3341 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
3352 return; 3342 return;
3353 dprintk("%s: begin!\n", __FUNCTION__);
3354 /* Do we need to do an open_to_lock_owner? */ 3343 /* Do we need to do an open_to_lock_owner? */
3355 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { 3344 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
3356 data->arg.open_seqid = nfs_alloc_seqid(&sp->so_seqid); 3345 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
3357 if (data->arg.open_seqid == NULL) { 3346 return;
3358 data->rpc_status = -ENOMEM;
3359 task->tk_action = NULL;
3360 goto out;
3361 }
3362 data->arg.open_stateid = &state->stateid; 3347 data->arg.open_stateid = &state->stateid;
3363 data->arg.new_lock_owner = 1; 3348 data->arg.new_lock_owner = 1;
3364 } 3349 } else
3350 data->arg.new_lock_owner = 0;
3365 data->timestamp = jiffies; 3351 data->timestamp = jiffies;
3366 rpc_call_setup(task, &msg, 0); 3352 rpc_call_start(task);
3367out:
3368 dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status); 3353 dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
3369} 3354}
3370 3355
@@ -3400,8 +3385,7 @@ static void nfs4_lock_release(void *calldata)
3400 struct nfs4_lockdata *data = calldata; 3385 struct nfs4_lockdata *data = calldata;
3401 3386
3402 dprintk("%s: begin!\n", __FUNCTION__); 3387 dprintk("%s: begin!\n", __FUNCTION__);
3403 if (data->arg.open_seqid != NULL) 3388 nfs_free_seqid(data->arg.open_seqid);
3404 nfs_free_seqid(data->arg.open_seqid);
3405 if (data->cancelled != 0) { 3389 if (data->cancelled != 0) {
3406 struct rpc_task *task; 3390 struct rpc_task *task;
3407 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, 3391 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
@@ -3427,6 +3411,16 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
3427{ 3411{
3428 struct nfs4_lockdata *data; 3412 struct nfs4_lockdata *data;
3429 struct rpc_task *task; 3413 struct rpc_task *task;
3414 struct rpc_message msg = {
3415 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
3416 .rpc_cred = state->owner->so_cred,
3417 };
3418 struct rpc_task_setup task_setup_data = {
3419 .rpc_client = NFS_CLIENT(state->inode),
3420 .rpc_message = &msg,
3421 .callback_ops = &nfs4_lock_ops,
3422 .flags = RPC_TASK_ASYNC,
3423 };
3430 int ret; 3424 int ret;
3431 3425
3432 dprintk("%s: begin!\n", __FUNCTION__); 3426 dprintk("%s: begin!\n", __FUNCTION__);
@@ -3438,8 +3432,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
3438 data->arg.block = 1; 3432 data->arg.block = 1;
3439 if (reclaim != 0) 3433 if (reclaim != 0)
3440 data->arg.reclaim = 1; 3434 data->arg.reclaim = 1;
3441 task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC, 3435 msg.rpc_argp = &data->arg,
3442 &nfs4_lock_ops, data); 3436 msg.rpc_resp = &data->res,
3437 task_setup_data.callback_data = data;
3438 task = rpc_run_task(&task_setup_data);
3443 if (IS_ERR(task)) 3439 if (IS_ERR(task))
3444 return PTR_ERR(task); 3440 return PTR_ERR(task);
3445 ret = nfs4_wait_for_completion_rpc_task(task); 3441 ret = nfs4_wait_for_completion_rpc_task(task);
@@ -3612,10 +3608,6 @@ int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
3612 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) 3608 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
3613 return -EOPNOTSUPP; 3609 return -EOPNOTSUPP;
3614 3610
3615 if (!S_ISREG(inode->i_mode) &&
3616 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
3617 return -EPERM;
3618
3619 return nfs4_proc_set_acl(inode, buf, buflen); 3611 return nfs4_proc_set_acl(inode, buf, buflen);
3620} 3612}
3621 3613
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3ea352d82eba..5e2e4af1a0e6 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -133,9 +133,7 @@ nfs4_renewd_prepare_shutdown(struct nfs_server *server)
133void 133void
134nfs4_kill_renewd(struct nfs_client *clp) 134nfs4_kill_renewd(struct nfs_client *clp)
135{ 135{
136 down_read(&clp->cl_sem);
137 cancel_delayed_work_sync(&clp->cl_renewd); 136 cancel_delayed_work_sync(&clp->cl_renewd);
138 up_read(&clp->cl_sem);
139} 137}
140 138
141/* 139/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 23a9a36556bf..f9c7432471dc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -509,7 +509,10 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
509 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); 509 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
510 if (lsp == NULL) 510 if (lsp == NULL)
511 return NULL; 511 return NULL;
512 lsp->ls_seqid.sequence = &state->owner->so_sequence; 512 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
513 spin_lock_init(&lsp->ls_sequence.lock);
514 INIT_LIST_HEAD(&lsp->ls_sequence.list);
515 lsp->ls_seqid.sequence = &lsp->ls_sequence;
513 atomic_set(&lsp->ls_count, 1); 516 atomic_set(&lsp->ls_count, 1);
514 lsp->ls_owner = fl_owner; 517 lsp->ls_owner = fl_owner;
515 spin_lock(&clp->cl_lock); 518 spin_lock(&clp->cl_lock);
@@ -641,27 +644,26 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
641 644
642struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) 645struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
643{ 646{
644 struct rpc_sequence *sequence = counter->sequence;
645 struct nfs_seqid *new; 647 struct nfs_seqid *new;
646 648
647 new = kmalloc(sizeof(*new), GFP_KERNEL); 649 new = kmalloc(sizeof(*new), GFP_KERNEL);
648 if (new != NULL) { 650 if (new != NULL) {
649 new->sequence = counter; 651 new->sequence = counter;
650 spin_lock(&sequence->lock); 652 INIT_LIST_HEAD(&new->list);
651 list_add_tail(&new->list, &sequence->list);
652 spin_unlock(&sequence->lock);
653 } 653 }
654 return new; 654 return new;
655} 655}
656 656
657void nfs_free_seqid(struct nfs_seqid *seqid) 657void nfs_free_seqid(struct nfs_seqid *seqid)
658{ 658{
659 struct rpc_sequence *sequence = seqid->sequence->sequence; 659 if (!list_empty(&seqid->list)) {
660 struct rpc_sequence *sequence = seqid->sequence->sequence;
660 661
661 spin_lock(&sequence->lock); 662 spin_lock(&sequence->lock);
662 list_del(&seqid->list); 663 list_del(&seqid->list);
663 spin_unlock(&sequence->lock); 664 spin_unlock(&sequence->lock);
664 rpc_wake_up(&sequence->wait); 665 rpc_wake_up(&sequence->wait);
666 }
665 kfree(seqid); 667 kfree(seqid);
666} 668}
667 669
@@ -672,6 +674,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
672 */ 674 */
673static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 675static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
674{ 676{
677 BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid);
675 switch (status) { 678 switch (status) {
676 case 0: 679 case 0:
677 break; 680 break;
@@ -723,15 +726,15 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
723 struct rpc_sequence *sequence = seqid->sequence->sequence; 726 struct rpc_sequence *sequence = seqid->sequence->sequence;
724 int status = 0; 727 int status = 0;
725 728
726 if (sequence->list.next == &seqid->list)
727 goto out;
728 spin_lock(&sequence->lock); 729 spin_lock(&sequence->lock);
729 if (sequence->list.next != &seqid->list) { 730 if (list_empty(&seqid->list))
730 rpc_sleep_on(&sequence->wait, task, NULL, NULL); 731 list_add_tail(&seqid->list, &sequence->list);
731 status = -EAGAIN; 732 if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
732 } 733 goto unlock;
734 rpc_sleep_on(&sequence->wait, task, NULL, NULL);
735 status = -EAGAIN;
736unlock:
733 spin_unlock(&sequence->lock); 737 spin_unlock(&sequence->lock);
734out:
735 return status; 738 return status;
736} 739}
737 740
@@ -755,8 +758,9 @@ static void nfs4_recover_state(struct nfs_client *clp)
755 758
756 __module_get(THIS_MODULE); 759 __module_get(THIS_MODULE);
757 atomic_inc(&clp->cl_count); 760 atomic_inc(&clp->cl_count);
758 task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim", 761 task = kthread_run(reclaimer, clp, "%s-reclaim",
759 NIPQUAD(clp->cl_addr.sin_addr)); 762 rpc_peeraddr2str(clp->cl_rpcclient,
763 RPC_DISPLAY_ADDR));
760 if (!IS_ERR(task)) 764 if (!IS_ERR(task))
761 return; 765 return;
762 nfs4_clear_recover_bit(clp); 766 nfs4_clear_recover_bit(clp);
@@ -967,8 +971,8 @@ out:
967 module_put_and_exit(0); 971 module_put_and_exit(0);
968 return 0; 972 return 0;
969out_error: 973out_error:
970 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", 974 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
971 NIPQUAD(clp->cl_addr.sin_addr), -status); 975 " with error %d\n", clp->cl_hostname, -status);
972 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 976 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
973 goto out; 977 goto out;
974} 978}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 51dd3804866f..db1ed9c46ede 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -116,10 +116,12 @@ static int nfs4_stat_to_errno(int);
116#define decode_renew_maxsz (op_decode_hdr_maxsz) 116#define decode_renew_maxsz (op_decode_hdr_maxsz)
117#define encode_setclientid_maxsz \ 117#define encode_setclientid_maxsz \
118 (op_encode_hdr_maxsz + \ 118 (op_encode_hdr_maxsz + \
119 4 /*server->ip_addr*/ + \ 119 XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
120 1 /*Netid*/ + \ 120 XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
121 6 /*uaddr*/ + \ 121 1 /* sc_prog */ + \
122 6 + (NFS4_VERIFIER_SIZE >> 2)) 122 XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
123 XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
124 1) /* sc_cb_ident */
123#define decode_setclientid_maxsz \ 125#define decode_setclientid_maxsz \
124 (op_decode_hdr_maxsz + \ 126 (op_decode_hdr_maxsz + \
125 2 + \ 127 2 + \
@@ -2515,14 +2517,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2515 2517
2516static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) 2518static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2517{ 2519{
2518 int n; 2520 u32 n;
2519 __be32 *p; 2521 __be32 *p;
2520 int status = 0; 2522 int status = 0;
2521 2523
2522 READ_BUF(4); 2524 READ_BUF(4);
2523 READ32(n); 2525 READ32(n);
2524 if (n < 0)
2525 goto out_eio;
2526 if (n == 0) 2526 if (n == 0)
2527 goto root_path; 2527 goto root_path;
2528 dprintk("path "); 2528 dprintk("path ");
@@ -2579,13 +2579,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2579 goto out_eio; 2579 goto out_eio;
2580 res->nlocations = 0; 2580 res->nlocations = 0;
2581 while (res->nlocations < n) { 2581 while (res->nlocations < n) {
2582 int m; 2582 u32 m;
2583 struct nfs4_fs_location *loc = &res->locations[res->nlocations]; 2583 struct nfs4_fs_location *loc = &res->locations[res->nlocations];
2584 2584
2585 READ_BUF(4); 2585 READ_BUF(4);
2586 READ32(m); 2586 READ32(m);
2587 if (m <= 0)
2588 goto out_eio;
2589 2587
2590 loc->nservers = 0; 2588 loc->nservers = 0;
2591 dprintk("%s: servers ", __FUNCTION__); 2589 dprintk("%s: servers ", __FUNCTION__);
@@ -2598,8 +2596,12 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2598 if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) 2596 if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
2599 loc->nservers++; 2597 loc->nservers++;
2600 else { 2598 else {
2601 int i; 2599 unsigned int i;
2602 dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations); 2600 dprintk("%s: using first %u of %u servers "
2601 "returned for location %u\n",
2602 __FUNCTION__,
2603 NFS4_FS_LOCATION_MAXSERVERS,
2604 m, res->nlocations);
2603 for (i = loc->nservers; i < m; i++) { 2605 for (i = loc->nservers; i < m; i++) {
2604 unsigned int len; 2606 unsigned int len;
2605 char *data; 2607 char *data;
@@ -3476,10 +3478,11 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3476 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 3478 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
3477 struct page *page = *rcvbuf->pages; 3479 struct page *page = *rcvbuf->pages;
3478 struct kvec *iov = rcvbuf->head; 3480 struct kvec *iov = rcvbuf->head;
3479 unsigned int nr, pglen = rcvbuf->page_len; 3481 size_t hdrlen;
3482 u32 recvd, pglen = rcvbuf->page_len;
3480 __be32 *end, *entry, *p, *kaddr; 3483 __be32 *end, *entry, *p, *kaddr;
3481 uint32_t len, attrlen, xlen; 3484 unsigned int nr;
3482 int hdrlen, recvd, status; 3485 int status;
3483 3486
3484 status = decode_op_hdr(xdr, OP_READDIR); 3487 status = decode_op_hdr(xdr, OP_READDIR);
3485 if (status) 3488 if (status)
@@ -3503,6 +3506,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3503 end = p + ((pglen + readdir->pgbase) >> 2); 3506 end = p + ((pglen + readdir->pgbase) >> 2);
3504 entry = p; 3507 entry = p;
3505 for (nr = 0; *p++; nr++) { 3508 for (nr = 0; *p++; nr++) {
3509 u32 len, attrlen, xlen;
3506 if (end - p < 3) 3510 if (end - p < 3)
3507 goto short_pkt; 3511 goto short_pkt;
3508 dprintk("cookie = %Lu, ", *((unsigned long long *)p)); 3512 dprintk("cookie = %Lu, ", *((unsigned long long *)p));
@@ -3551,7 +3555,8 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
3551{ 3555{
3552 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 3556 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
3553 struct kvec *iov = rcvbuf->head; 3557 struct kvec *iov = rcvbuf->head;
3554 int hdrlen, len, recvd; 3558 size_t hdrlen;
3559 u32 len, recvd;
3555 __be32 *p; 3560 __be32 *p;
3556 char *kaddr; 3561 char *kaddr;
3557 int status; 3562 int status;
@@ -3646,7 +3651,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
3646 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) 3651 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
3647 return -EIO; 3652 return -EIO;
3648 if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { 3653 if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
3649 int hdrlen, recvd; 3654 size_t hdrlen;
3655 u32 recvd;
3650 3656
3651 /* We ignore &savep and don't do consistency checks on 3657 /* We ignore &savep and don't do consistency checks on
3652 * the attr length. Let userspace figure it out.... */ 3658 * the attr length. Let userspace figure it out.... */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2dff469f04fe..7f079209d70a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -58,7 +58,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
58 struct page *page, 58 struct page *page,
59 unsigned int offset, unsigned int count) 59 unsigned int offset, unsigned int count)
60{ 60{
61 struct nfs_server *server = NFS_SERVER(inode);
62 struct nfs_page *req; 61 struct nfs_page *req;
63 62
64 for (;;) { 63 for (;;) {
@@ -111,13 +110,14 @@ void nfs_unlock_request(struct nfs_page *req)
111 * nfs_set_page_tag_locked - Tag a request as locked 110 * nfs_set_page_tag_locked - Tag a request as locked
112 * @req: 111 * @req:
113 */ 112 */
114static int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
115{ 114{
116 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); 115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
117 116
118 if (!nfs_lock_request(req)) 117 if (!nfs_lock_request_dontget(req))
119 return 0; 118 return 0;
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 119 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 121 return 1;
122} 122}
123 123
@@ -132,9 +132,10 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
132 if (req->wb_page != NULL) { 132 if (req->wb_page != NULL) {
133 spin_lock(&inode->i_lock); 133 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req);
135 spin_unlock(&inode->i_lock); 136 spin_unlock(&inode->i_lock);
136 } 137 } else
137 nfs_unlock_request(req); 138 nfs_unlock_request(req);
138} 139}
139 140
140/** 141/**
@@ -413,6 +414,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
413 goto out; 414 goto out;
414 idx_start = req->wb_index + 1; 415 idx_start = req->wb_index + 1;
415 if (nfs_set_page_tag_locked(req)) { 416 if (nfs_set_page_tag_locked(req)) {
417 kref_get(&req->wb_kref);
416 nfs_list_remove_request(req); 418 nfs_list_remove_request(req);
417 radix_tree_tag_clear(&nfsi->nfs_page_tree, 419 radix_tree_tag_clear(&nfsi->nfs_page_tree,
418 req->wb_index, tag); 420 req->wb_index, tag);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4f80d88e9fee..5ccf7faee19c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -565,16 +565,9 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
565 return 0; 565 return 0;
566} 566}
567 567
568static void nfs_proc_read_setup(struct nfs_read_data *data) 568static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
569{ 569{
570 struct rpc_message msg = { 570 msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
571 .rpc_proc = &nfs_procedures[NFSPROC_READ],
572 .rpc_argp = &data->args,
573 .rpc_resp = &data->res,
574 .rpc_cred = data->cred,
575 };
576
577 rpc_call_setup(&data->task, &msg, 0);
578} 571}
579 572
580static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 573static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -584,24 +577,15 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
584 return 0; 577 return 0;
585} 578}
586 579
587static void nfs_proc_write_setup(struct nfs_write_data *data, int how) 580static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
588{ 581{
589 struct rpc_message msg = {
590 .rpc_proc = &nfs_procedures[NFSPROC_WRITE],
591 .rpc_argp = &data->args,
592 .rpc_resp = &data->res,
593 .rpc_cred = data->cred,
594 };
595
596 /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ 582 /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
597 data->args.stable = NFS_FILE_SYNC; 583 data->args.stable = NFS_FILE_SYNC;
598 584 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
599 /* Finalize the task. */
600 rpc_call_setup(&data->task, &msg, 0);
601} 585}
602 586
603static void 587static void
604nfs_proc_commit_setup(struct nfs_write_data *data, int how) 588nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
605{ 589{
606 BUG(); 590 BUG();
607} 591}
@@ -609,7 +593,9 @@ nfs_proc_commit_setup(struct nfs_write_data *data, int how)
609static int 593static int
610nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) 594nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
611{ 595{
612 return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl); 596 struct inode *inode = filp->f_path.dentry->d_inode;
597
598 return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
613} 599}
614 600
615 601
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 3dcaa6a73261..8fd6dfbe1bc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -160,12 +160,26 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
160 const struct rpc_call_ops *call_ops, 160 const struct rpc_call_ops *call_ops,
161 unsigned int count, unsigned int offset) 161 unsigned int count, unsigned int offset)
162{ 162{
163 struct inode *inode; 163 struct inode *inode = req->wb_context->path.dentry->d_inode;
164 int flags; 164 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
165 struct rpc_task *task;
166 struct rpc_message msg = {
167 .rpc_argp = &data->args,
168 .rpc_resp = &data->res,
169 .rpc_cred = req->wb_context->cred,
170 };
171 struct rpc_task_setup task_setup_data = {
172 .task = &data->task,
173 .rpc_client = NFS_CLIENT(inode),
174 .rpc_message = &msg,
175 .callback_ops = call_ops,
176 .callback_data = data,
177 .flags = RPC_TASK_ASYNC | swap_flags,
178 };
165 179
166 data->req = req; 180 data->req = req;
167 data->inode = inode = req->wb_context->path.dentry->d_inode; 181 data->inode = inode;
168 data->cred = req->wb_context->cred; 182 data->cred = msg.rpc_cred;
169 183
170 data->args.fh = NFS_FH(inode); 184 data->args.fh = NFS_FH(inode);
171 data->args.offset = req_offset(req) + offset; 185 data->args.offset = req_offset(req) + offset;
@@ -180,11 +194,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 nfs_fattr_init(&data->fattr); 194 nfs_fattr_init(&data->fattr);
181 195
182 /* Set up the initial task struct. */ 196 /* Set up the initial task struct. */
183 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); 197 NFS_PROTO(inode)->read_setup(data, &msg);
184 rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
185 NFS_PROTO(inode)->read_setup(data);
186
187 data->task.tk_cookie = (unsigned long)inode;
188 198
189 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", 199 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
190 data->task.tk_pid, 200 data->task.tk_pid,
@@ -192,6 +202,10 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
192 (long long)NFS_FILEID(inode), 202 (long long)NFS_FILEID(inode),
193 count, 203 count,
194 (unsigned long long)data->args.offset); 204 (unsigned long long)data->args.offset);
205
206 task = rpc_run_task(&task_setup_data);
207 if (!IS_ERR(task))
208 rpc_put_task(task);
195} 209}
196 210
197static void 211static void
@@ -208,14 +222,6 @@ nfs_async_read_error(struct list_head *head)
208} 222}
209 223
210/* 224/*
211 * Start an async read operation
212 */
213static void nfs_execute_read(struct nfs_read_data *data)
214{
215 rpc_execute(&data->task);
216}
217
218/*
219 * Generate multiple requests to fill a single page. 225 * Generate multiple requests to fill a single page.
220 * 226 *
221 * We optimize to reduce the number of read operations on the wire. If we 227 * We optimize to reduce the number of read operations on the wire. If we
@@ -269,7 +275,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
269 rsize, offset); 275 rsize, offset);
270 offset += rsize; 276 offset += rsize;
271 nbytes -= rsize; 277 nbytes -= rsize;
272 nfs_execute_read(data);
273 } while (nbytes != 0); 278 } while (nbytes != 0);
274 279
275 return 0; 280 return 0;
@@ -307,8 +312,6 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
307 req = nfs_list_entry(data->pages.next); 312 req = nfs_list_entry(data->pages.next);
308 313
309 nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 314 nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
310
311 nfs_execute_read(data);
312 return 0; 315 return 0;
313out_bad: 316out_bad:
314 nfs_async_read_error(head); 317 nfs_async_read_error(head);
@@ -333,7 +336,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
333 nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count); 336 nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
334 337
335 if (task->tk_status == -ESTALE) { 338 if (task->tk_status == -ESTALE) {
336 set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode)); 339 set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags);
337 nfs_mark_for_revalidate(data->inode); 340 nfs_mark_for_revalidate(data->inode);
338 } 341 }
339 return 0; 342 return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5b6339f70a4c..7f4505f6ac6f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -45,6 +45,8 @@
45#include <linux/nfs_idmap.h> 45#include <linux/nfs_idmap.h>
46#include <linux/vfs.h> 46#include <linux/vfs.h>
47#include <linux/inet.h> 47#include <linux/inet.h>
48#include <linux/in6.h>
49#include <net/ipv6.h>
48#include <linux/nfs_xdr.h> 50#include <linux/nfs_xdr.h>
49#include <linux/magic.h> 51#include <linux/magic.h>
50#include <linux/parser.h> 52#include <linux/parser.h>
@@ -83,11 +85,11 @@ enum {
83 Opt_actimeo, 85 Opt_actimeo,
84 Opt_namelen, 86 Opt_namelen,
85 Opt_mountport, 87 Opt_mountport,
86 Opt_mountprog, Opt_mountvers, 88 Opt_mountvers,
87 Opt_nfsprog, Opt_nfsvers, 89 Opt_nfsvers,
88 90
89 /* Mount options that take string arguments */ 91 /* Mount options that take string arguments */
90 Opt_sec, Opt_proto, Opt_mountproto, 92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
91 Opt_addr, Opt_mountaddr, Opt_clientaddr, 93 Opt_addr, Opt_mountaddr, Opt_clientaddr,
92 94
93 /* Mount options that are ignored */ 95 /* Mount options that are ignored */
@@ -137,9 +139,7 @@ static match_table_t nfs_mount_option_tokens = {
137 { Opt_userspace, "retry=%u" }, 139 { Opt_userspace, "retry=%u" },
138 { Opt_namelen, "namlen=%u" }, 140 { Opt_namelen, "namlen=%u" },
139 { Opt_mountport, "mountport=%u" }, 141 { Opt_mountport, "mountport=%u" },
140 { Opt_mountprog, "mountprog=%u" },
141 { Opt_mountvers, "mountvers=%u" }, 142 { Opt_mountvers, "mountvers=%u" },
142 { Opt_nfsprog, "nfsprog=%u" },
143 { Opt_nfsvers, "nfsvers=%u" }, 143 { Opt_nfsvers, "nfsvers=%u" },
144 { Opt_nfsvers, "vers=%u" }, 144 { Opt_nfsvers, "vers=%u" },
145 145
@@ -148,7 +148,7 @@ static match_table_t nfs_mount_option_tokens = {
148 { Opt_mountproto, "mountproto=%s" }, 148 { Opt_mountproto, "mountproto=%s" },
149 { Opt_addr, "addr=%s" }, 149 { Opt_addr, "addr=%s" },
150 { Opt_clientaddr, "clientaddr=%s" }, 150 { Opt_clientaddr, "clientaddr=%s" },
151 { Opt_userspace, "mounthost=%s" }, 151 { Opt_mounthost, "mounthost=%s" },
152 { Opt_mountaddr, "mountaddr=%s" }, 152 { Opt_mountaddr, "mountaddr=%s" },
153 153
154 { Opt_err, NULL } 154 { Opt_err, NULL }
@@ -202,6 +202,7 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
202static int nfs_xdev_get_sb(struct file_system_type *fs_type, 202static int nfs_xdev_get_sb(struct file_system_type *fs_type,
203 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 203 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
204static void nfs_kill_super(struct super_block *); 204static void nfs_kill_super(struct super_block *);
205static void nfs_put_super(struct super_block *);
205 206
206static struct file_system_type nfs_fs_type = { 207static struct file_system_type nfs_fs_type = {
207 .owner = THIS_MODULE, 208 .owner = THIS_MODULE,
@@ -223,6 +224,7 @@ static const struct super_operations nfs_sops = {
223 .alloc_inode = nfs_alloc_inode, 224 .alloc_inode = nfs_alloc_inode,
224 .destroy_inode = nfs_destroy_inode, 225 .destroy_inode = nfs_destroy_inode,
225 .write_inode = nfs_write_inode, 226 .write_inode = nfs_write_inode,
227 .put_super = nfs_put_super,
226 .statfs = nfs_statfs, 228 .statfs = nfs_statfs,
227 .clear_inode = nfs_clear_inode, 229 .clear_inode = nfs_clear_inode,
228 .umount_begin = nfs_umount_begin, 230 .umount_begin = nfs_umount_begin,
@@ -325,6 +327,28 @@ void __exit unregister_nfs_fs(void)
325 unregister_filesystem(&nfs_fs_type); 327 unregister_filesystem(&nfs_fs_type);
326} 328}
327 329
330void nfs_sb_active(struct nfs_server *server)
331{
332 atomic_inc(&server->active);
333}
334
335void nfs_sb_deactive(struct nfs_server *server)
336{
337 if (atomic_dec_and_test(&server->active))
338 wake_up(&server->active_wq);
339}
340
341static void nfs_put_super(struct super_block *sb)
342{
343 struct nfs_server *server = NFS_SB(sb);
344 /*
345 * Make sure there are no outstanding ops to this server.
346 * If so, wait for them to finish before allowing the
347 * unmount to continue.
348 */
349 wait_event(server->active_wq, atomic_read(&server->active) == 0);
350}
351
328/* 352/*
329 * Deliver file system statistics to userspace 353 * Deliver file system statistics to userspace
330 */ 354 */
@@ -454,8 +478,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
454 } 478 }
455 seq_printf(m, ",proto=%s", 479 seq_printf(m, ",proto=%s",
456 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); 480 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
457 seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ); 481 seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
458 seq_printf(m, ",retrans=%u", clp->retrans_count); 482 seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
459 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); 483 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
460} 484}
461 485
@@ -468,8 +492,9 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
468 492
469 nfs_show_mount_options(m, nfss, 0); 493 nfs_show_mount_options(m, nfss, 0);
470 494
471 seq_printf(m, ",addr="NIPQUAD_FMT, 495 seq_printf(m, ",addr=%s",
472 NIPQUAD(nfss->nfs_client->cl_addr.sin_addr)); 496 rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
497 RPC_DISPLAY_ADDR));
473 498
474 return 0; 499 return 0;
475} 500}
@@ -506,7 +531,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
506 seq_printf(m, ",namelen=%d", nfss->namelen); 531 seq_printf(m, ",namelen=%d", nfss->namelen);
507 532
508#ifdef CONFIG_NFS_V4 533#ifdef CONFIG_NFS_V4
509 if (nfss->nfs_client->cl_nfsversion == 4) { 534 if (nfss->nfs_client->rpc_ops->version == 4) {
510 seq_printf(m, "\n\tnfsv4:\t"); 535 seq_printf(m, "\n\tnfsv4:\t");
511 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 536 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
512 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 537 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
@@ -574,16 +599,40 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
574} 599}
575 600
576/* 601/*
577 * Sanity-check a server address provided by the mount command 602 * Set the port number in an address. Be agnostic about the address family.
603 */
604static void nfs_set_port(struct sockaddr *sap, unsigned short port)
605{
606 switch (sap->sa_family) {
607 case AF_INET: {
608 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
609 ap->sin_port = htons(port);
610 break;
611 }
612 case AF_INET6: {
613 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
614 ap->sin6_port = htons(port);
615 break;
616 }
617 }
618}
619
620/*
621 * Sanity-check a server address provided by the mount command.
622 *
623 * Address family must be initialized, and address must not be
624 * the ANY address for that family.
578 */ 625 */
579static int nfs_verify_server_address(struct sockaddr *addr) 626static int nfs_verify_server_address(struct sockaddr *addr)
580{ 627{
581 switch (addr->sa_family) { 628 switch (addr->sa_family) {
582 case AF_INET: { 629 case AF_INET: {
583 struct sockaddr_in *sa = (struct sockaddr_in *) addr; 630 struct sockaddr_in *sa = (struct sockaddr_in *)addr;
584 if (sa->sin_addr.s_addr != INADDR_ANY) 631 return sa->sin_addr.s_addr != INADDR_ANY;
585 return 1; 632 }
586 break; 633 case AF_INET6: {
634 struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
635 return !ipv6_addr_any(sa);
587 } 636 }
588 } 637 }
589 638
@@ -591,6 +640,40 @@ static int nfs_verify_server_address(struct sockaddr *addr)
591} 640}
592 641
593/* 642/*
643 * Parse string addresses passed in via a mount option,
644 * and construct a sockaddr based on the result.
645 *
646 * If address parsing fails, set the sockaddr's address
647 * family to AF_UNSPEC to force nfs_verify_server_address()
648 * to punt the mount.
649 */
650static void nfs_parse_server_address(char *value,
651 struct sockaddr *sap,
652 size_t *len)
653{
654 if (strchr(value, ':')) {
655 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
656 u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
657
658 ap->sin6_family = AF_INET6;
659 *len = sizeof(*ap);
660 if (in6_pton(value, -1, addr, '\0', NULL))
661 return;
662 } else {
663 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
664 u8 *addr = (u8 *)&ap->sin_addr.s_addr;
665
666 ap->sin_family = AF_INET;
667 *len = sizeof(*ap);
668 if (in4_pton(value, -1, addr, '\0', NULL))
669 return;
670 }
671
672 sap->sa_family = AF_UNSPEC;
673 *len = 0;
674}
675
676/*
594 * Error-check and convert a string of mount options from user space into 677 * Error-check and convert a string of mount options from user space into
595 * a data structure 678 * a data structure
596 */ 679 */
@@ -598,6 +681,7 @@ static int nfs_parse_mount_options(char *raw,
598 struct nfs_parsed_mount_data *mnt) 681 struct nfs_parsed_mount_data *mnt)
599{ 682{
600 char *p, *string; 683 char *p, *string;
684 unsigned short port = 0;
601 685
602 if (!raw) { 686 if (!raw) {
603 dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); 687 dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -697,7 +781,7 @@ static int nfs_parse_mount_options(char *raw,
697 return 0; 781 return 0;
698 if (option < 0 || option > 65535) 782 if (option < 0 || option > 65535)
699 return 0; 783 return 0;
700 mnt->nfs_server.address.sin_port = htons(option); 784 port = option;
701 break; 785 break;
702 case Opt_rsize: 786 case Opt_rsize:
703 if (match_int(args, &mnt->rsize)) 787 if (match_int(args, &mnt->rsize))
@@ -759,13 +843,6 @@ static int nfs_parse_mount_options(char *raw,
759 return 0; 843 return 0;
760 mnt->mount_server.port = option; 844 mnt->mount_server.port = option;
761 break; 845 break;
762 case Opt_mountprog:
763 if (match_int(args, &option))
764 return 0;
765 if (option < 0)
766 return 0;
767 mnt->mount_server.program = option;
768 break;
769 case Opt_mountvers: 846 case Opt_mountvers:
770 if (match_int(args, &option)) 847 if (match_int(args, &option))
771 return 0; 848 return 0;
@@ -773,13 +850,6 @@ static int nfs_parse_mount_options(char *raw,
773 return 0; 850 return 0;
774 mnt->mount_server.version = option; 851 mnt->mount_server.version = option;
775 break; 852 break;
776 case Opt_nfsprog:
777 if (match_int(args, &option))
778 return 0;
779 if (option < 0)
780 return 0;
781 mnt->nfs_server.program = option;
782 break;
783 case Opt_nfsvers: 853 case Opt_nfsvers:
784 if (match_int(args, &option)) 854 if (match_int(args, &option))
785 return 0; 855 return 0;
@@ -923,24 +993,32 @@ static int nfs_parse_mount_options(char *raw,
923 string = match_strdup(args); 993 string = match_strdup(args);
924 if (string == NULL) 994 if (string == NULL)
925 goto out_nomem; 995 goto out_nomem;
926 mnt->nfs_server.address.sin_family = AF_INET; 996 nfs_parse_server_address(string, (struct sockaddr *)
927 mnt->nfs_server.address.sin_addr.s_addr = 997 &mnt->nfs_server.address,
928 in_aton(string); 998 &mnt->nfs_server.addrlen);
929 kfree(string); 999 kfree(string);
930 break; 1000 break;
931 case Opt_clientaddr: 1001 case Opt_clientaddr:
932 string = match_strdup(args); 1002 string = match_strdup(args);
933 if (string == NULL) 1003 if (string == NULL)
934 goto out_nomem; 1004 goto out_nomem;
1005 kfree(mnt->client_address);
935 mnt->client_address = string; 1006 mnt->client_address = string;
936 break; 1007 break;
1008 case Opt_mounthost:
1009 string = match_strdup(args);
1010 if (string == NULL)
1011 goto out_nomem;
1012 kfree(mnt->mount_server.hostname);
1013 mnt->mount_server.hostname = string;
1014 break;
937 case Opt_mountaddr: 1015 case Opt_mountaddr:
938 string = match_strdup(args); 1016 string = match_strdup(args);
939 if (string == NULL) 1017 if (string == NULL)
940 goto out_nomem; 1018 goto out_nomem;
941 mnt->mount_server.address.sin_family = AF_INET; 1019 nfs_parse_server_address(string, (struct sockaddr *)
942 mnt->mount_server.address.sin_addr.s_addr = 1020 &mnt->mount_server.address,
943 in_aton(string); 1021 &mnt->mount_server.addrlen);
944 kfree(string); 1022 kfree(string);
945 break; 1023 break;
946 1024
@@ -953,6 +1031,8 @@ static int nfs_parse_mount_options(char *raw,
953 } 1031 }
954 } 1032 }
955 1033
1034 nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port);
1035
956 return 1; 1036 return 1;
957 1037
958out_nomem: 1038out_nomem:
@@ -983,7 +1063,8 @@ out_unknown:
983static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1063static int nfs_try_mount(struct nfs_parsed_mount_data *args,
984 struct nfs_fh *root_fh) 1064 struct nfs_fh *root_fh)
985{ 1065{
986 struct sockaddr_in sin; 1066 struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address;
1067 char *hostname;
987 int status; 1068 int status;
988 1069
989 if (args->mount_server.version == 0) { 1070 if (args->mount_server.version == 0) {
@@ -993,25 +1074,32 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
993 args->mount_server.version = NFS_MNT_VERSION; 1074 args->mount_server.version = NFS_MNT_VERSION;
994 } 1075 }
995 1076
1077 if (args->mount_server.hostname)
1078 hostname = args->mount_server.hostname;
1079 else
1080 hostname = args->nfs_server.hostname;
1081
996 /* 1082 /*
997 * Construct the mount server's address. 1083 * Construct the mount server's address.
998 */ 1084 */
999 if (args->mount_server.address.sin_addr.s_addr != INADDR_ANY) 1085 if (args->mount_server.address.ss_family == AF_UNSPEC) {
1000 sin = args->mount_server.address; 1086 memcpy(sap, &args->nfs_server.address,
1001 else 1087 args->nfs_server.addrlen);
1002 sin = args->nfs_server.address; 1088 args->mount_server.addrlen = args->nfs_server.addrlen;
1089 }
1090
1003 /* 1091 /*
1004 * autobind will be used if mount_server.port == 0 1092 * autobind will be used if mount_server.port == 0
1005 */ 1093 */
1006 sin.sin_port = htons(args->mount_server.port); 1094 nfs_set_port(sap, args->mount_server.port);
1007 1095
1008 /* 1096 /*
1009 * Now ask the mount server to map our export path 1097 * Now ask the mount server to map our export path
1010 * to a file handle. 1098 * to a file handle.
1011 */ 1099 */
1012 status = nfs_mount((struct sockaddr *) &sin, 1100 status = nfs_mount(sap,
1013 sizeof(sin), 1101 args->mount_server.addrlen,
1014 args->nfs_server.hostname, 1102 hostname,
1015 args->nfs_server.export_path, 1103 args->nfs_server.export_path,
1016 args->mount_server.version, 1104 args->mount_server.version,
1017 args->mount_server.protocol, 1105 args->mount_server.protocol,
@@ -1019,8 +1107,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1019 if (status == 0) 1107 if (status == 0)
1020 return 0; 1108 return 0;
1021 1109
1022 dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT 1110 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
1023 ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status); 1111 hostname, status);
1024 return status; 1112 return status;
1025} 1113}
1026 1114
@@ -1039,9 +1127,6 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1039 * 1127 *
1040 * + breaking back: trying proto=udp after proto=tcp, v2 after v3, 1128 * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
1041 * mountproto=tcp after mountproto=udp, and so on 1129 * mountproto=tcp after mountproto=udp, and so on
1042 *
1043 * XXX: as far as I can tell, changing the NFS program number is not
1044 * supported in the NFS client.
1045 */ 1130 */
1046static int nfs_validate_mount_data(void *options, 1131static int nfs_validate_mount_data(void *options,
1047 struct nfs_parsed_mount_data *args, 1132 struct nfs_parsed_mount_data *args,
@@ -1065,9 +1150,7 @@ static int nfs_validate_mount_data(void *options,
1065 args->acdirmin = 30; 1150 args->acdirmin = 30;
1066 args->acdirmax = 60; 1151 args->acdirmax = 60;
1067 args->mount_server.protocol = XPRT_TRANSPORT_UDP; 1152 args->mount_server.protocol = XPRT_TRANSPORT_UDP;
1068 args->mount_server.program = NFS_MNT_PROGRAM;
1069 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1153 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1070 args->nfs_server.program = NFS_PROGRAM;
1071 1154
1072 switch (data->version) { 1155 switch (data->version) {
1073 case 1: 1156 case 1:
@@ -1098,9 +1181,6 @@ static int nfs_validate_mount_data(void *options,
1098 memset(mntfh->data + mntfh->size, 0, 1181 memset(mntfh->data + mntfh->size, 0,
1099 sizeof(mntfh->data) - mntfh->size); 1182 sizeof(mntfh->data) - mntfh->size);
1100 1183
1101 if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
1102 goto out_no_address;
1103
1104 /* 1184 /*
1105 * Translate to nfs_parsed_mount_data, which nfs_fill_super 1185 * Translate to nfs_parsed_mount_data, which nfs_fill_super
1106 * can deal with. 1186 * can deal with.
@@ -1115,7 +1195,14 @@ static int nfs_validate_mount_data(void *options,
1115 args->acregmax = data->acregmax; 1195 args->acregmax = data->acregmax;
1116 args->acdirmin = data->acdirmin; 1196 args->acdirmin = data->acdirmin;
1117 args->acdirmax = data->acdirmax; 1197 args->acdirmax = data->acdirmax;
1118 args->nfs_server.address = data->addr; 1198
1199 memcpy(&args->nfs_server.address, &data->addr,
1200 sizeof(data->addr));
1201 args->nfs_server.addrlen = sizeof(data->addr);
1202 if (!nfs_verify_server_address((struct sockaddr *)
1203 &args->nfs_server.address))
1204 goto out_no_address;
1205
1119 if (!(data->flags & NFS_MOUNT_TCP)) 1206 if (!(data->flags & NFS_MOUNT_TCP))
1120 args->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1207 args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1121 /* N.B. caller will free nfs_server.hostname in all cases */ 1208 /* N.B. caller will free nfs_server.hostname in all cases */
@@ -1318,15 +1405,50 @@ static int nfs_set_super(struct super_block *s, void *data)
1318 return ret; 1405 return ret;
1319} 1406}
1320 1407
1408static int nfs_compare_super_address(struct nfs_server *server1,
1409 struct nfs_server *server2)
1410{
1411 struct sockaddr *sap1, *sap2;
1412
1413 sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
1414 sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
1415
1416 if (sap1->sa_family != sap2->sa_family)
1417 return 0;
1418
1419 switch (sap1->sa_family) {
1420 case AF_INET: {
1421 struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1;
1422 struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2;
1423 if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr)
1424 return 0;
1425 if (sin1->sin_port != sin2->sin_port)
1426 return 0;
1427 break;
1428 }
1429 case AF_INET6: {
1430 struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1;
1431 struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2;
1432 if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
1433 return 0;
1434 if (sin1->sin6_port != sin2->sin6_port)
1435 return 0;
1436 break;
1437 }
1438 default:
1439 return 0;
1440 }
1441
1442 return 1;
1443}
1444
1321static int nfs_compare_super(struct super_block *sb, void *data) 1445static int nfs_compare_super(struct super_block *sb, void *data)
1322{ 1446{
1323 struct nfs_sb_mountdata *sb_mntdata = data; 1447 struct nfs_sb_mountdata *sb_mntdata = data;
1324 struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); 1448 struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
1325 int mntflags = sb_mntdata->mntflags; 1449 int mntflags = sb_mntdata->mntflags;
1326 1450
1327 if (memcmp(&old->nfs_client->cl_addr, 1451 if (!nfs_compare_super_address(old, server))
1328 &server->nfs_client->cl_addr,
1329 sizeof(old->nfs_client->cl_addr)) != 0)
1330 return 0; 1452 return 0;
1331 /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ 1453 /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
1332 if (old->flags & NFS_MOUNT_UNSHARED) 1454 if (old->flags & NFS_MOUNT_UNSHARED)
@@ -1396,6 +1518,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1396 1518
1397out: 1519out:
1398 kfree(data.nfs_server.hostname); 1520 kfree(data.nfs_server.hostname);
1521 kfree(data.mount_server.hostname);
1399 return error; 1522 return error;
1400 1523
1401out_err_nosb: 1524out_err_nosb:
@@ -1471,7 +1594,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
1471 error = PTR_ERR(mntroot); 1594 error = PTR_ERR(mntroot);
1472 goto error_splat_super; 1595 goto error_splat_super;
1473 } 1596 }
1474 if (mntroot->d_inode->i_op != &nfs_dir_inode_operations) { 1597 if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
1475 dput(mntroot); 1598 dput(mntroot);
1476 error = -ESTALE; 1599 error = -ESTALE;
1477 goto error_splat_super; 1600 goto error_splat_super;
@@ -1524,12 +1647,35 @@ static void nfs4_fill_super(struct super_block *sb)
1524} 1647}
1525 1648
1526/* 1649/*
1650 * If the user didn't specify a port, set the port number to
1651 * the NFS version 4 default port.
1652 */
1653static void nfs4_default_port(struct sockaddr *sap)
1654{
1655 switch (sap->sa_family) {
1656 case AF_INET: {
1657 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
1658 if (ap->sin_port == 0)
1659 ap->sin_port = htons(NFS_PORT);
1660 break;
1661 }
1662 case AF_INET6: {
1663 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
1664 if (ap->sin6_port == 0)
1665 ap->sin6_port = htons(NFS_PORT);
1666 break;
1667 }
1668 }
1669}
1670
1671/*
1527 * Validate NFSv4 mount options 1672 * Validate NFSv4 mount options
1528 */ 1673 */
1529static int nfs4_validate_mount_data(void *options, 1674static int nfs4_validate_mount_data(void *options,
1530 struct nfs_parsed_mount_data *args, 1675 struct nfs_parsed_mount_data *args,
1531 const char *dev_name) 1676 const char *dev_name)
1532{ 1677{
1678 struct sockaddr_in *ap;
1533 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; 1679 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
1534 char *c; 1680 char *c;
1535 1681
@@ -1550,18 +1696,21 @@ static int nfs4_validate_mount_data(void *options,
1550 1696
1551 switch (data->version) { 1697 switch (data->version) {
1552 case 1: 1698 case 1:
1553 if (data->host_addrlen != sizeof(args->nfs_server.address)) 1699 ap = (struct sockaddr_in *)&args->nfs_server.address;
1700 if (data->host_addrlen > sizeof(args->nfs_server.address))
1701 goto out_no_address;
1702 if (data->host_addrlen == 0)
1554 goto out_no_address; 1703 goto out_no_address;
1555 if (copy_from_user(&args->nfs_server.address, 1704 args->nfs_server.addrlen = data->host_addrlen;
1556 data->host_addr, 1705 if (copy_from_user(ap, data->host_addr, data->host_addrlen))
1557 sizeof(args->nfs_server.address)))
1558 return -EFAULT; 1706 return -EFAULT;
1559 if (args->nfs_server.address.sin_port == 0)
1560 args->nfs_server.address.sin_port = htons(NFS_PORT);
1561 if (!nfs_verify_server_address((struct sockaddr *) 1707 if (!nfs_verify_server_address((struct sockaddr *)
1562 &args->nfs_server.address)) 1708 &args->nfs_server.address))
1563 goto out_no_address; 1709 goto out_no_address;
1564 1710
1711 nfs4_default_port((struct sockaddr *)
1712 &args->nfs_server.address);
1713
1565 switch (data->auth_flavourlen) { 1714 switch (data->auth_flavourlen) {
1566 case 0: 1715 case 0:
1567 args->auth_flavors[0] = RPC_AUTH_UNIX; 1716 args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1619,6 +1768,9 @@ static int nfs4_validate_mount_data(void *options,
1619 &args->nfs_server.address)) 1768 &args->nfs_server.address))
1620 return -EINVAL; 1769 return -EINVAL;
1621 1770
1771 nfs4_default_port((struct sockaddr *)
1772 &args->nfs_server.address);
1773
1622 switch (args->auth_flavor_len) { 1774 switch (args->auth_flavor_len) {
1623 case 0: 1775 case 0:
1624 args->auth_flavors[0] = RPC_AUTH_UNIX; 1776 args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1639,21 +1791,16 @@ static int nfs4_validate_mount_data(void *options,
1639 len = c - dev_name; 1791 len = c - dev_name;
1640 if (len > NFS4_MAXNAMLEN) 1792 if (len > NFS4_MAXNAMLEN)
1641 return -ENAMETOOLONG; 1793 return -ENAMETOOLONG;
1642 args->nfs_server.hostname = kzalloc(len, GFP_KERNEL); 1794 /* N.B. caller will free nfs_server.hostname in all cases */
1643 if (args->nfs_server.hostname == NULL) 1795 args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
1644 return -ENOMEM;
1645 strncpy(args->nfs_server.hostname, dev_name, len - 1);
1646 1796
1647 c++; /* step over the ':' */ 1797 c++; /* step over the ':' */
1648 len = strlen(c); 1798 len = strlen(c);
1649 if (len > NFS4_MAXPATHLEN) 1799 if (len > NFS4_MAXPATHLEN)
1650 return -ENAMETOOLONG; 1800 return -ENAMETOOLONG;
1651 args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL); 1801 args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
1652 if (args->nfs_server.export_path == NULL)
1653 return -ENOMEM;
1654 strncpy(args->nfs_server.export_path, c, len);
1655 1802
1656 dprintk("MNTPATH: %s\n", args->nfs_server.export_path); 1803 dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
1657 1804
1658 if (args->client_address == NULL) 1805 if (args->client_address == NULL)
1659 goto out_no_client_address; 1806 goto out_no_client_address;
@@ -1822,6 +1969,11 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
1822 error = PTR_ERR(mntroot); 1969 error = PTR_ERR(mntroot);
1823 goto error_splat_super; 1970 goto error_splat_super;
1824 } 1971 }
1972 if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
1973 dput(mntroot);
1974 error = -ESTALE;
1975 goto error_splat_super;
1976 }
1825 1977
1826 s->s_flags |= MS_ACTIVE; 1978 s->s_flags |= MS_ACTIVE;
1827 mnt->mnt_sb = s; 1979 mnt->mnt_sb = s;
@@ -1896,6 +2048,11 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
1896 error = PTR_ERR(mntroot); 2048 error = PTR_ERR(mntroot);
1897 goto error_splat_super; 2049 goto error_splat_super;
1898 } 2050 }
2051 if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
2052 dput(mntroot);
2053 error = -ESTALE;
2054 goto error_splat_super;
2055 }
1899 2056
1900 s->s_flags |= MS_ACTIVE; 2057 s->s_flags |= MS_ACTIVE;
1901 mnt->mnt_sb = s; 2058 mnt->mnt_sb = s;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 233ad38161f9..757415363422 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,8 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16 16
17#include "internal.h"
18
17struct nfs_unlinkdata { 19struct nfs_unlinkdata {
18 struct hlist_node list; 20 struct hlist_node list;
19 struct nfs_removeargs args; 21 struct nfs_removeargs args;
@@ -69,24 +71,6 @@ static void nfs_dec_sillycount(struct inode *dir)
69} 71}
70 72
71/** 73/**
72 * nfs_async_unlink_init - Initialize the RPC info
73 * task: rpc_task of the sillydelete
74 */
75static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
76{
77 struct nfs_unlinkdata *data = calldata;
78 struct inode *dir = data->dir;
79 struct rpc_message msg = {
80 .rpc_argp = &data->args,
81 .rpc_resp = &data->res,
82 .rpc_cred = data->cred,
83 };
84
85 NFS_PROTO(dir)->unlink_setup(&msg, dir);
86 rpc_call_setup(task, &msg, 0);
87}
88
89/**
90 * nfs_async_unlink_done - Sillydelete post-processing 74 * nfs_async_unlink_done - Sillydelete post-processing
91 * @task: rpc_task of the sillydelete 75 * @task: rpc_task of the sillydelete
92 * 76 *
@@ -113,32 +97,45 @@ static void nfs_async_unlink_release(void *calldata)
113 struct nfs_unlinkdata *data = calldata; 97 struct nfs_unlinkdata *data = calldata;
114 98
115 nfs_dec_sillycount(data->dir); 99 nfs_dec_sillycount(data->dir);
100 nfs_sb_deactive(NFS_SERVER(data->dir));
116 nfs_free_unlinkdata(data); 101 nfs_free_unlinkdata(data);
117} 102}
118 103
119static const struct rpc_call_ops nfs_unlink_ops = { 104static const struct rpc_call_ops nfs_unlink_ops = {
120 .rpc_call_prepare = nfs_async_unlink_init,
121 .rpc_call_done = nfs_async_unlink_done, 105 .rpc_call_done = nfs_async_unlink_done,
122 .rpc_release = nfs_async_unlink_release, 106 .rpc_release = nfs_async_unlink_release,
123}; 107};
124 108
125static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) 109static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
126{ 110{
111 struct rpc_message msg = {
112 .rpc_argp = &data->args,
113 .rpc_resp = &data->res,
114 .rpc_cred = data->cred,
115 };
116 struct rpc_task_setup task_setup_data = {
117 .rpc_message = &msg,
118 .callback_ops = &nfs_unlink_ops,
119 .callback_data = data,
120 .flags = RPC_TASK_ASYNC,
121 };
127 struct rpc_task *task; 122 struct rpc_task *task;
128 struct dentry *alias; 123 struct dentry *alias;
129 124
130 alias = d_lookup(parent, &data->args.name); 125 alias = d_lookup(parent, &data->args.name);
131 if (alias != NULL) { 126 if (alias != NULL) {
132 int ret = 0; 127 int ret = 0;
128
133 /* 129 /*
134 * Hey, we raced with lookup... See if we need to transfer 130 * Hey, we raced with lookup... See if we need to transfer
135 * the sillyrename information to the aliased dentry. 131 * the sillyrename information to the aliased dentry.
136 */ 132 */
137 nfs_free_dname(data); 133 nfs_free_dname(data);
138 spin_lock(&alias->d_lock); 134 spin_lock(&alias->d_lock);
139 if (!(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 135 if (alias->d_inode != NULL &&
136 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
140 alias->d_fsdata = data; 137 alias->d_fsdata = data;
141 alias->d_flags ^= DCACHE_NFSFS_RENAMED; 138 alias->d_flags |= DCACHE_NFSFS_RENAMED;
142 ret = 1; 139 ret = 1;
143 } 140 }
144 spin_unlock(&alias->d_lock); 141 spin_unlock(&alias->d_lock);
@@ -151,10 +148,14 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
151 nfs_dec_sillycount(dir); 148 nfs_dec_sillycount(dir);
152 return 0; 149 return 0;
153 } 150 }
151 nfs_sb_active(NFS_SERVER(dir));
154 data->args.fh = NFS_FH(dir); 152 data->args.fh = NFS_FH(dir);
155 nfs_fattr_init(&data->res.dir_attr); 153 nfs_fattr_init(&data->res.dir_attr);
156 154
157 task = rpc_run_task(NFS_CLIENT(dir), RPC_TASK_ASYNC, &nfs_unlink_ops, data); 155 NFS_PROTO(dir)->unlink_setup(&msg, dir);
156
157 task_setup_data.rpc_client = NFS_CLIENT(dir);
158 task = rpc_run_task(&task_setup_data);
158 if (!IS_ERR(task)) 159 if (!IS_ERR(task))
159 rpc_put_task(task); 160 rpc_put_task(task);
160 return 1; 161 return 1;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 60e3e870ada4..522efff3e2c5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -196,7 +196,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
196 } 196 }
197 /* Update file length */ 197 /* Update file length */
198 nfs_grow_file(page, offset, count); 198 nfs_grow_file(page, offset, count);
199 nfs_unlock_request(req); 199 nfs_clear_page_tag_locked(req);
200 return 0; 200 return 0;
201} 201}
202 202
@@ -252,7 +252,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
252 struct page *page) 252 struct page *page)
253{ 253{
254 struct inode *inode = page->mapping->host; 254 struct inode *inode = page->mapping->host;
255 struct nfs_inode *nfsi = NFS_I(inode);
256 struct nfs_page *req; 255 struct nfs_page *req;
257 int ret; 256 int ret;
258 257
@@ -263,10 +262,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
263 spin_unlock(&inode->i_lock); 262 spin_unlock(&inode->i_lock);
264 return 0; 263 return 0;
265 } 264 }
266 if (nfs_lock_request_dontget(req)) 265 if (nfs_set_page_tag_locked(req))
267 break; 266 break;
268 /* Note: If we hold the page lock, as is the case in nfs_writepage, 267 /* Note: If we hold the page lock, as is the case in nfs_writepage,
269 * then the call to nfs_lock_request_dontget() will always 268 * then the call to nfs_set_page_tag_locked() will always
270 * succeed provided that someone hasn't already marked the 269 * succeed provided that someone hasn't already marked the
271 * request as dirty (in which case we don't care). 270 * request as dirty (in which case we don't care).
272 */ 271 */
@@ -280,7 +279,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
280 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 279 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
281 /* This request is marked for commit */ 280 /* This request is marked for commit */
282 spin_unlock(&inode->i_lock); 281 spin_unlock(&inode->i_lock);
283 nfs_unlock_request(req); 282 nfs_clear_page_tag_locked(req);
284 nfs_pageio_complete(pgio); 283 nfs_pageio_complete(pgio);
285 return 0; 284 return 0;
286 } 285 }
@@ -288,8 +287,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
288 spin_unlock(&inode->i_lock); 287 spin_unlock(&inode->i_lock);
289 BUG(); 288 BUG();
290 } 289 }
291 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
292 NFS_PAGE_TAG_LOCKED);
293 spin_unlock(&inode->i_lock); 290 spin_unlock(&inode->i_lock);
294 nfs_pageio_add_request(pgio, req); 291 nfs_pageio_add_request(pgio, req);
295 return 0; 292 return 0;
@@ -381,6 +378,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
381 set_page_private(req->wb_page, (unsigned long)req); 378 set_page_private(req->wb_page, (unsigned long)req);
382 nfsi->npages++; 379 nfsi->npages++;
383 kref_get(&req->wb_kref); 380 kref_get(&req->wb_kref);
381 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
384 return 0; 382 return 0;
385} 383}
386 384
@@ -596,7 +594,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
596 spin_lock(&inode->i_lock); 594 spin_lock(&inode->i_lock);
597 req = nfs_page_find_request_locked(page); 595 req = nfs_page_find_request_locked(page);
598 if (req) { 596 if (req) {
599 if (!nfs_lock_request_dontget(req)) { 597 if (!nfs_set_page_tag_locked(req)) {
600 int error; 598 int error;
601 599
602 spin_unlock(&inode->i_lock); 600 spin_unlock(&inode->i_lock);
@@ -646,7 +644,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
646 || req->wb_page != page 644 || req->wb_page != page
647 || !nfs_dirty_request(req) 645 || !nfs_dirty_request(req)
648 || offset > rqend || end < req->wb_offset) { 646 || offset > rqend || end < req->wb_offset) {
649 nfs_unlock_request(req); 647 nfs_clear_page_tag_locked(req);
650 return ERR_PTR(-EBUSY); 648 return ERR_PTR(-EBUSY);
651 } 649 }
652 650
@@ -755,7 +753,7 @@ static void nfs_writepage_release(struct nfs_page *req)
755 nfs_clear_page_tag_locked(req); 753 nfs_clear_page_tag_locked(req);
756} 754}
757 755
758static inline int flush_task_priority(int how) 756static int flush_task_priority(int how)
759{ 757{
760 switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { 758 switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
761 case FLUSH_HIGHPRI: 759 case FLUSH_HIGHPRI:
@@ -775,15 +773,31 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
775 unsigned int count, unsigned int offset, 773 unsigned int count, unsigned int offset,
776 int how) 774 int how)
777{ 775{
778 struct inode *inode; 776 struct inode *inode = req->wb_context->path.dentry->d_inode;
779 int flags; 777 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
778 int priority = flush_task_priority(how);
779 struct rpc_task *task;
780 struct rpc_message msg = {
781 .rpc_argp = &data->args,
782 .rpc_resp = &data->res,
783 .rpc_cred = req->wb_context->cred,
784 };
785 struct rpc_task_setup task_setup_data = {
786 .rpc_client = NFS_CLIENT(inode),
787 .task = &data->task,
788 .rpc_message = &msg,
789 .callback_ops = call_ops,
790 .callback_data = data,
791 .flags = flags,
792 .priority = priority,
793 };
780 794
781 /* Set up the RPC argument and reply structs 795 /* Set up the RPC argument and reply structs
782 * NB: take care not to mess about with data->commit et al. */ 796 * NB: take care not to mess about with data->commit et al. */
783 797
784 data->req = req; 798 data->req = req;
785 data->inode = inode = req->wb_context->path.dentry->d_inode; 799 data->inode = inode = req->wb_context->path.dentry->d_inode;
786 data->cred = req->wb_context->cred; 800 data->cred = msg.rpc_cred;
787 801
788 data->args.fh = NFS_FH(inode); 802 data->args.fh = NFS_FH(inode);
789 data->args.offset = req_offset(req) + offset; 803 data->args.offset = req_offset(req) + offset;
@@ -791,6 +805,12 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
791 data->args.pages = data->pagevec; 805 data->args.pages = data->pagevec;
792 data->args.count = count; 806 data->args.count = count;
793 data->args.context = req->wb_context; 807 data->args.context = req->wb_context;
808 data->args.stable = NFS_UNSTABLE;
809 if (how & FLUSH_STABLE) {
810 data->args.stable = NFS_DATA_SYNC;
811 if (!NFS_I(inode)->ncommit)
812 data->args.stable = NFS_FILE_SYNC;
813 }
794 814
795 data->res.fattr = &data->fattr; 815 data->res.fattr = &data->fattr;
796 data->res.count = count; 816 data->res.count = count;
@@ -798,12 +818,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
798 nfs_fattr_init(&data->fattr); 818 nfs_fattr_init(&data->fattr);
799 819
800 /* Set up the initial task struct. */ 820 /* Set up the initial task struct. */
801 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 821 NFS_PROTO(inode)->write_setup(data, &msg);
802 rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
803 NFS_PROTO(inode)->write_setup(data, how);
804
805 data->task.tk_priority = flush_task_priority(how);
806 data->task.tk_cookie = (unsigned long)inode;
807 822
808 dprintk("NFS: %5u initiated write call " 823 dprintk("NFS: %5u initiated write call "
809 "(req %s/%Ld, %u bytes @ offset %Lu)\n", 824 "(req %s/%Ld, %u bytes @ offset %Lu)\n",
@@ -812,11 +827,10 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
812 (long long)NFS_FILEID(inode), 827 (long long)NFS_FILEID(inode),
813 count, 828 count,
814 (unsigned long long)data->args.offset); 829 (unsigned long long)data->args.offset);
815}
816 830
817static void nfs_execute_write(struct nfs_write_data *data) 831 task = rpc_run_task(&task_setup_data);
818{ 832 if (!IS_ERR(task))
819 rpc_execute(&data->task); 833 rpc_put_task(task);
820} 834}
821 835
822/* 836/*
@@ -863,7 +877,6 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
863 wsize, offset, how); 877 wsize, offset, how);
864 offset += wsize; 878 offset += wsize;
865 nbytes -= wsize; 879 nbytes -= wsize;
866 nfs_execute_write(data);
867 } while (nbytes != 0); 880 } while (nbytes != 0);
868 881
869 return 0; 882 return 0;
@@ -911,7 +924,6 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
911 /* Set up the argument struct */ 924 /* Set up the argument struct */
912 nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 925 nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
913 926
914 nfs_execute_write(data);
915 return 0; 927 return 0;
916 out_bad: 928 out_bad:
917 while (!list_empty(head)) { 929 while (!list_empty(head)) {
@@ -927,7 +939,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
927static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 939static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
928 struct inode *inode, int ioflags) 940 struct inode *inode, int ioflags)
929{ 941{
930 int wsize = NFS_SERVER(inode)->wsize; 942 size_t wsize = NFS_SERVER(inode)->wsize;
931 943
932 if (wsize < PAGE_CACHE_SIZE) 944 if (wsize < PAGE_CACHE_SIZE)
933 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 945 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
@@ -1141,19 +1153,33 @@ static void nfs_commit_rpcsetup(struct list_head *head,
1141 struct nfs_write_data *data, 1153 struct nfs_write_data *data,
1142 int how) 1154 int how)
1143{ 1155{
1144 struct nfs_page *first; 1156 struct nfs_page *first = nfs_list_entry(head->next);
1145 struct inode *inode; 1157 struct inode *inode = first->wb_context->path.dentry->d_inode;
1146 int flags; 1158 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
1159 int priority = flush_task_priority(how);
1160 struct rpc_task *task;
1161 struct rpc_message msg = {
1162 .rpc_argp = &data->args,
1163 .rpc_resp = &data->res,
1164 .rpc_cred = first->wb_context->cred,
1165 };
1166 struct rpc_task_setup task_setup_data = {
1167 .task = &data->task,
1168 .rpc_client = NFS_CLIENT(inode),
1169 .rpc_message = &msg,
1170 .callback_ops = &nfs_commit_ops,
1171 .callback_data = data,
1172 .flags = flags,
1173 .priority = priority,
1174 };
1147 1175
1148 /* Set up the RPC argument and reply structs 1176 /* Set up the RPC argument and reply structs
1149 * NB: take care not to mess about with data->commit et al. */ 1177 * NB: take care not to mess about with data->commit et al. */
1150 1178
1151 list_splice_init(head, &data->pages); 1179 list_splice_init(head, &data->pages);
1152 first = nfs_list_entry(data->pages.next);
1153 inode = first->wb_context->path.dentry->d_inode;
1154 1180
1155 data->inode = inode; 1181 data->inode = inode;
1156 data->cred = first->wb_context->cred; 1182 data->cred = msg.rpc_cred;
1157 1183
1158 data->args.fh = NFS_FH(data->inode); 1184 data->args.fh = NFS_FH(data->inode);
1159 /* Note: we always request a commit of the entire inode */ 1185 /* Note: we always request a commit of the entire inode */
@@ -1165,14 +1191,13 @@ static void nfs_commit_rpcsetup(struct list_head *head,
1165 nfs_fattr_init(&data->fattr); 1191 nfs_fattr_init(&data->fattr);
1166 1192
1167 /* Set up the initial task struct. */ 1193 /* Set up the initial task struct. */
1168 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 1194 NFS_PROTO(inode)->commit_setup(data, &msg);
1169 rpc_init_task(&data->task, NFS_CLIENT(inode), flags, &nfs_commit_ops, data);
1170 NFS_PROTO(inode)->commit_setup(data, how);
1171 1195
1172 data->task.tk_priority = flush_task_priority(how);
1173 data->task.tk_cookie = (unsigned long)inode;
1174
1175 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1196 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1197
1198 task = rpc_run_task(&task_setup_data);
1199 if (!IS_ERR(task))
1200 rpc_put_task(task);
1176} 1201}
1177 1202
1178/* 1203/*
@@ -1192,7 +1217,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1192 /* Set up the argument struct */ 1217 /* Set up the argument struct */
1193 nfs_commit_rpcsetup(head, data, how); 1218 nfs_commit_rpcsetup(head, data, how);
1194 1219
1195 nfs_execute_write(data);
1196 return 0; 1220 return 0;
1197 out_bad: 1221 out_bad:
1198 while (!list_empty(head)) { 1222 while (!list_empty(head)) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2d116d2298f8..f917fd25858a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -388,8 +388,11 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
388 * Round the length of the data which was specified up to 388 * Round the length of the data which was specified up to
389 * the next multiple of XDR units and then compare that 389 * the next multiple of XDR units and then compare that
390 * against the length which was actually received. 390 * against the length which was actually received.
391 * Note that when RPCSEC/GSS (for example) is used, the
392 * data buffer can be padded so dlen might be larger
393 * than required. It must never be smaller.
391 */ 394 */
392 if (dlen != XDR_QUADLEN(len)*4) 395 if (dlen < XDR_QUADLEN(len)*4)
393 return 0; 396 return 0;
394 397
395 if (args->count > max_blocksize) { 398 if (args->count > max_blocksize) {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 986f9b32083c..b86e3658a0af 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -313,8 +313,11 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
313 * Round the length of the data which was specified up to 313 * Round the length of the data which was specified up to
314 * the next multiple of XDR units and then compare that 314 * the next multiple of XDR units and then compare that
315 * against the length which was actually received. 315 * against the length which was actually received.
316 * Note that when RPCSEC/GSS (for example) is used, the
317 * data buffer can be padded so dlen might be larger
318 * than required. It must never be smaller.
316 */ 319 */
317 if (dlen != XDR_QUADLEN(len)*4) 320 if (dlen < XDR_QUADLEN(len)*4)
318 return 0; 321 return 0;
319 322
320 rqstp->rq_vec[0].iov_base = (void*)p; 323 rqstp->rq_vec[0].iov_base = (void*)p;
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
19 ioctl.o \ 19 ioctl.o \
20 journal.o \ 20 journal.o \
21 localalloc.o \ 21 localalloc.o \
22 locks.o \
22 mmap.o \ 23 mmap.o \
23 namei.o \ 24 namei.o \
25 resize.o \
24 slot_map.o \ 26 slot_map.o \
25 suballoc.o \ 27 suballoc.o \
26 super.o \ 28 super.o \
27 symlink.o \ 29 symlink.o \
28 sysfile.o \ 30 sysfile.o \
29 uptodate.o \ 31 uptodate.o \
30 ver.o \ 32 ver.o
31 vote.o
32 33
33obj-$(CONFIG_OCFS2_FS) += cluster/ 34obj-$(CONFIG_OCFS2_FS) += cluster/
34obj-$(CONFIG_OCFS2_FS) += dlm/ 35obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ce62c152823d..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2389,6 +2389,18 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2389 goto out; 2389 goto out;
2390 } 2390 }
2391 2391
2392 /*
2393 * Caller might still want to make changes to the
2394 * tree root, so re-add it to the journal here.
2395 */
2396 ret = ocfs2_journal_access(handle, inode,
2397 path_root_bh(left_path),
2398 OCFS2_JOURNAL_ACCESS_WRITE);
2399 if (ret) {
2400 mlog_errno(ret);
2401 goto out;
2402 }
2403
2392 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2404 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2393 right_path, subtree_root, 2405 right_path, subtree_root,
2394 dealloc, &deleted); 2406 dealloc, &deleted);
@@ -3289,16 +3301,6 @@ static int ocfs2_insert_path(struct inode *inode,
3289 int ret, subtree_index; 3301 int ret, subtree_index;
3290 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3302 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3291 3303
3292 /*
3293 * Pass both paths to the journal. The majority of inserts
3294 * will be touching all components anyway.
3295 */
3296 ret = ocfs2_journal_access_path(inode, handle, right_path);
3297 if (ret < 0) {
3298 mlog_errno(ret);
3299 goto out;
3300 }
3301
3302 if (left_path) { 3304 if (left_path) {
3303 int credits = handle->h_buffer_credits; 3305 int credits = handle->h_buffer_credits;
3304 3306
@@ -3323,6 +3325,16 @@ static int ocfs2_insert_path(struct inode *inode,
3323 } 3325 }
3324 } 3326 }
3325 3327
3328 /*
3329 * Pass both paths to the journal. The majority of inserts
3330 * will be touching all components anyway.
3331 */
3332 ret = ocfs2_journal_access_path(inode, handle, right_path);
3333 if (ret < 0) {
3334 mlog_errno(ret);
3335 goto out;
3336 }
3337
3326 if (insert->ins_split != SPLIT_NONE) { 3338 if (insert->ins_split != SPLIT_NONE) {
3327 /* 3339 /*
3328 * We could call ocfs2_insert_at_leaf() for some types 3340 * We could call ocfs2_insert_at_leaf() for some types
@@ -3331,6 +3343,17 @@ static int ocfs2_insert_path(struct inode *inode,
3331 */ 3343 */
3332 ocfs2_split_record(inode, left_path, right_path, 3344 ocfs2_split_record(inode, left_path, right_path,
3333 insert_rec, insert->ins_split); 3345 insert_rec, insert->ins_split);
3346
3347 /*
3348 * Split might have modified either leaf and we don't
3349 * have a guarantee that the later edge insert will
3350 * dirty this for us.
3351 */
3352 if (left_path)
3353 ret = ocfs2_journal_dirty(handle,
3354 path_leaf_bh(left_path));
3355 if (ret)
3356 mlog_errno(ret);
3334 } else 3357 } else
3335 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 3358 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3336 insert, inode); 3359 insert, inode);
@@ -3430,6 +3453,17 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3430 mlog_errno(ret); 3453 mlog_errno(ret);
3431 goto out; 3454 goto out;
3432 } 3455 }
3456
3457 /*
3458 * ocfs2_rotate_tree_right() might have extended the
3459 * transaction without re-journaling our tree root.
3460 */
3461 ret = ocfs2_journal_access(handle, inode, di_bh,
3462 OCFS2_JOURNAL_ACCESS_WRITE);
3463 if (ret) {
3464 mlog_errno(ret);
3465 goto out;
3466 }
3433 } else if (type->ins_appending == APPEND_TAIL 3467 } else if (type->ins_appending == APPEND_TAIL
3434 && type->ins_contig != CONTIG_LEFT) { 3468 && type->ins_contig != CONTIG_LEFT) {
3435 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 3469 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
@@ -3941,7 +3975,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3941{ 3975{
3942 int ret = 0; 3976 int ret = 0;
3943 struct ocfs2_extent_list *el = path_leaf_el(path); 3977 struct ocfs2_extent_list *el = path_leaf_el(path);
3944 struct buffer_head *eb_bh, *last_eb_bh = NULL; 3978 struct buffer_head *last_eb_bh = NULL;
3945 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3979 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3946 struct ocfs2_merge_ctxt ctxt; 3980 struct ocfs2_merge_ctxt ctxt;
3947 struct ocfs2_extent_list *rightmost_el; 3981 struct ocfs2_extent_list *rightmost_el;
@@ -3960,14 +3994,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3960 goto out; 3994 goto out;
3961 } 3995 }
3962 3996
3963 eb_bh = path_leaf_bh(path);
3964 ret = ocfs2_journal_access(handle, inode, eb_bh,
3965 OCFS2_JOURNAL_ACCESS_WRITE);
3966 if (ret) {
3967 mlog_errno(ret);
3968 goto out;
3969 }
3970
3971 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, 3997 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3972 split_index, 3998 split_index,
3973 split_rec); 3999 split_rec);
@@ -4029,8 +4055,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4029 mlog_errno(ret); 4055 mlog_errno(ret);
4030 } 4056 }
4031 4057
4032 ocfs2_journal_dirty(handle, eb_bh);
4033
4034out: 4058out:
4035 brelse(last_eb_bh); 4059 brelse(last_eb_bh);
4036 return ret; 4060 return ret;
@@ -4707,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4707 4731
4708 mutex_lock(&data_alloc_inode->i_mutex); 4732 mutex_lock(&data_alloc_inode->i_mutex);
4709 4733
4710 status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 4734 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
4711 if (status < 0) { 4735 if (status < 0) {
4712 mlog_errno(status); 4736 mlog_errno(status);
4713 goto out_mutex; 4737 goto out_mutex;
@@ -4729,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4729 4753
4730out_unlock: 4754out_unlock:
4731 brelse(data_alloc_bh); 4755 brelse(data_alloc_bh);
4732 ocfs2_meta_unlock(data_alloc_inode, 1); 4756 ocfs2_inode_unlock(data_alloc_inode, 1);
4733 4757
4734out_mutex: 4758out_mutex:
4735 mutex_unlock(&data_alloc_inode->i_mutex); 4759 mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5053,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5053 5077
5054 mutex_lock(&inode->i_mutex); 5078 mutex_lock(&inode->i_mutex);
5055 5079
5056 ret = ocfs2_meta_lock(inode, &di_bh, 1); 5080 ret = ocfs2_inode_lock(inode, &di_bh, 1);
5057 if (ret) { 5081 if (ret) {
5058 mlog_errno(ret); 5082 mlog_errno(ret);
5059 goto out_mutex; 5083 goto out_mutex;
@@ -5094,7 +5118,7 @@ out_journal:
5094 ocfs2_commit_trans(osb, handle); 5118 ocfs2_commit_trans(osb, handle);
5095 5119
5096out_unlock: 5120out_unlock:
5097 ocfs2_meta_unlock(inode, 1); 5121 ocfs2_inode_unlock(inode, 1);
5098 brelse(di_bh); 5122 brelse(di_bh);
5099out_mutex: 5123out_mutex:
5100 mutex_unlock(&inode->i_mutex); 5124 mutex_unlock(&inode->i_mutex);
@@ -6093,8 +6117,6 @@ start:
6093 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 6117 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6094 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 6118 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6095 6119
6096 BUG_ON(clusters_to_del == 0);
6097
6098 mutex_lock(&tl_inode->i_mutex); 6120 mutex_lock(&tl_inode->i_mutex);
6099 tl_sem = 1; 6121 tl_sem = 1;
6100 /* ocfs2_truncate_log_needs_flush guarantees us at least one 6122 /* ocfs2_truncate_log_needs_flush guarantees us at least one
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56f7790cad46..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
29 30
30#define MLOG_MASK_PREFIX ML_FILE_IO 31#define MLOG_MASK_PREFIX ML_FILE_IO
31#include <cluster/masklog.h> 32#include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
139{ 140{
140 int err = 0; 141 int err = 0;
141 unsigned int ext_flags; 142 unsigned int ext_flags;
142 u64 p_blkno, past_eof; 143 u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
144 u64 p_blkno, count, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
144 146
145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 147 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
155 goto bail; 157 goto bail;
156 } 158 }
157 159
158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 160 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
159 &ext_flags); 161 &ext_flags);
160 if (err) { 162 if (err) {
161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 163 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
164 goto bail; 166 goto bail;
165 } 167 }
166 168
169 if (max_blocks < count)
170 count = max_blocks;
171
167 /* 172 /*
168 * ocfs2 never allocates in this function - the only time we 173 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno); 184 map_bh(bh_result, inode->i_sb, p_blkno);
180 185
186 bh_result->b_size = count << inode->i_blkbits;
187
181 if (!ocfs2_sparse_alloc(osb)) { 188 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) { 189 if (p_blkno == 0) {
183 err = -EIO; 190 err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh) 217 struct buffer_head *di_bh)
211{ 218{
212 void *kaddr; 219 void *kaddr;
213 unsigned int size; 220 loff_t size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 221 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215 222
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 223 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
224 if (size > PAGE_CACHE_SIZE || 231 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) { 232 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb, 233 ocfs2_error(inode->i_sb,
227 "Inode %llu has with inline data has bad size: %u", 234 "Inode %llu has with inline data has bad size: %Lu",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size); 235 (unsigned long long)OCFS2_I(inode)->ip_blkno,
236 (unsigned long long)size);
229 return -EROFS; 237 return -EROFS;
230 } 238 }
231 239
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
275 283
276 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 284 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
277 285
278 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 286 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
279 if (ret != 0) { 287 if (ret != 0) {
280 if (ret == AOP_TRUNCATED_PAGE) 288 if (ret == AOP_TRUNCATED_PAGE)
281 unlock = 0; 289 unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
285 293
286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 294 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
287 ret = AOP_TRUNCATED_PAGE; 295 ret = AOP_TRUNCATED_PAGE;
288 goto out_meta_unlock; 296 goto out_inode_unlock;
289 } 297 }
290 298
291 /* 299 /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
305 goto out_alloc; 313 goto out_alloc;
306 } 314 }
307 315
308 ret = ocfs2_data_lock_with_page(inode, 0, page);
309 if (ret != 0) {
310 if (ret == AOP_TRUNCATED_PAGE)
311 unlock = 0;
312 mlog_errno(ret);
313 goto out_alloc;
314 }
315
316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page); 317 ret = ocfs2_readpage_inline(inode, page);
318 else 318 else
319 ret = block_read_full_page(page, ocfs2_get_block); 319 ret = block_read_full_page(page, ocfs2_get_block);
320 unlock = 0; 320 unlock = 0;
321 321
322 ocfs2_data_unlock(inode, 0);
323out_alloc: 322out_alloc:
324 up_read(&OCFS2_I(inode)->ip_alloc_sem); 323 up_read(&OCFS2_I(inode)->ip_alloc_sem);
325out_meta_unlock: 324out_inode_unlock:
326 ocfs2_meta_unlock(inode, 0); 325 ocfs2_inode_unlock(inode, 0);
327out: 326out:
328 if (unlock) 327 if (unlock)
329 unlock_page(page); 328 unlock_page(page);
@@ -331,6 +330,62 @@ out:
331 return ret; 330 return ret;
332} 331}
333 332
333/*
334 * This is used only for read-ahead. Failures or difficult to handle
335 * situations are safe to ignore.
336 *
337 * Right now, we don't bother with BH_Boundary - in-inode extent lists
338 * are quite large (243 extents on 4k blocks), so most inodes don't
339 * grow out to a tree. If need be, detecting boundary extents could
340 * trivially be added in a future version of ocfs2_get_block().
341 */
342static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
343 struct list_head *pages, unsigned nr_pages)
344{
345 int ret, err = -EIO;
346 struct inode *inode = mapping->host;
347 struct ocfs2_inode_info *oi = OCFS2_I(inode);
348 loff_t start;
349 struct page *last;
350
351 /*
352 * Use the nonblocking flag for the dlm code to avoid page
353 * lock inversion, but don't bother with retrying.
354 */
355 ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
356 if (ret)
357 return err;
358
359 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
360 ocfs2_inode_unlock(inode, 0);
361 return err;
362 }
363
364 /*
365 * Don't bother with inline-data. There isn't anything
366 * to read-ahead in that case anyway...
367 */
368 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
369 goto out_unlock;
370
371 /*
372 * Check whether a remote node truncated this file - we just
373 * drop out in that case as it's not worth handling here.
374 */
375 last = list_entry(pages->prev, struct page, lru);
376 start = (loff_t)last->index << PAGE_CACHE_SHIFT;
377 if (start >= i_size_read(inode))
378 goto out_unlock;
379
380 err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
381
382out_unlock:
383 up_read(&oi->ip_alloc_sem);
384 ocfs2_inode_unlock(inode, 0);
385
386 return err;
387}
388
334/* Note: Because we don't support holes, our allocation has 389/* Note: Because we don't support holes, our allocation has
335 * already happened (allocation writes zeros to the file data) 390 * already happened (allocation writes zeros to the file data)
336 * so we don't have to worry about ordered writes in 391 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
452 * accessed concurrently from multiple nodes. 507 * accessed concurrently from multiple nodes.
453 */ 508 */
454 if (!INODE_JOURNAL(inode)) { 509 if (!INODE_JOURNAL(inode)) {
455 err = ocfs2_meta_lock(inode, NULL, 0); 510 err = ocfs2_inode_lock(inode, NULL, 0);
456 if (err) { 511 if (err) {
457 if (err != -ENOENT) 512 if (err != -ENOENT)
458 mlog_errno(err); 513 mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
467 522
468 if (!INODE_JOURNAL(inode)) { 523 if (!INODE_JOURNAL(inode)) {
469 up_read(&OCFS2_I(inode)->ip_alloc_sem); 524 up_read(&OCFS2_I(inode)->ip_alloc_sem);
470 ocfs2_meta_unlock(inode, 0); 525 ocfs2_inode_unlock(inode, 0);
471 } 526 }
472 527
473 if (err) { 528 if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0; 694 return 0;
640 695
641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
642 /*
643 * We get PR data locks even for O_DIRECT. This
644 * allows concurrent O_DIRECT I/O but doesn't let
645 * O_DIRECT with extending and buffered zeroing writes
646 * race. If they did race then the buffered zeroing
647 * could be written back after the O_DIRECT I/O. It's
648 * one thing to tell people not to mix buffered and
649 * O_DIRECT writes, but expecting them to understand
650 * that file extension is also an implicit buffered
651 * write is too much. By getting the PR we force
652 * writeback of the buffered zeroing before
653 * proceeding.
654 */
655 ret = ocfs2_data_lock(inode, 0);
656 if (ret < 0) {
657 mlog_errno(ret);
658 goto out;
659 }
660 ocfs2_data_unlock(inode, 0);
661 }
662
663 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 696 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
664 inode->i_sb->s_bdev, iov, offset, 697 inode->i_sb->s_bdev, iov, offset,
665 nr_segs, 698 nr_segs,
666 ocfs2_direct_IO_get_blocks, 699 ocfs2_direct_IO_get_blocks,
667 ocfs2_dio_end_io); 700 ocfs2_dio_end_io);
668out: 701
669 mlog_exit(ret); 702 mlog_exit(ret);
670 return ret; 703 return ret;
671} 704}
@@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1754 struct buffer_head *di_bh = NULL; 1787 struct buffer_head *di_bh = NULL;
1755 struct inode *inode = mapping->host; 1788 struct inode *inode = mapping->host;
1756 1789
1757 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1790 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1758 if (ret) { 1791 if (ret) {
1759 mlog_errno(ret); 1792 mlog_errno(ret);
1760 return ret; 1793 return ret;
@@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1769 */ 1802 */
1770 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1803 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1771 1804
1772 ret = ocfs2_data_lock(inode, 1);
1773 if (ret) {
1774 mlog_errno(ret);
1775 goto out_fail;
1776 }
1777
1778 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1805 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1779 fsdata, di_bh, NULL); 1806 fsdata, di_bh, NULL);
1780 if (ret) { 1807 if (ret) {
1781 mlog_errno(ret); 1808 mlog_errno(ret);
1782 goto out_fail_data; 1809 goto out_fail;
1783 } 1810 }
1784 1811
1785 brelse(di_bh); 1812 brelse(di_bh);
1786 1813
1787 return 0; 1814 return 0;
1788 1815
1789out_fail_data:
1790 ocfs2_data_unlock(inode, 1);
1791out_fail: 1816out_fail:
1792 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1817 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1793 1818
1794 brelse(di_bh); 1819 brelse(di_bh);
1795 ocfs2_meta_unlock(inode, 1); 1820 ocfs2_inode_unlock(inode, 1);
1796 1821
1797 return ret; 1822 return ret;
1798} 1823}
@@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1908 1933
1909 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1934 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1910 1935
1911 ocfs2_data_unlock(inode, 1);
1912 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1936 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1913 ocfs2_meta_unlock(inode, 1); 1937 ocfs2_inode_unlock(inode, 1);
1914 1938
1915 return ret; 1939 return ret;
1916} 1940}
1917 1941
1918const struct address_space_operations ocfs2_aops = { 1942const struct address_space_operations ocfs2_aops = {
1919 .readpage = ocfs2_readpage, 1943 .readpage = ocfs2_readpage,
1944 .readpages = ocfs2_readpages,
1920 .writepage = ocfs2_writepage, 1945 .writepage = ocfs2_writepage,
1921 .write_begin = ocfs2_write_begin, 1946 .write_begin = ocfs2_write_begin,
1922 .write_end = ocfs2_write_end, 1947 .write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
79 * information for this bh as it's not marked locally 79 * information for this bh as it's not marked locally
80 * uptodate. */ 80 * uptodate. */
81 ret = -EIO; 81 ret = -EIO;
82 brelse(bh); 82 put_bh(bh);
83 } 83 }
84 84
85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
256 * for this bh as it's not marked locally 256 * for this bh as it's not marked locally
257 * uptodate. */ 257 * uptodate. */
258 status = -EIO; 258 status = -EIO;
259 brelse(bh); 259 put_bh(bh);
260 bhs[i] = NULL; 260 bhs[i] = NULL;
261 continue; 261 continue;
262 } 262 }
@@ -280,3 +280,64 @@ bail:
280 mlog_exit(status); 280 mlog_exit(status);
281 return status; 281 return status;
282} 282}
283
284/* Check whether the blkno is the super block or one of the backups. */
285static void ocfs2_check_super_or_backup(struct super_block *sb,
286 sector_t blkno)
287{
288 int i;
289 u64 backup_blkno;
290
291 if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
292 return;
293
294 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
295 backup_blkno = ocfs2_backup_super_blkno(sb, i);
296 if (backup_blkno == blkno)
297 return;
298 }
299
300 BUG();
301}
302
303/*
304 * Write super block and backups doesn't need to collaborate with journal,
305 * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed
306 * into this function.
307 */
308int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
309 struct buffer_head *bh)
310{
311 int ret = 0;
312
313 mlog_entry_void();
314
315 BUG_ON(buffer_jbd(bh));
316 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
317
318 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
319 ret = -EROFS;
320 goto out;
321 }
322
323 lock_buffer(bh);
324 set_buffer_uptodate(bh);
325
326 /* remove from dirty list before I/O. */
327 clear_buffer_dirty(bh);
328
329 get_bh(bh); /* for end_buffer_write_sync() */
330 bh->b_end_io = end_buffer_write_sync;
331 submit_bh(WRITE, bh);
332
333 wait_on_buffer(bh);
334
335 if (!buffer_uptodate(bh)) {
336 ret = -EIO;
337 put_bh(bh);
338 }
339
340out:
341 mlog_exit(ret);
342 return ret;
343}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
47 int flags, 47 int flags,
48 struct inode *inode); 48 struct inode *inode);
49 49
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh);
50 52
51#define OCFS2_BH_CACHED 1 53#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 54#define OCFS2_BH_READAHEAD 8
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
35#define O2HB_LIVE_THRESHOLD 2 35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold; 37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7 38#define O2HB_DEFAULT_DEAD_THRESHOLD 31
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ 39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2 40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) 41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df945..23c732f27529 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
146 .kobj = {.ktype = &mlog_ktype}, 146 .kobj = {.ktype = &mlog_ktype},
147}; 147};
148 148
149int mlog_sys_init(struct kset *o2cb_subsys) 149int mlog_sys_init(struct kset *o2cb_kset)
150{ 150{
151 int i = 0; 151 int i = 0;
152 152
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
157 mlog_attr_ptrs[i] = NULL; 157 mlog_attr_ptrs[i] = NULL;
158 158
159 kobject_set_name(&mlog_kset.kobj, "logmask"); 159 kobject_set_name(&mlog_kset.kobj, "logmask");
160 kobj_set_kset_s(&mlog_kset, *o2cb_subsys); 160 mlog_kset.kobj.kset = o2cb_kset;
161 return kset_register(&mlog_kset); 161 return kset_register(&mlog_kset);
162} 162}
163 163
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd09..0c095ce7723d 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/kobject.h> 29#include <linux/kobject.h>
30#include <linux/sysfs.h> 30#include <linux/sysfs.h>
31#include <linux/fs.h>
31 32
32#include "ocfs2_nodemanager.h" 33#include "ocfs2_nodemanager.h"
33#include "masklog.h" 34#include "masklog.h"
34#include "sys.h" 35#include "sys.h"
35 36
36struct o2cb_attribute {
37 struct attribute attr;
38 ssize_t (*show)(char *buf);
39 ssize_t (*store)(const char *buf, size_t count);
40};
41
42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44
45#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
46 37
47static ssize_t o2cb_interface_revision_show(char *buf) 38static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
39 char *buf)
48{ 40{
49 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); 41 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
50} 42}
51 43static struct kobj_attribute attr_version =
52static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); 44 __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
53 45
54static struct attribute *o2cb_attrs[] = { 46static struct attribute *o2cb_attrs[] = {
55 &o2cb_attr_interface_revision.attr, 47 &attr_version.attr,
56 NULL, 48 NULL,
57}; 49};
58 50
59static ssize_t 51static struct attribute_group o2cb_attr_group = {
60o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); 52 .attrs = o2cb_attrs,
61static ssize_t
62o2cb_store(struct kobject * kobj, struct attribute * attr,
63 const char * buffer, size_t count);
64static struct sysfs_ops o2cb_sysfs_ops = {
65 .show = o2cb_show,
66 .store = o2cb_store,
67}; 53};
68 54
69static struct kobj_type o2cb_subsys_type = { 55static struct kset *o2cb_kset;
70 .default_attrs = o2cb_attrs,
71 .sysfs_ops = &o2cb_sysfs_ops,
72};
73
74/* gives us o2cb_subsys */
75static decl_subsys(o2cb, NULL, NULL);
76
77static ssize_t
78o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
79{
80 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
81 struct kset *sbs = to_kset(kobj);
82
83 BUG_ON(sbs != &o2cb_subsys);
84
85 if (o2cb_attr->show)
86 return o2cb_attr->show(buffer);
87 return -EIO;
88}
89
90static ssize_t
91o2cb_store(struct kobject * kobj, struct attribute * attr,
92 const char * buffer, size_t count)
93{
94 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
95 struct kset *sbs = to_kset(kobj);
96
97 BUG_ON(sbs != &o2cb_subsys);
98
99 if (o2cb_attr->store)
100 return o2cb_attr->store(buffer, count);
101 return -EIO;
102}
103 56
104void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
105{ 58{
106 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
107 subsystem_unregister(&o2cb_subsys); 60 kset_unregister(o2cb_kset);
108} 61}
109 62
110int o2cb_sys_init(void) 63int o2cb_sys_init(void)
111{ 64{
112 int ret; 65 int ret;
113 66
114 o2cb_subsys.kobj.ktype = &o2cb_subsys_type; 67 o2cb_kset = kset_create_and_add("o2cb", NULL, NULL);
115 ret = subsystem_register(&o2cb_subsys); 68 if (!o2cb_kset)
69 return -ENOMEM;
70
71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
116 if (ret) 72 if (ret)
117 return ret; 73 goto error;
118 74
119 ret = mlog_sys_init(&o2cb_subsys); 75 ret = mlog_sys_init(o2cb_kset);
120 if (ret) 76 if (ret)
121 subsystem_unregister(&o2cb_subsys); 77 goto error;
78 return 0;
79error:
80 kset_unregister(o2cb_kset);
122 return ret; 81 return ret;
123} 82}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
60/* same as hb delay, we're waiting for another node to recognize our hb */ 60/* same as hb delay, we're waiting for another node to recognize our hb */
61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
62 62
63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
65 65
66 66
67/* TODO: figure this out.... */ 67/* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 10:
42 * - Meta/data locks combined
43 *
44 * New in version 9:
45 * - All votes removed
46 *
41 * New in version 8: 47 * New in version 8:
42 * - Replace delete inode votes with a cluster lock 48 * - Replace delete inode votes with a cluster lock
43 * 49 *
@@ -60,7 +66,7 @@
60 * - full 64 bit i_size in the metadata lock lvbs 66 * - full 64 bit i_size in the metadata lock lvbs
61 * - introduction of "rw" lock and pushing meta/data locking down 67 * - introduction of "rw" lock and pushing meta/data locking down
62 */ 68 */
63#define O2NET_PROTOCOL_VERSION 8ULL 69#define O2NET_PROTOCOL_VERSION 10ULL
64struct o2net_handshake { 70struct o2net_handshake {
65 __be64 protocol_version; 71 __be64 protocol_version;
66 __be64 connector_id; 72 __be64 connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
28 28
29#include "ver.h" 29#include "ver.h"
30 30
31#define CLUSTER_BUILD_VERSION "1.3.3" 31#define CLUSTER_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION 33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 9923278ea6d4..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
128/* 128/*
129 * Walk the inode alias list, and find a dentry which has a given 129 * Walk the inode alias list, and find a dentry which has a given
130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it 130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
131 * is looking for a dentry_lock reference. The vote thread is looking 131 * is looking for a dentry_lock reference. The downconvert thread is
132 * to unhash aliases, so we allow it to skip any that already have 132 * looking to unhash aliases, so we allow it to skip any that already
133 * that property. 133 * have that property.
134 */ 134 */
135struct dentry *ocfs2_find_local_alias(struct inode *inode, 135struct dentry *ocfs2_find_local_alias(struct inode *inode,
136 u64 parent_blkno, 136 u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
266 dl->dl_count = 0; 266 dl->dl_count = 0;
267 /* 267 /*
268 * Does this have to happen below, for all attaches, in case 268 * Does this have to happen below, for all attaches, in case
269 * the struct inode gets blown away by votes? 269 * the struct inode gets blown away by the downconvert thread?
270 */ 270 */
271 dl->dl_inode = igrab(inode); 271 dl->dl_inode = igrab(inode);
272 dl->dl_parent_blkno = parent_blkno; 272 dl->dl_parent_blkno = parent_blkno;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 63b28fdceb4a..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
846 mlog_entry("dirino=%llu\n", 846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno); 847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848 848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 849 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) { 850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime 851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention 852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */ 853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1); 854 ocfs2_inode_unlock(inode, 1);
855 lock_level = 0; 855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0); 856 error = ocfs2_inode_lock(inode, NULL, 0);
857 } 857 }
858 if (error < 0) { 858 if (error < 0) {
859 if (error != -ENOENT) 859 if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL); 866 dirent, filldir, NULL);
867 867
868 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_inode_unlock(inode, lock_level);
869 869
870bail_nolock: 870bail_nolock:
871 mlog_exit(error); 871 mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmfsver.h" 29#include "dlmfsver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2270 } 2270 }
2271 } 2271 }
2272 2272
2273 /* Clean up join state on node death. */
2274 if (dlm->joining_node == idx) {
2275 mlog(0, "Clearing join state for node %u\n", idx);
2276 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2277 }
2278
2273 /* check to see if the node is already considered dead */ 2279 /* check to see if the node is already considered dead */
2274 if (!test_bit(idx, dlm->live_nodes_map)) { 2280 if (!test_bit(idx, dlm->live_nodes_map)) {
2275 mlog(0, "for domain %s, node %d is already dead. " 2281 mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2288 2294
2289 clear_bit(idx, dlm->live_nodes_map); 2295 clear_bit(idx, dlm->live_nodes_map);
2290 2296
2291 /* Clean up join state on node death. */
2292 if (dlm->joining_node == idx) {
2293 mlog(0, "Clearing join state for node %u\n", idx);
2294 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2295 }
2296
2297 /* make sure local cleanup occurs before the heartbeat events */ 2297 /* make sure local cleanup occurs before the heartbeat events */
2298 if (!test_bit(idx, dlm->recovery_map)) 2298 if (!test_bit(idx, dlm->recovery_map))
2299 dlm_do_local_recovery_cleanup(dlm, idx); 2299 dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
2321 if (!dlm_grab(dlm)) 2321 if (!dlm_grab(dlm))
2322 return; 2322 return;
2323 2323
2324 /*
2325 * This will notify any dlm users that a node in our domain
2326 * went away without notifying us first.
2327 */
2328 if (test_bit(idx, dlm->domain_map))
2329 dlm_fire_domain_eviction_callbacks(dlm, idx);
2330
2324 spin_lock(&dlm->spinlock); 2331 spin_lock(&dlm->spinlock);
2325 __dlm_hb_node_down(dlm, idx); 2332 __dlm_hb_node_down(dlm, idx);
2326 spin_unlock(&dlm->spinlock); 2333 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmver.h" 29#include "dlmver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8f..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
55#include "slot_map.h" 55#include "slot_map.h"
56#include "super.h" 56#include "super.h"
57#include "uptodate.h" 57#include "uptodate.h"
58#include "vote.h"
59 58
60#include "buffer_head_io.h" 59#include "buffer_head_io.h"
61 60
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
69 68
70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 69static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
72 72
73/* 73/*
74 * Return value from ->downconvert_worker functions. 74 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); 153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
154 154
155 /* 155 /*
156 * Optionally called in the downconvert (or "vote") thread 156 * Optionally called in the downconvert thread after a
157 * after a successful downconvert. The lockres will not be 157 * successful downconvert. The lockres will not be referenced
158 * referenced after this callback is called, so it is safe to 158 * after this callback is called, so it is safe to free
159 * free memory, etc. 159 * memory, etc.
160 * 160 *
161 * The exact semantics of when this is called are controlled 161 * The exact semantics of when this is called are controlled
162 * by ->downconvert_worker() 162 * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 228static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
229 .get_osb = ocfs2_get_inode_osb, 229 .get_osb = ocfs2_get_inode_osb,
230 .check_downconvert = ocfs2_check_meta_downconvert, 230 .check_downconvert = ocfs2_check_meta_downconvert,
231 .set_lvb = ocfs2_set_meta_lvb, 231 .set_lvb = ocfs2_set_meta_lvb,
232 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
233};
234
235static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
236 .get_osb = ocfs2_get_inode_osb,
237 .downconvert_worker = ocfs2_data_convert_worker, 232 .downconvert_worker = ocfs2_data_convert_worker,
238 .flags = 0, 233 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
239}; 234};
240 235
241static struct ocfs2_lock_res_ops ocfs2_super_lops = { 236static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .flags = 0, 253 .flags = 0,
259}; 254};
260 255
256static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
257 .get_osb = ocfs2_get_file_osb,
258 .flags = 0,
259};
260
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 262{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 263 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
264 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
265 lockres->l_type == OCFS2_LOCK_TYPE_RW || 264 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
266 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 265 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
267} 266}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
310 "resource %s: %s\n", dlm_errname(_stat), _func, \ 309 "resource %s: %s\n", dlm_errname(_stat), _func, \
311 _lockres->l_name, dlm_errmsg(_stat)); \ 310 _lockres->l_name, dlm_errmsg(_stat)); \
312} while (0) 311} while (0)
313static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 312static int ocfs2_downconvert_thread(void *arg);
314 struct ocfs2_lock_res *lockres); 313static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
315static int ocfs2_meta_lock_update(struct inode *inode, 314 struct ocfs2_lock_res *lockres);
315static int ocfs2_inode_lock_update(struct inode *inode,
316 struct buffer_head **bh); 316 struct buffer_head **bh);
317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
318static inline int ocfs2_highest_compat_lock_level(int level); 318static inline int ocfs2_highest_compat_lock_level(int level);
319static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
320 int new_level);
321static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
322 struct ocfs2_lock_res *lockres,
323 int new_level,
324 int lvb);
325static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
326 struct ocfs2_lock_res *lockres);
327static int ocfs2_cancel_convert(struct ocfs2_super *osb,
328 struct ocfs2_lock_res *lockres);
329
319 330
320static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 331static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
321 u64 blkno, 332 u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
402 ops = &ocfs2_inode_rw_lops; 413 ops = &ocfs2_inode_rw_lops;
403 break; 414 break;
404 case OCFS2_LOCK_TYPE_META: 415 case OCFS2_LOCK_TYPE_META:
405 ops = &ocfs2_inode_meta_lops; 416 ops = &ocfs2_inode_inode_lops;
406 break;
407 case OCFS2_LOCK_TYPE_DATA:
408 ops = &ocfs2_inode_data_lops;
409 break; 417 break;
410 case OCFS2_LOCK_TYPE_OPEN: 418 case OCFS2_LOCK_TYPE_OPEN:
411 ops = &ocfs2_inode_open_lops; 419 ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
428 return OCFS2_SB(inode->i_sb); 436 return OCFS2_SB(inode->i_sb);
429} 437}
430 438
439static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
440{
441 struct ocfs2_file_private *fp = lockres->l_priv;
442
443 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
444}
445
431static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) 446static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
432{ 447{
433 __be64 inode_blkno_be; 448 __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
508 &ocfs2_rename_lops, osb); 523 &ocfs2_rename_lops, osb);
509} 524}
510 525
526void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
527 struct ocfs2_file_private *fp)
528{
529 struct inode *inode = fp->fp_file->f_mapping->host;
530 struct ocfs2_inode_info *oi = OCFS2_I(inode);
531
532 ocfs2_lock_res_init_once(lockres);
533 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
534 inode->i_generation, lockres->l_name);
535 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
536 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
537 fp);
538 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
539}
540
511void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 541void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
512{ 542{
513 mlog_entry_void(); 543 mlog_entry_void();
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
724 lockres->l_name, level, lockres->l_level, 754 lockres->l_name, level, lockres->l_level,
725 ocfs2_lock_type_string(lockres->l_type)); 755 ocfs2_lock_type_string(lockres->l_type));
726 756
757 /*
758 * We can skip the bast for locks which don't enable caching -
759 * they'll be dropped at the earliest possible time anyway.
760 */
761 if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
762 return;
763
727 spin_lock_irqsave(&lockres->l_lock, flags); 764 spin_lock_irqsave(&lockres->l_lock, flags);
728 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 765 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
729 if (needs_downconvert) 766 if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
732 769
733 wake_up(&lockres->l_event); 770 wake_up(&lockres->l_event);
734 771
735 ocfs2_kick_vote_thread(osb); 772 ocfs2_wake_downconvert_thread(osb);
736} 773}
737 774
738static void ocfs2_locking_ast(void *opaque) 775static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
935 972
936} 973}
937 974
975static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
976 struct ocfs2_lock_res *lockres)
977{
978 int ret;
979
980 ret = wait_for_completion_interruptible(&mw->mw_complete);
981 if (ret)
982 lockres_remove_mask_waiter(lockres, mw);
983 else
984 ret = mw->mw_status;
985 /* Re-arm the completion in case we want to wait on it again */
986 INIT_COMPLETION(mw->mw_complete);
987 return ret;
988}
989
938static int ocfs2_cluster_lock(struct ocfs2_super *osb, 990static int ocfs2_cluster_lock(struct ocfs2_super *osb,
939 struct ocfs2_lock_res *lockres, 991 struct ocfs2_lock_res *lockres,
940 int level, 992 int level,
@@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1089 mlog_entry_void(); 1141 mlog_entry_void();
1090 spin_lock_irqsave(&lockres->l_lock, flags); 1142 spin_lock_irqsave(&lockres->l_lock, flags);
1091 ocfs2_dec_holders(lockres, level); 1143 ocfs2_dec_holders(lockres, level);
1092 ocfs2_vote_on_unlock(osb, lockres); 1144 ocfs2_downconvert_on_unlock(osb, lockres);
1093 spin_unlock_irqrestore(&lockres->l_lock, flags); 1145 spin_unlock_irqrestore(&lockres->l_lock, flags);
1094 mlog_exit_void(); 1146 mlog_exit_void();
1095} 1147}
@@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1147 * We don't want to use LKM_LOCAL on a meta data lock as they 1199 * We don't want to use LKM_LOCAL on a meta data lock as they
1148 * don't use a generation in their lock names. 1200 * don't use a generation in their lock names.
1149 */ 1201 */
1150 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); 1202 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
1151 if (ret) {
1152 mlog_errno(ret);
1153 goto bail;
1154 }
1155
1156 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1157 if (ret) { 1203 if (ret) {
1158 mlog_errno(ret); 1204 mlog_errno(ret);
1159 goto bail; 1205 goto bail;
@@ -1311,76 +1357,221 @@ out:
1311 mlog_exit_void(); 1357 mlog_exit_void();
1312} 1358}
1313 1359
1314int ocfs2_data_lock_full(struct inode *inode, 1360static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1315 int write, 1361 int level)
1316 int arg_flags)
1317{ 1362{
1318 int status = 0, level; 1363 int ret;
1319 struct ocfs2_lock_res *lockres; 1364 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 unsigned long flags;
1366 struct ocfs2_mask_waiter mw;
1321 1367
1322 BUG_ON(!inode); 1368 ocfs2_init_mask_waiter(&mw);
1323 1369
1324 mlog_entry_void(); 1370retry_cancel:
1371 spin_lock_irqsave(&lockres->l_lock, flags);
1372 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1373 ret = ocfs2_prepare_cancel_convert(osb, lockres);
1374 if (ret) {
1375 spin_unlock_irqrestore(&lockres->l_lock, flags);
1376 ret = ocfs2_cancel_convert(osb, lockres);
1377 if (ret < 0) {
1378 mlog_errno(ret);
1379 goto out;
1380 }
1381 goto retry_cancel;
1382 }
1383 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1384 spin_unlock_irqrestore(&lockres->l_lock, flags);
1325 1385
1326 mlog(0, "inode %llu take %s DATA lock\n", 1386 ocfs2_wait_for_mask(&mw);
1327 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1387 goto retry_cancel;
1328 write ? "EXMODE" : "PRMODE"); 1388 }
1329 1389
1330 /* We'll allow faking a readonly data lock for 1390 ret = -ERESTARTSYS;
1331 * rodevices. */ 1391 /*
1332 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { 1392 * We may still have gotten the lock, in which case there's no
1333 if (write) { 1393 * point to restarting the syscall.
1334 status = -EROFS; 1394 */
1335 mlog_errno(status); 1395 if (lockres->l_level == level)
1396 ret = 0;
1397
1398 mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1399 lockres->l_flags, lockres->l_level, lockres->l_action);
1400
1401 spin_unlock_irqrestore(&lockres->l_lock, flags);
1402
1403out:
1404 return ret;
1405}
1406
1407/*
1408 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1409 * flock() calls. The locking approach this requires is sufficiently
1410 * different from all other cluster lock types that we implement a
1411 * seperate path to the "low-level" dlm calls. In particular:
1412 *
1413 * - No optimization of lock levels is done - we take at exactly
1414 * what's been requested.
1415 *
1416 * - No lock caching is employed. We immediately downconvert to
1417 * no-lock at unlock time. This also means flock locks never go on
1418 * the blocking list).
1419 *
1420 * - Since userspace can trivially deadlock itself with flock, we make
1421 * sure to allow cancellation of a misbehaving applications flock()
1422 * request.
1423 *
1424 * - Access to any flock lockres doesn't require concurrency, so we
1425 * can simplify the code by requiring the caller to guarantee
1426 * serialization of dlmglue flock calls.
1427 */
1428int ocfs2_file_lock(struct file *file, int ex, int trylock)
1429{
1430 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
1431 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
1432 unsigned long flags;
1433 struct ocfs2_file_private *fp = file->private_data;
1434 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1435 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1436 struct ocfs2_mask_waiter mw;
1437
1438 ocfs2_init_mask_waiter(&mw);
1439
1440 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1441 (lockres->l_level > LKM_NLMODE)) {
1442 mlog(ML_ERROR,
1443 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1444 "level: %u\n", lockres->l_name, lockres->l_flags,
1445 lockres->l_level);
1446 return -EINVAL;
1447 }
1448
1449 spin_lock_irqsave(&lockres->l_lock, flags);
1450 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1451 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1452 spin_unlock_irqrestore(&lockres->l_lock, flags);
1453
1454 /*
1455 * Get the lock at NLMODE to start - that way we
1456 * can cancel the upconvert request if need be.
1457 */
1458 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1459 if (ret < 0) {
1460 mlog_errno(ret);
1461 goto out;
1336 } 1462 }
1337 goto out; 1463
1464 ret = ocfs2_wait_for_mask(&mw);
1465 if (ret) {
1466 mlog_errno(ret);
1467 goto out;
1468 }
1469 spin_lock_irqsave(&lockres->l_lock, flags);
1338 } 1470 }
1339 1471
1340 if (ocfs2_mount_local(osb)) 1472 lockres->l_action = OCFS2_AST_CONVERT;
1341 goto out; 1473 lkm_flags |= LKM_CONVERT;
1474 lockres->l_requested = level;
1475 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1342 1476
1343 lockres = &OCFS2_I(inode)->ip_data_lockres; 1477 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1478 spin_unlock_irqrestore(&lockres->l_lock, flags);
1344 1479
1345 level = write ? LKM_EXMODE : LKM_PRMODE; 1480 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
1481 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1482 ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
1483 if (ret != DLM_NORMAL) {
1484 if (trylock && ret == DLM_NOTQUEUED)
1485 ret = -EAGAIN;
1486 else {
1487 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1488 ret = -EINVAL;
1489 }
1346 1490
1347 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 1491 ocfs2_recover_from_dlm_error(lockres, 1);
1348 0, arg_flags); 1492 lockres_remove_mask_waiter(lockres, &mw);
1349 if (status < 0 && status != -EAGAIN) 1493 goto out;
1350 mlog_errno(status); 1494 }
1495
1496 ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
1497 if (ret == -ERESTARTSYS) {
1498 /*
1499 * Userspace can cause deadlock itself with
1500 * flock(). Current behavior locally is to allow the
1501 * deadlock, but abort the system call if a signal is
1502 * received. We follow this example, otherwise a
1503 * poorly written program could sit in kernel until
1504 * reboot.
1505 *
1506 * Handling this is a bit more complicated for Ocfs2
1507 * though. We can't exit this function with an
1508 * outstanding lock request, so a cancel convert is
1509 * required. We intentionally overwrite 'ret' - if the
1510 * cancel fails and the lock was granted, it's easier
1511 * to just bubble sucess back up to the user.
1512 */
1513 ret = ocfs2_flock_handle_signal(lockres, level);
1514 }
1351 1515
1352out: 1516out:
1353 mlog_exit(status); 1517
1354 return status; 1518 mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
1519 lockres->l_name, ex, trylock, ret);
1520 return ret;
1355} 1521}
1356 1522
1357/* see ocfs2_meta_lock_with_page() */ 1523void ocfs2_file_unlock(struct file *file)
1358int ocfs2_data_lock_with_page(struct inode *inode,
1359 int write,
1360 struct page *page)
1361{ 1524{
1362 int ret; 1525 int ret;
1526 unsigned long flags;
1527 struct ocfs2_file_private *fp = file->private_data;
1528 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1529 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1530 struct ocfs2_mask_waiter mw;
1363 1531
1364 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); 1532 ocfs2_init_mask_waiter(&mw);
1365 if (ret == -EAGAIN) { 1533
1366 unlock_page(page); 1534 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1367 if (ocfs2_data_lock(inode, write) == 0) 1535 return;
1368 ocfs2_data_unlock(inode, write); 1536
1369 ret = AOP_TRUNCATED_PAGE; 1537 if (lockres->l_level == LKM_NLMODE)
1538 return;
1539
1540 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
1541 lockres->l_name, lockres->l_flags, lockres->l_level,
1542 lockres->l_action);
1543
1544 spin_lock_irqsave(&lockres->l_lock, flags);
1545 /*
1546 * Fake a blocking ast for the downconvert code.
1547 */
1548 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1549 lockres->l_blocking = LKM_EXMODE;
1550
1551 ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1552 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554
1555 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
1556 if (ret) {
1557 mlog_errno(ret);
1558 return;
1370 } 1559 }
1371 1560
1372 return ret; 1561 ret = ocfs2_wait_for_mask(&mw);
1562 if (ret)
1563 mlog_errno(ret);
1373} 1564}
1374 1565
1375static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 1566static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1376 struct ocfs2_lock_res *lockres) 1567 struct ocfs2_lock_res *lockres)
1377{ 1568{
1378 int kick = 0; 1569 int kick = 0;
1379 1570
1380 mlog_entry_void(); 1571 mlog_entry_void();
1381 1572
1382 /* If we know that another node is waiting on our lock, kick 1573 /* If we know that another node is waiting on our lock, kick
1383 * the vote thread * pre-emptively when we reach a release 1574 * the downconvert thread * pre-emptively when we reach a release
1384 * condition. */ 1575 * condition. */
1385 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1576 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1386 switch(lockres->l_blocking) { 1577 switch(lockres->l_blocking) {
@@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1398 } 1589 }
1399 1590
1400 if (kick) 1591 if (kick)
1401 ocfs2_kick_vote_thread(osb); 1592 ocfs2_wake_downconvert_thread(osb);
1402
1403 mlog_exit_void();
1404}
1405
1406void ocfs2_data_unlock(struct inode *inode,
1407 int write)
1408{
1409 int level = write ? LKM_EXMODE : LKM_PRMODE;
1410 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1411 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1412
1413 mlog_entry_void();
1414
1415 mlog(0, "inode %llu drop %s DATA lock\n",
1416 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1417 write ? "EXMODE" : "PRMODE");
1418
1419 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1420 !ocfs2_mount_local(osb))
1421 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1422 1593
1423 mlog_exit_void(); 1594 mlog_exit_void();
1424} 1595}
@@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
1442 1613
1443/* Call this with the lockres locked. I am reasonably sure we don't 1614/* Call this with the lockres locked. I am reasonably sure we don't
1444 * need ip_lock in this function as anyone who would be changing those 1615 * need ip_lock in this function as anyone who would be changing those
1445 * values is supposed to be blocked in ocfs2_meta_lock right now. */ 1616 * values is supposed to be blocked in ocfs2_inode_lock right now. */
1446static void __ocfs2_stuff_meta_lvb(struct inode *inode) 1617static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1447{ 1618{
1448 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1619 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1449 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1620 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1450 struct ocfs2_meta_lvb *lvb; 1621 struct ocfs2_meta_lvb *lvb;
1451 1622
1452 mlog_entry_void(); 1623 mlog_entry_void();
@@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
1496static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 1667static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1497{ 1668{
1498 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1669 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1499 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1670 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1500 struct ocfs2_meta_lvb *lvb; 1671 struct ocfs2_meta_lvb *lvb;
1501 1672
1502 mlog_entry_void(); 1673 mlog_entry_void();
@@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
1604} 1775}
1605 1776
1606/* may or may not return a bh if it went to disk. */ 1777/* may or may not return a bh if it went to disk. */
1607static int ocfs2_meta_lock_update(struct inode *inode, 1778static int ocfs2_inode_lock_update(struct inode *inode,
1608 struct buffer_head **bh) 1779 struct buffer_head **bh)
1609{ 1780{
1610 int status = 0; 1781 int status = 0;
1611 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1782 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1612 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1783 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1613 struct ocfs2_dinode *fe; 1784 struct ocfs2_dinode *fe;
1614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1615 1786
@@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
1721 * returns < 0 error if the callback will never be called, otherwise 1892 * returns < 0 error if the callback will never be called, otherwise
1722 * the result of the lock will be communicated via the callback. 1893 * the result of the lock will be communicated via the callback.
1723 */ 1894 */
1724int ocfs2_meta_lock_full(struct inode *inode, 1895int ocfs2_inode_lock_full(struct inode *inode,
1725 struct buffer_head **ret_bh, 1896 struct buffer_head **ret_bh,
1726 int ex, 1897 int ex,
1727 int arg_flags) 1898 int arg_flags)
@@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
1756 wait_event(osb->recovery_event, 1927 wait_event(osb->recovery_event,
1757 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1928 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1758 1929
1759 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1930 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1760 level = ex ? LKM_EXMODE : LKM_PRMODE; 1931 level = ex ? LKM_EXMODE : LKM_PRMODE;
1761 dlm_flags = 0; 1932 dlm_flags = 0;
1762 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 1933 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1795,11 +1966,11 @@ local:
1795 } 1966 }
1796 1967
1797 /* This is fun. The caller may want a bh back, or it may 1968 /* This is fun. The caller may want a bh back, or it may
1798 * not. ocfs2_meta_lock_update definitely wants one in, but 1969 * not. ocfs2_inode_lock_update definitely wants one in, but
1799 * may or may not read one, depending on what's in the 1970 * may or may not read one, depending on what's in the
1800 * LVB. The result of all of this is that we've *only* gone to 1971 * LVB. The result of all of this is that we've *only* gone to
1801 * disk if we have to, so the complexity is worthwhile. */ 1972 * disk if we have to, so the complexity is worthwhile. */
1802 status = ocfs2_meta_lock_update(inode, &local_bh); 1973 status = ocfs2_inode_lock_update(inode, &local_bh);
1803 if (status < 0) { 1974 if (status < 0) {
1804 if (status != -ENOENT) 1975 if (status != -ENOENT)
1805 mlog_errno(status); 1976 mlog_errno(status);
@@ -1821,7 +1992,7 @@ bail:
1821 *ret_bh = NULL; 1992 *ret_bh = NULL;
1822 } 1993 }
1823 if (acquired) 1994 if (acquired)
1824 ocfs2_meta_unlock(inode, ex); 1995 ocfs2_inode_unlock(inode, ex);
1825 } 1996 }
1826 1997
1827 if (local_bh) 1998 if (local_bh)
@@ -1832,19 +2003,20 @@ bail:
1832} 2003}
1833 2004
1834/* 2005/*
1835 * This is working around a lock inversion between tasks acquiring DLM locks 2006 * This is working around a lock inversion between tasks acquiring DLM
1836 * while holding a page lock and the vote thread which blocks dlm lock acquiry 2007 * locks while holding a page lock and the downconvert thread which
1837 * while acquiring page locks. 2008 * blocks dlm lock acquiry while acquiring page locks.
1838 * 2009 *
1839 * ** These _with_page variantes are only intended to be called from aop 2010 * ** These _with_page variantes are only intended to be called from aop
1840 * methods that hold page locks and return a very specific *positive* error 2011 * methods that hold page locks and return a very specific *positive* error
1841 * code that aop methods pass up to the VFS -- test for errors with != 0. ** 2012 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1842 * 2013 *
1843 * The DLM is called such that it returns -EAGAIN if it would have blocked 2014 * The DLM is called such that it returns -EAGAIN if it would have
1844 * waiting for the vote thread. In that case we unlock our page so the vote 2015 * blocked waiting for the downconvert thread. In that case we unlock
1845 * thread can make progress. Once we've done this we have to return 2016 * our page so the downconvert thread can make progress. Once we've
1846 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up 2017 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
1847 * into the VFS who will then immediately retry the aop call. 2018 * that called us can bubble that back up into the VFS who will then
2019 * immediately retry the aop call.
1848 * 2020 *
1849 * We do a blocking lock and immediate unlock before returning, though, so that 2021 * We do a blocking lock and immediate unlock before returning, though, so that
1850 * the lock has a great chance of being cached on this node by the time the VFS 2022 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1852,32 +2024,32 @@ bail:
1852 * ping locks back and forth, but that's a risk we're willing to take to avoid 2024 * ping locks back and forth, but that's a risk we're willing to take to avoid
1853 * the lock inversion simply. 2025 * the lock inversion simply.
1854 */ 2026 */
1855int ocfs2_meta_lock_with_page(struct inode *inode, 2027int ocfs2_inode_lock_with_page(struct inode *inode,
1856 struct buffer_head **ret_bh, 2028 struct buffer_head **ret_bh,
1857 int ex, 2029 int ex,
1858 struct page *page) 2030 struct page *page)
1859{ 2031{
1860 int ret; 2032 int ret;
1861 2033
1862 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2034 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
1863 if (ret == -EAGAIN) { 2035 if (ret == -EAGAIN) {
1864 unlock_page(page); 2036 unlock_page(page);
1865 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) 2037 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
1866 ocfs2_meta_unlock(inode, ex); 2038 ocfs2_inode_unlock(inode, ex);
1867 ret = AOP_TRUNCATED_PAGE; 2039 ret = AOP_TRUNCATED_PAGE;
1868 } 2040 }
1869 2041
1870 return ret; 2042 return ret;
1871} 2043}
1872 2044
1873int ocfs2_meta_lock_atime(struct inode *inode, 2045int ocfs2_inode_lock_atime(struct inode *inode,
1874 struct vfsmount *vfsmnt, 2046 struct vfsmount *vfsmnt,
1875 int *level) 2047 int *level)
1876{ 2048{
1877 int ret; 2049 int ret;
1878 2050
1879 mlog_entry_void(); 2051 mlog_entry_void();
1880 ret = ocfs2_meta_lock(inode, NULL, 0); 2052 ret = ocfs2_inode_lock(inode, NULL, 0);
1881 if (ret < 0) { 2053 if (ret < 0) {
1882 mlog_errno(ret); 2054 mlog_errno(ret);
1883 return ret; 2055 return ret;
@@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1890 if (ocfs2_should_update_atime(inode, vfsmnt)) { 2062 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1891 struct buffer_head *bh = NULL; 2063 struct buffer_head *bh = NULL;
1892 2064
1893 ocfs2_meta_unlock(inode, 0); 2065 ocfs2_inode_unlock(inode, 0);
1894 ret = ocfs2_meta_lock(inode, &bh, 1); 2066 ret = ocfs2_inode_lock(inode, &bh, 1);
1895 if (ret < 0) { 2067 if (ret < 0) {
1896 mlog_errno(ret); 2068 mlog_errno(ret);
1897 return ret; 2069 return ret;
@@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1908 return ret; 2080 return ret;
1909} 2081}
1910 2082
1911void ocfs2_meta_unlock(struct inode *inode, 2083void ocfs2_inode_unlock(struct inode *inode,
1912 int ex) 2084 int ex)
1913{ 2085{
1914 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2086 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1915 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; 2087 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
1916 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1917 2089
1918 mlog_entry_void(); 2090 mlog_entry_void();
@@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2320 goto bail; 2492 goto bail;
2321 } 2493 }
2322 2494
2323 /* launch vote thread */ 2495 /* launch downconvert thread */
2324 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); 2496 osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
2325 if (IS_ERR(osb->vote_task)) { 2497 if (IS_ERR(osb->dc_task)) {
2326 status = PTR_ERR(osb->vote_task); 2498 status = PTR_ERR(osb->dc_task);
2327 osb->vote_task = NULL; 2499 osb->dc_task = NULL;
2328 mlog_errno(status); 2500 mlog_errno(status);
2329 goto bail; 2501 goto bail;
2330 } 2502 }
@@ -2353,8 +2525,8 @@ local:
2353bail: 2525bail:
2354 if (status < 0) { 2526 if (status < 0) {
2355 ocfs2_dlm_shutdown_debug(osb); 2527 ocfs2_dlm_shutdown_debug(osb);
2356 if (osb->vote_task) 2528 if (osb->dc_task)
2357 kthread_stop(osb->vote_task); 2529 kthread_stop(osb->dc_task);
2358 } 2530 }
2359 2531
2360 mlog_exit(status); 2532 mlog_exit(status);
@@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2369 2541
2370 ocfs2_drop_osb_locks(osb); 2542 ocfs2_drop_osb_locks(osb);
2371 2543
2372 if (osb->vote_task) { 2544 if (osb->dc_task) {
2373 kthread_stop(osb->vote_task); 2545 kthread_stop(osb->dc_task);
2374 osb->vote_task = NULL; 2546 osb->dc_task = NULL;
2375 } 2547 }
2376 2548
2377 ocfs2_lock_res_free(&osb->osb_super_lockres); 2549 ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2699,7 @@ out:
2527 2699
2528/* Mark the lockres as being dropped. It will no longer be 2700/* Mark the lockres as being dropped. It will no longer be
2529 * queued if blocking, but we still may have to wait on it 2701 * queued if blocking, but we still may have to wait on it
2530 * being dequeued from the vote thread before we can consider 2702 * being dequeued from the downconvert thread before we can consider
2531 * it safe to drop. 2703 * it safe to drop.
2532 * 2704 *
2533 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 2705 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2590 status = err; 2762 status = err;
2591 2763
2592 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2764 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2593 &OCFS2_I(inode)->ip_data_lockres); 2765 &OCFS2_I(inode)->ip_inode_lockres);
2594 if (err < 0)
2595 mlog_errno(err);
2596 if (err < 0 && !status)
2597 status = err;
2598
2599 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2600 &OCFS2_I(inode)->ip_meta_lockres);
2601 if (err < 0) 2766 if (err < 0)
2602 mlog_errno(err); 2767 mlog_errno(err);
2603 if (err < 0 && !status) 2768 if (err < 0 && !status)
@@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2850 inode = ocfs2_lock_res_inode(lockres); 3015 inode = ocfs2_lock_res_inode(lockres);
2851 mapping = inode->i_mapping; 3016 mapping = inode->i_mapping;
2852 3017
3018 if (S_ISREG(inode->i_mode))
3019 goto out;
3020
2853 /* 3021 /*
2854 * We need this before the filemap_fdatawrite() so that it can 3022 * We need this before the filemap_fdatawrite() so that it can
2855 * transfer the dirty bit from the PTE to the 3023 * transfer the dirty bit from the PTE to the
@@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2875 filemap_fdatawait(mapping); 3043 filemap_fdatawait(mapping);
2876 } 3044 }
2877 3045
3046out:
2878 return UNBLOCK_CONTINUE; 3047 return UNBLOCK_CONTINUE;
2879} 3048}
2880 3049
@@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2903 3072
2904/* 3073/*
2905 * Does the final reference drop on our dentry lock. Right now this 3074 * Does the final reference drop on our dentry lock. Right now this
2906 * happens in the vote thread, but we could choose to simplify the 3075 * happens in the downconvert thread, but we could choose to simplify the
2907 * dlmglue API and push these off to the ocfs2_wq in the future. 3076 * dlmglue API and push these off to the ocfs2_wq in the future.
2908 */ 3077 */
2909static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 3078static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3042 mlog(0, "lockres %s blocked.\n", lockres->l_name); 3211 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3043 3212
3044 /* Detect whether a lock has been marked as going away while 3213 /* Detect whether a lock has been marked as going away while
3045 * the vote thread was processing other things. A lock can 3214 * the downconvert thread was processing other things. A lock can
3046 * still be marked with OCFS2_LOCK_FREEING after this check, 3215 * still be marked with OCFS2_LOCK_FREEING after this check,
3047 * but short circuiting here will still save us some 3216 * but short circuiting here will still save us some
3048 * performance. */ 3217 * performance. */
@@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3091 3260
3092 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); 3261 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3093 3262
3094 spin_lock(&osb->vote_task_lock); 3263 spin_lock(&osb->dc_task_lock);
3095 if (list_empty(&lockres->l_blocked_list)) { 3264 if (list_empty(&lockres->l_blocked_list)) {
3096 list_add_tail(&lockres->l_blocked_list, 3265 list_add_tail(&lockres->l_blocked_list,
3097 &osb->blocked_lock_list); 3266 &osb->blocked_lock_list);
3098 osb->blocked_lock_count++; 3267 osb->blocked_lock_count++;
3099 } 3268 }
3100 spin_unlock(&osb->vote_task_lock); 3269 spin_unlock(&osb->dc_task_lock);
3270
3271 mlog_exit_void();
3272}
3273
3274static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
3275{
3276 unsigned long processed;
3277 struct ocfs2_lock_res *lockres;
3278
3279 mlog_entry_void();
3280
3281 spin_lock(&osb->dc_task_lock);
3282 /* grab this early so we know to try again if a state change and
3283 * wake happens part-way through our work */
3284 osb->dc_work_sequence = osb->dc_wake_sequence;
3285
3286 processed = osb->blocked_lock_count;
3287 while (processed) {
3288 BUG_ON(list_empty(&osb->blocked_lock_list));
3289
3290 lockres = list_entry(osb->blocked_lock_list.next,
3291 struct ocfs2_lock_res, l_blocked_list);
3292 list_del_init(&lockres->l_blocked_list);
3293 osb->blocked_lock_count--;
3294 spin_unlock(&osb->dc_task_lock);
3295
3296 BUG_ON(!processed);
3297 processed--;
3298
3299 ocfs2_process_blocked_lock(osb, lockres);
3300
3301 spin_lock(&osb->dc_task_lock);
3302 }
3303 spin_unlock(&osb->dc_task_lock);
3101 3304
3102 mlog_exit_void(); 3305 mlog_exit_void();
3103} 3306}
3307
3308static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
3309{
3310 int empty = 0;
3311
3312 spin_lock(&osb->dc_task_lock);
3313 if (list_empty(&osb->blocked_lock_list))
3314 empty = 1;
3315
3316 spin_unlock(&osb->dc_task_lock);
3317 return empty;
3318}
3319
3320static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
3321{
3322 int should_wake = 0;
3323
3324 spin_lock(&osb->dc_task_lock);
3325 if (osb->dc_work_sequence != osb->dc_wake_sequence)
3326 should_wake = 1;
3327 spin_unlock(&osb->dc_task_lock);
3328
3329 return should_wake;
3330}
3331
3332int ocfs2_downconvert_thread(void *arg)
3333{
3334 int status = 0;
3335 struct ocfs2_super *osb = arg;
3336
3337 /* only quit once we've been asked to stop and there is no more
3338 * work available */
3339 while (!(kthread_should_stop() &&
3340 ocfs2_downconvert_thread_lists_empty(osb))) {
3341
3342 wait_event_interruptible(osb->dc_event,
3343 ocfs2_downconvert_thread_should_wake(osb) ||
3344 kthread_should_stop());
3345
3346 mlog(0, "downconvert_thread: awoken\n");
3347
3348 ocfs2_downconvert_thread_do_work(osb);
3349 }
3350
3351 osb->dc_task = NULL;
3352 return status;
3353}
3354
3355void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
3356{
3357 spin_lock(&osb->dc_task_lock);
3358 /* make sure the voting thread gets a swipe at whatever changes
3359 * the caller may have made to the voting state */
3360 osb->dc_wake_sequence++;
3361 spin_unlock(&osb->dc_task_lock);
3362 wake_up(&osb->dc_event);
3363}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ 52/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 53/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 54#define OCFS2_META_LOCK_RECOVERY (0x01)
55/* Instruct the dlm not to queue ourselves on the other node. */ 55/* Instruct the dlm not to queue ourselves on the other node. */
56#define OCFS2_META_LOCK_NOQUEUE (0x02) 56#define OCFS2_META_LOCK_NOQUEUE (0x02)
57/* don't block waiting for the vote thread, instead return -EAGAIN */ 57/* don't block waiting for the downconvert thread, instead return -EAGAIN */
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
66 struct inode *inode); 66 struct inode *inode);
67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, 67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
68 u64 parent, struct inode *inode); 68 u64 parent, struct inode *inode);
69struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp);
69void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 72void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
70int ocfs2_create_new_inode_locks(struct inode *inode); 73int ocfs2_create_new_inode_locks(struct inode *inode);
71int ocfs2_drop_inode_locks(struct inode *inode); 74int ocfs2_drop_inode_locks(struct inode *inode);
72int ocfs2_data_lock_full(struct inode *inode,
73 int write,
74 int arg_flags);
75#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
76int ocfs2_data_lock_with_page(struct inode *inode,
77 int write,
78 struct page *page);
79void ocfs2_data_unlock(struct inode *inode,
80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 75int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 76void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode); 77int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write); 78int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode); 79void ocfs2_open_unlock(struct inode *inode);
86int ocfs2_meta_lock_atime(struct inode *inode, 80int ocfs2_inode_lock_atime(struct inode *inode,
87 struct vfsmount *vfsmnt, 81 struct vfsmount *vfsmnt,
88 int *level); 82 int *level);
89int ocfs2_meta_lock_full(struct inode *inode, 83int ocfs2_inode_lock_full(struct inode *inode,
90 struct buffer_head **ret_bh, 84 struct buffer_head **ret_bh,
91 int ex, 85 int ex,
92 int arg_flags); 86 int arg_flags);
93int ocfs2_meta_lock_with_page(struct inode *inode, 87int ocfs2_inode_lock_with_page(struct inode *inode,
94 struct buffer_head **ret_bh, 88 struct buffer_head **ret_bh,
95 int ex, 89 int ex,
96 struct page *page); 90 struct page *page);
97/* 99% of the time we don't want to supply any additional flags -- 91/* 99% of the time we don't want to supply any additional flags --
98 * those are for very specific cases only. */ 92 * those are for very specific cases only. */
99#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0) 93#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
100void ocfs2_meta_unlock(struct inode *inode, 94void ocfs2_inode_unlock(struct inode *inode,
101 int ex); 95 int ex);
102int ocfs2_super_lock(struct ocfs2_super *osb, 96int ocfs2_super_lock(struct ocfs2_super *osb,
103 int ex); 97 int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
107void ocfs2_rename_unlock(struct ocfs2_super *osb); 101void ocfs2_rename_unlock(struct ocfs2_super *osb);
108int ocfs2_dentry_lock(struct dentry *dentry, int ex); 102int ocfs2_dentry_lock(struct dentry *dentry, int ex);
109void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 103void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file);
110 106
111void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
112void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
113 struct ocfs2_lock_res *lockres); 109 struct ocfs2_lock_res *lockres);
114 110
115/* for the vote thread */ 111/* for the downconvert thread */
116void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 112void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
117 struct ocfs2_lock_res *lockres); 113 struct ocfs2_lock_res *lockres);
114void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
118 115
119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
37 *var = cpu_to_le64(le64_to_cpu(*var) + val); 37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38} 38}
39 39
40static inline void le32_and_cpu(__le32 *var, u32 val)
41{
42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
43}
44
45static inline void be32_add_cpu(__be32 *var, u32 val) 40static inline void be32_add_cpu(__be32 *var, u32 val)
46{ 41{
47 *var = cpu_to_be32(be32_to_cpu(*var) + val); 42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
58 return ERR_PTR(-ESTALE); 58 return ERR_PTR(-ESTALE);
59 } 59 }
60 60
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); 61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
62 62
63 if (IS_ERR(inode)) 63 if (IS_ERR(inode))
64 return (void *)inode; 64 return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
95 mlog(0, "find parent of directory %llu\n", 95 mlog(0, "find parent of directory %llu\n",
96 (unsigned long long)OCFS2_I(dir)->ip_blkno); 96 (unsigned long long)OCFS2_I(dir)->ip_blkno);
97 97
98 status = ocfs2_meta_lock(dir, NULL, 0); 98 status = ocfs2_inode_lock(dir, NULL, 0);
99 if (status < 0) { 99 if (status < 0) {
100 if (status != -ENOENT) 100 if (status != -ENOENT)
101 mlog_errno(status); 101 mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
109 goto bail_unlock; 109 goto bail_unlock;
110 } 110 }
111 111
112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
113 if (IS_ERR(inode)) { 113 if (IS_ERR(inode)) {
114 mlog(ML_ERROR, "Unable to create inode %llu\n", 114 mlog(ML_ERROR, "Unable to create inode %llu\n",
115 (unsigned long long)blkno); 115 (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
126 parent->d_op = &ocfs2_dentry_ops; 126 parent->d_op = &ocfs2_dentry_ops;
127 127
128bail_unlock: 128bail_unlock:
129 ocfs2_meta_unlock(dir, 0); 129 ocfs2_inode_unlock(dir, 0);
130 130
131bail: 131bail:
132 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b75b2e1f0e42..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
51#include "inode.h" 51#include "inode.h"
52#include "ioctl.h" 52#include "ioctl.h"
53#include "journal.h" 53#include "journal.h"
54#include "locks.h"
54#include "mmap.h" 55#include "mmap.h"
55#include "suballoc.h" 56#include "suballoc.h"
56#include "super.h" 57#include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
63 return sync_mapping_buffers(inode->i_mapping); 64 return sync_mapping_buffers(inode->i_mapping);
64} 65}
65 66
67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68{
69 struct ocfs2_file_private *fp;
70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp)
73 return -ENOMEM;
74
75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp;
79
80 return 0;
81}
82
83static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84{
85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87
88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp);
92 file->private_data = NULL;
93 }
94}
95
66static int ocfs2_file_open(struct inode *inode, struct file *file) 96static int ocfs2_file_open(struct inode *inode, struct file *file)
67{ 97{
68 int status; 98 int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
89 119
90 oi->ip_open_count++; 120 oi->ip_open_count++;
91 spin_unlock(&oi->ip_lock); 121 spin_unlock(&oi->ip_lock);
92 status = 0; 122
123 status = ocfs2_init_file_private(inode, file);
124 if (status) {
125 /*
126 * We want to set open count back if we're failing the
127 * open.
128 */
129 spin_lock(&oi->ip_lock);
130 oi->ip_open_count--;
131 spin_unlock(&oi->ip_lock);
132 }
133
93leave: 134leave:
94 mlog_exit(status); 135 mlog_exit(status);
95 return status; 136 return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
108 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
109 spin_unlock(&oi->ip_lock); 150 spin_unlock(&oi->ip_lock);
110 151
152 ocfs2_free_file_private(inode, file);
153
111 mlog_exit(0); 154 mlog_exit(0);
112 155
113 return 0; 156 return 0;
114} 157}
115 158
159static int ocfs2_dir_open(struct inode *inode, struct file *file)
160{
161 return ocfs2_init_file_private(inode, file);
162}
163
164static int ocfs2_dir_release(struct inode *inode, struct file *file)
165{
166 ocfs2_free_file_private(inode, file);
167 return 0;
168}
169
116static int ocfs2_sync_file(struct file *file, 170static int ocfs2_sync_file(struct file *file,
117 struct dentry *dentry, 171 struct dentry *dentry,
118 int datasync) 172 int datasync)
@@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode,
382 436
383 down_write(&OCFS2_I(inode)->ip_alloc_sem); 437 down_write(&OCFS2_I(inode)->ip_alloc_sem);
384 438
385 /* This forces other nodes to sync and drop their pages. Do 439 /*
386 * this even if we have a truncate without allocation change - 440 * The inode lock forced other nodes to sync and drop their
387 * ocfs2 cluster sizes can be much greater than page size, so 441 * pages, which (correctly) happens even if we have a truncate
388 * we have to truncate them anyway. */ 442 * without allocation change - ocfs2 cluster sizes can be much
389 status = ocfs2_data_lock(inode, 1); 443 * greater than page size, so we have to truncate them
390 if (status < 0) { 444 * anyway.
391 up_write(&OCFS2_I(inode)->ip_alloc_sem); 445 */
392
393 mlog_errno(status);
394 goto bail;
395 }
396
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 446 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 447 truncate_inode_pages(inode->i_mapping, new_i_size);
399 448
@@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode,
403 if (status) 452 if (status)
404 mlog_errno(status); 453 mlog_errno(status);
405 454
406 goto bail_unlock_data; 455 goto bail_unlock_sem;
407 } 456 }
408 457
409 /* alright, we're going to need to do a full blown alloc size 458 /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
413 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 462 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
414 if (status < 0) { 463 if (status < 0) {
415 mlog_errno(status); 464 mlog_errno(status);
416 goto bail_unlock_data; 465 goto bail_unlock_sem;
417 } 466 }
418 467
419 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 468 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
420 if (status < 0) { 469 if (status < 0) {
421 mlog_errno(status); 470 mlog_errno(status);
422 goto bail_unlock_data; 471 goto bail_unlock_sem;
423 } 472 }
424 473
425 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 474 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
426 if (status < 0) { 475 if (status < 0) {
427 mlog_errno(status); 476 mlog_errno(status);
428 goto bail_unlock_data; 477 goto bail_unlock_sem;
429 } 478 }
430 479
431 /* TODO: orphan dir cleanup here. */ 480 /* TODO: orphan dir cleanup here. */
432bail_unlock_data: 481bail_unlock_sem:
433 ocfs2_data_unlock(inode, 1);
434
435 up_write(&OCFS2_I(inode)->ip_alloc_sem); 482 up_write(&OCFS2_I(inode)->ip_alloc_sem);
436 483
437bail: 484bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
579 626
580 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
581 "clusters_to_add = %u, extents_to_split = %u\n", 628 "clusters_to_add = %u, extents_to_split = %u\n",
582 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
583 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); 630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
584 631
585 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
760 le32_to_cpu(fe->i_clusters), 807 le32_to_cpu(fe->i_clusters),
761 (unsigned long long)le64_to_cpu(fe->i_size)); 808 (unsigned long long)le64_to_cpu(fe->i_size));
762 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 809 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
763 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 810 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
764 811
765leave: 812leave:
766 if (handle) { 813 if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
917 struct buffer_head *di_bh, 964 struct buffer_head *di_bh,
918 u64 new_i_size) 965 u64 new_i_size)
919{ 966{
920 int ret = 0, data_locked = 0; 967 int ret = 0;
921 struct ocfs2_inode_info *oi = OCFS2_I(inode); 968 struct ocfs2_inode_info *oi = OCFS2_I(inode);
922 969
923 BUG_ON(!di_bh); 970 BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 990 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
944 goto out_update_size; 991 goto out_update_size;
945 992
946 /*
947 * protect the pages that ocfs2_zero_extend is going to be
948 * pulling into the page cache.. we do this before the
949 * metadata extend so that we don't get into the situation
950 * where we've extended the metadata but can't get the data
951 * lock to zero.
952 */
953 ret = ocfs2_data_lock(inode, 1);
954 if (ret < 0) {
955 mlog_errno(ret);
956 goto out;
957 }
958 data_locked = 1;
959
960 /* 993 /*
961 * The alloc sem blocks people in read/write from reading our 994 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on 995 * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
980 up_write(&oi->ip_alloc_sem); 1013 up_write(&oi->ip_alloc_sem);
981 1014
982 mlog_errno(ret); 1015 mlog_errno(ret);
983 goto out_unlock; 1016 goto out;
984 } 1017 }
985 } 1018 }
986 1019
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
991 1024
992 if (ret < 0) { 1025 if (ret < 0) {
993 mlog_errno(ret); 1026 mlog_errno(ret);
994 goto out_unlock; 1027 goto out;
995 } 1028 }
996 1029
997out_update_size: 1030out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
999 if (ret < 0) 1032 if (ret < 0)
1000 mlog_errno(ret); 1033 mlog_errno(ret);
1001 1034
1002out_unlock:
1003 if (data_locked)
1004 ocfs2_data_unlock(inode, 1);
1005
1006out: 1035out:
1007 return ret; 1036 return ret;
1008} 1037}
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1050 } 1079 }
1051 } 1080 }
1052 1081
1053 status = ocfs2_meta_lock(inode, &bh, 1); 1082 status = ocfs2_inode_lock(inode, &bh, 1);
1054 if (status < 0) { 1083 if (status < 0) {
1055 if (status != -ENOENT) 1084 if (status != -ENOENT)
1056 mlog_errno(status); 1085 mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1102bail_commit: 1131bail_commit:
1103 ocfs2_commit_trans(osb, handle); 1132 ocfs2_commit_trans(osb, handle);
1104bail_unlock: 1133bail_unlock:
1105 ocfs2_meta_unlock(inode, 1); 1134 ocfs2_inode_unlock(inode, 1);
1106bail_unlock_rw: 1135bail_unlock_rw:
1107 if (size_change) 1136 if (size_change)
1108 ocfs2_rw_unlock(inode, 1); 1137 ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1149 1178
1150 mlog_entry_void(); 1179 mlog_entry_void();
1151 1180
1152 ret = ocfs2_meta_lock(inode, NULL, 0); 1181 ret = ocfs2_inode_lock(inode, NULL, 0);
1153 if (ret) { 1182 if (ret) {
1154 if (ret != -ENOENT) 1183 if (ret != -ENOENT)
1155 mlog_errno(ret); 1184 mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1158 1187
1159 ret = generic_permission(inode, mask, NULL); 1188 ret = generic_permission(inode, mask, NULL);
1160 1189
1161 ocfs2_meta_unlock(inode, 0); 1190 ocfs2_inode_unlock(inode, 0);
1162out: 1191out:
1163 mlog_exit(ret); 1192 mlog_exit(ret);
1164 return ret; 1193 return ret;
@@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1630 goto out; 1659 goto out;
1631 } 1660 }
1632 1661
1633 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1662 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1634 if (ret) { 1663 if (ret) {
1635 mlog_errno(ret); 1664 mlog_errno(ret);
1636 goto out_rw_unlock; 1665 goto out_rw_unlock;
@@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1638 1667
1639 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1668 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1640 ret = -EPERM; 1669 ret = -EPERM;
1641 goto out_meta_unlock; 1670 goto out_inode_unlock;
1642 } 1671 }
1643 1672
1644 switch (sr->l_whence) { 1673 switch (sr->l_whence) {
@@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1652 break; 1681 break;
1653 default: 1682 default:
1654 ret = -EINVAL; 1683 ret = -EINVAL;
1655 goto out_meta_unlock; 1684 goto out_inode_unlock;
1656 } 1685 }
1657 sr->l_whence = 0; 1686 sr->l_whence = 0;
1658 1687
@@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1663 || (sr->l_start + llen) < 0 1692 || (sr->l_start + llen) < 0
1664 || (sr->l_start + llen) > max_off) { 1693 || (sr->l_start + llen) > max_off) {
1665 ret = -EINVAL; 1694 ret = -EINVAL;
1666 goto out_meta_unlock; 1695 goto out_inode_unlock;
1667 } 1696 }
1668 size = sr->l_start + sr->l_len; 1697 size = sr->l_start + sr->l_len;
1669 1698
1670 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1699 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1671 if (sr->l_len <= 0) { 1700 if (sr->l_len <= 0) {
1672 ret = -EINVAL; 1701 ret = -EINVAL;
1673 goto out_meta_unlock; 1702 goto out_inode_unlock;
1674 } 1703 }
1675 } 1704 }
1676 1705
@@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1678 ret = __ocfs2_write_remove_suid(inode, di_bh); 1707 ret = __ocfs2_write_remove_suid(inode, di_bh);
1679 if (ret) { 1708 if (ret) {
1680 mlog_errno(ret); 1709 mlog_errno(ret);
1681 goto out_meta_unlock; 1710 goto out_inode_unlock;
1682 } 1711 }
1683 } 1712 }
1684 1713
@@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1704 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1733 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1705 if (ret) { 1734 if (ret) {
1706 mlog_errno(ret); 1735 mlog_errno(ret);
1707 goto out_meta_unlock; 1736 goto out_inode_unlock;
1708 } 1737 }
1709 1738
1710 /* 1739 /*
@@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1714 if (IS_ERR(handle)) { 1743 if (IS_ERR(handle)) {
1715 ret = PTR_ERR(handle); 1744 ret = PTR_ERR(handle);
1716 mlog_errno(ret); 1745 mlog_errno(ret);
1717 goto out_meta_unlock; 1746 goto out_inode_unlock;
1718 } 1747 }
1719 1748
1720 if (change_size && i_size_read(inode) < size) 1749 if (change_size && i_size_read(inode) < size)
@@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1727 1756
1728 ocfs2_commit_trans(osb, handle); 1757 ocfs2_commit_trans(osb, handle);
1729 1758
1730out_meta_unlock: 1759out_inode_unlock:
1731 brelse(di_bh); 1760 brelse(di_bh);
1732 ocfs2_meta_unlock(inode, 1); 1761 ocfs2_inode_unlock(inode, 1);
1733out_rw_unlock: 1762out_rw_unlock:
1734 ocfs2_rw_unlock(inode, 1); 1763 ocfs2_rw_unlock(inode, 1);
1735 1764
@@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1799 * if we need to make modifications here. 1828 * if we need to make modifications here.
1800 */ 1829 */
1801 for(;;) { 1830 for(;;) {
1802 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1831 ret = ocfs2_inode_lock(inode, NULL, meta_level);
1803 if (ret < 0) { 1832 if (ret < 0) {
1804 meta_level = -1; 1833 meta_level = -1;
1805 mlog_errno(ret); 1834 mlog_errno(ret);
@@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1817 * set inode->i_size at the end of a write. */ 1846 * set inode->i_size at the end of a write. */
1818 if (should_remove_suid(dentry)) { 1847 if (should_remove_suid(dentry)) {
1819 if (meta_level == 0) { 1848 if (meta_level == 0) {
1820 ocfs2_meta_unlock(inode, meta_level); 1849 ocfs2_inode_unlock(inode, meta_level);
1821 meta_level = 1; 1850 meta_level = 1;
1822 continue; 1851 continue;
1823 } 1852 }
@@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1886 *ppos = saved_pos; 1915 *ppos = saved_pos;
1887 1916
1888out_unlock: 1917out_unlock:
1889 ocfs2_meta_unlock(inode, meta_level); 1918 ocfs2_inode_unlock(inode, meta_level);
1890 1919
1891out: 1920out:
1892 return ret; 1921 return ret;
@@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2099 /* 2128 /*
2100 * See the comment in ocfs2_file_aio_read() 2129 * See the comment in ocfs2_file_aio_read()
2101 */ 2130 */
2102 ret = ocfs2_meta_lock(inode, NULL, 0); 2131 ret = ocfs2_inode_lock(inode, NULL, 0);
2103 if (ret < 0) { 2132 if (ret < 0) {
2104 mlog_errno(ret); 2133 mlog_errno(ret);
2105 goto bail; 2134 goto bail;
2106 } 2135 }
2107 ocfs2_meta_unlock(inode, 0); 2136 ocfs2_inode_unlock(inode, 0);
2108 2137
2109 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2138 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2110 2139
@@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2160 * like i_size. This allows the checks down below 2189 * like i_size. This allows the checks down below
2161 * generic_file_aio_read() a chance of actually working. 2190 * generic_file_aio_read() a chance of actually working.
2162 */ 2191 */
2163 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2192 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2164 if (ret < 0) { 2193 if (ret < 0) {
2165 mlog_errno(ret); 2194 mlog_errno(ret);
2166 goto bail; 2195 goto bail;
2167 } 2196 }
2168 ocfs2_meta_unlock(inode, lock_level); 2197 ocfs2_inode_unlock(inode, lock_level);
2169 2198
2170 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2199 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2171 if (ret == -EINVAL) 2200 if (ret == -EINVAL)
@@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2204}; 2233};
2205 2234
2206const struct file_operations ocfs2_fops = { 2235const struct file_operations ocfs2_fops = {
2236 .llseek = generic_file_llseek,
2207 .read = do_sync_read, 2237 .read = do_sync_read,
2208 .write = do_sync_write, 2238 .write = do_sync_write,
2209 .mmap = ocfs2_mmap, 2239 .mmap = ocfs2_mmap,
@@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
2216#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2217 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2218#endif 2248#endif
2249 .flock = ocfs2_flock,
2219 .splice_read = ocfs2_file_splice_read, 2250 .splice_read = ocfs2_file_splice_read,
2220 .splice_write = ocfs2_file_splice_write, 2251 .splice_write = ocfs2_file_splice_write,
2221}; 2252};
2222 2253
2223const struct file_operations ocfs2_dops = { 2254const struct file_operations ocfs2_dops = {
2255 .llseek = generic_file_llseek,
2224 .read = generic_read_dir, 2256 .read = generic_read_dir,
2225 .readdir = ocfs2_readdir, 2257 .readdir = ocfs2_readdir,
2226 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open,
2227 .ioctl = ocfs2_ioctl, 2261 .ioctl = ocfs2_ioctl,
2228#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2229 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2230#endif 2264#endif
2265 .flock = ocfs2_flock,
2231}; 2266};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 32extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 33struct ocfs2_alloc_context;
34 34
35struct ocfs2_file_private {
36 struct file *fp_file;
37 struct mutex fp_mutex;
38 struct ocfs2_lock_res fp_flock;
39};
40
35enum ocfs2_alloc_restarted { 41enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0, 42 RESTART_NONE = 0,
37 RESTART_TRANS, 43 RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h> 31#include <linux/kmod.h>
32 32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h> 33#include <dlm/dlmapi.h>
37 34
38#define MLOG_MASK_PREFIX ML_SUPER 35#define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
44#include "heartbeat.h" 41#include "heartbeat.h"
45#include "inode.h" 42#include "inode.h"
46#include "journal.h" 43#include "journal.h"
47#include "vote.h"
48 44
49#include "buffer_head_io.h" 45#include "buffer_head_io.h"
50 46
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 47static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit); 48 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
64void ocfs2_init_node_maps(struct ocfs2_super *osb) 57void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{ 58{
66 spin_lock_init(&osb->node_map_lock); 59 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map); 60 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
71} 62}
72 63
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
87 return; 78 return;
88 } 79 }
89 80
90 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
91 /* If a node is in the umount map, then we've been
92 * expecting him to go down and we know ahead of time
93 * that recovery is not necessary. */
94 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
95 return;
96 }
97
98 ocfs2_recovery_thread(osb, node_num); 81 ocfs2_recovery_thread(osb, node_num);
99
100 ocfs2_remove_node_from_vote_queues(osb, node_num);
101}
102
103static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
104 int node_num,
105 void *data)
106{
107 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
108} 82}
109 83
110/* Called from the dlm when it's about to evict a node. We may also 84/* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
121 ocfs2_do_node_down(node_num, osb); 95 ocfs2_do_node_down(node_num, osb);
122} 96}
123 97
124static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
125 int node_num,
126 void *data)
127{
128 struct ocfs2_super *osb = data;
129
130 BUG_ON(osb->node_num == node_num);
131
132 mlog(0, "node up event for %d\n", node_num);
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) 98void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
137{ 99{
138 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
139 ocfs2_hb_node_down_cb, osb,
140 OCFS2_HB_NODE_DOWN_PRI);
141
142 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
143 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
144
145 /* Not exactly a heartbeat callback, but leads to essentially 100 /* Not exactly a heartbeat callback, but leads to essentially
146 * the same path so we set it up here. */ 101 * the same path so we set it up here. */
147 dlm_setup_eviction_cb(&osb->osb_eviction_cb, 102 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
149 osb); 104 osb);
150} 105}
151 106
152/* Most functions here are just stubs for now... */
153int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
154{
155 int status;
156
157 if (ocfs2_mount_local(osb))
158 return 0;
159
160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) {
162 mlog_errno(status);
163 goto bail;
164 }
165
166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) {
168 mlog_errno(status);
169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 }
171
172bail:
173 return status;
174}
175
176void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
177{
178 if (ocfs2_mount_local(osb))
179 return;
180
181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183}
184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 107void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
186{ 108{
187 int ret; 109 int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
341 263
342 spin_lock(&osb->node_map_lock); 264 spin_lock(&osb->node_map_lock);
343 265
344 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
345
346 if (!test_bit(num, osb->recovery_map.map)) { 266 if (!test_bit(num, osb->recovery_map.map)) {
347 __ocfs2_node_map_set_bit(&osb->recovery_map, num); 267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
348 set = 1; 268 set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb); 32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35 33
36/* node map functions - used to keep track of mounted and in-recovery 34/* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ebb2bbe30f35..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "vote.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
55 54
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
58 u64 fi_blkno; 57 u64 fi_blkno;
59 unsigned long fi_ino; 58 unsigned long fi_ino;
60 unsigned int fi_flags; 59 unsigned int fi_flags;
60 unsigned int fi_sysfile_type;
61}; 61};
62 62
63static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
64
63static int ocfs2_read_locked_inode(struct inode *inode, 65static int ocfs2_read_locked_inode(struct inode *inode,
64 struct ocfs2_find_inode_args *args); 66 struct ocfs2_find_inode_args *args);
65static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
107 oi->ip_attr |= OCFS2_DIRSYNC_FL; 109 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 110}
109 111
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
113 int sysfile_type)
111{ 114{
112 struct inode *inode = NULL; 115 struct inode *inode = NULL;
113 struct super_block *sb = osb->sb; 116 struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
127 args.fi_blkno = blkno; 130 args.fi_blkno = blkno;
128 args.fi_flags = flags; 131 args.fi_flags = flags;
129 args.fi_ino = ino_from_blkno(sb, blkno); 132 args.fi_ino = ino_from_blkno(sb, blkno);
133 args.fi_sysfile_type = sysfile_type;
130 134
131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 135 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
132 ocfs2_init_locked_inode, &args); 136 ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
201 205
202 inode->i_ino = args->fi_ino; 206 inode->i_ino = args->fi_ino;
203 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 207 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
208 if (args->fi_sysfile_type != 0)
209 lockdep_set_class(&inode->i_mutex,
210 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
204 211
205 mlog_exit(0); 212 mlog_exit(0);
206 return 0; 213 return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
322 */ 329 */
323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 330 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
324 331
325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
326 OCFS2_LOCK_TYPE_META, 0, inode); 333 OCFS2_LOCK_TYPE_META, 0, inode);
327 334
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 335 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
333 OCFS2_LOCK_TYPE_RW, inode->i_generation, 340 OCFS2_LOCK_TYPE_RW, inode->i_generation,
334 inode); 341 inode);
335 342
336 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
337 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
338 inode);
339
340 ocfs2_set_inode_flags(inode); 343 ocfs2_set_inode_flags(inode);
341 344
342 status = 0; 345 status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
414 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 417 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
415 generation = osb->fs_generation; 418 generation = osb->fs_generation;
416 419
417 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 420 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
418 OCFS2_LOCK_TYPE_META, 421 OCFS2_LOCK_TYPE_META,
419 generation, inode); 422 generation, inode);
420 423
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
429 mlog_errno(status); 432 mlog_errno(status);
430 return status; 433 return status;
431 } 434 }
432 status = ocfs2_meta_lock(inode, NULL, 0); 435 status = ocfs2_inode_lock(inode, NULL, 0);
433 if (status) { 436 if (status) {
434 make_bad_inode(inode); 437 make_bad_inode(inode);
435 mlog_errno(status); 438 mlog_errno(status);
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 487
485bail: 488bail:
486 if (can_lock) 489 if (can_lock)
487 ocfs2_meta_unlock(inode, 0); 490 ocfs2_inode_unlock(inode, 0);
488 491
489 if (status < 0) 492 if (status < 0)
490 make_bad_inode(inode); 493 make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
586 } 589 }
587 590
588 mutex_lock(&inode_alloc_inode->i_mutex); 591 mutex_lock(&inode_alloc_inode->i_mutex);
589 status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1); 592 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
590 if (status < 0) { 593 if (status < 0) {
591 mutex_unlock(&inode_alloc_inode->i_mutex); 594 mutex_unlock(&inode_alloc_inode->i_mutex);
592 595
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
617 } 620 }
618 621
619 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 622 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
620 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 623 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
621 624
622 status = ocfs2_journal_dirty(handle, di_bh); 625 status = ocfs2_journal_dirty(handle, di_bh);
623 if (status < 0) { 626 if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
635bail_commit: 638bail_commit:
636 ocfs2_commit_trans(osb, handle); 639 ocfs2_commit_trans(osb, handle);
637bail_unlock: 640bail_unlock:
638 ocfs2_meta_unlock(inode_alloc_inode, 1); 641 ocfs2_inode_unlock(inode_alloc_inode, 1);
639 mutex_unlock(&inode_alloc_inode->i_mutex); 642 mutex_unlock(&inode_alloc_inode->i_mutex);
640 brelse(inode_alloc_bh); 643 brelse(inode_alloc_bh);
641bail: 644bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
709 * delete_inode operation. We do this now to avoid races with 712 * delete_inode operation. We do this now to avoid races with
710 * recovery completion on other nodes. */ 713 * recovery completion on other nodes. */
711 mutex_lock(&orphan_dir_inode->i_mutex); 714 mutex_lock(&orphan_dir_inode->i_mutex);
712 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 715 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
713 if (status < 0) { 716 if (status < 0) {
714 mutex_unlock(&orphan_dir_inode->i_mutex); 717 mutex_unlock(&orphan_dir_inode->i_mutex);
715 718
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
718 } 721 }
719 722
720 /* we do this while holding the orphan dir lock because we 723 /* we do this while holding the orphan dir lock because we
721 * don't want recovery being run from another node to vote for 724 * don't want recovery being run from another node to try an
722 * an inode delete on us -- this will result in two nodes 725 * inode delete underneath us -- this will result in two nodes
723 * truncating the same file! */ 726 * truncating the same file! */
724 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 727 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
725 if (status < 0) { 728 if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
733 mlog_errno(status); 736 mlog_errno(status);
734 737
735bail_unlock_dir: 738bail_unlock_dir:
736 ocfs2_meta_unlock(orphan_dir_inode, 1); 739 ocfs2_inode_unlock(orphan_dir_inode, 1);
737 mutex_unlock(&orphan_dir_inode->i_mutex); 740 mutex_unlock(&orphan_dir_inode->i_mutex);
738 brelse(orphan_dir_bh); 741 brelse(orphan_dir_bh);
739bail: 742bail:
@@ -744,7 +747,7 @@ bail:
744} 747}
745 748
746/* There is a series of simple checks that should be done before a 749/* There is a series of simple checks that should be done before a
747 * vote is even considered. Encapsulate those in this function. */ 750 * trylock is even considered. Encapsulate those in this function. */
748static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 751static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
749{ 752{
750 int ret = 0; 753 int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
758 goto bail; 761 goto bail;
759 } 762 }
760 763
761 /* If we're coming from process_vote we can't go into our own 764 /* If we're coming from downconvert_thread we can't go into our own
762 * voting [hello, deadlock city!], so unforuntately we just 765 * voting [hello, deadlock city!], so unforuntately we just
763 * have to skip deleting this guy. That's OK though because 766 * have to skip deleting this guy. That's OK though because
764 * the node who's doing the actual deleting should handle it 767 * the node who's doing the actual deleting should handle it
765 * anyway. */ 768 * anyway. */
766 if (current == osb->vote_task) { 769 if (current == osb->dc_task) {
767 mlog(0, "Skipping delete of %lu because we're currently " 770 mlog(0, "Skipping delete of %lu because we're currently "
768 "in process_vote\n", inode->i_ino); 771 "in downconvert\n", inode->i_ino);
769 goto bail; 772 goto bail;
770 } 773 }
771 774
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
779 goto bail_unlock; 782 goto bail_unlock;
780 } 783 }
781 784
782 /* If we have voted "yes" on the wipe of this inode for 785 /* If we have allowd wipe of this inode for another node, it
783 * another node, it will be marked here so we can safely skip 786 * will be marked here so we can safely skip it. Recovery will
784 * it. Recovery will cleanup any inodes we might inadvertantly 787 * cleanup any inodes we might inadvertantly skip here. */
785 * skip here. */
786 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 788 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
787 mlog(0, "Skipping delete of %lu because another node " 789 mlog(0, "Skipping delete of %lu because another node "
788 "has done this for us.\n", inode->i_ino); 790 "has done this for us.\n", inode->i_ino);
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
929 931
930 /* Lock down the inode. This gives us an up to date view of 932 /* Lock down the inode. This gives us an up to date view of
931 * it's metadata (for verification), and allows us to 933 * it's metadata (for verification), and allows us to
932 * serialize delete_inode votes. 934 * serialize delete_inode on multiple nodes.
933 * 935 *
934 * Even though we might be doing a truncate, we don't take the 936 * Even though we might be doing a truncate, we don't take the
935 * allocation lock here as it won't be needed - nobody will 937 * allocation lock here as it won't be needed - nobody will
936 * have the file open. 938 * have the file open.
937 */ 939 */
938 status = ocfs2_meta_lock(inode, &di_bh, 1); 940 status = ocfs2_inode_lock(inode, &di_bh, 1);
939 if (status < 0) { 941 if (status < 0) {
940 if (status != -ENOENT) 942 if (status != -ENOENT)
941 mlog_errno(status); 943 mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
947 * before we go ahead and wipe the inode. */ 949 * before we go ahead and wipe the inode. */
948 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 950 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
949 if (!wipe || status < 0) { 951 if (!wipe || status < 0) {
950 /* Error and inode busy vote both mean we won't be 952 /* Error and remote inode busy both mean we won't be
951 * removing the inode, so they take almost the same 953 * removing the inode, so they take almost the same
952 * path. */ 954 * path. */
953 if (status < 0) 955 if (status < 0)
954 mlog_errno(status); 956 mlog_errno(status);
955 957
956 /* Someone in the cluster has voted to not wipe this 958 /* Someone in the cluster has disallowed a wipe of
957 * inode, or it was never completely orphaned. Write 959 * this inode, or it was never completely
958 * out the pages and exit now. */ 960 * orphaned. Write out the pages and exit now. */
959 ocfs2_cleanup_delete_inode(inode, 1); 961 ocfs2_cleanup_delete_inode(inode, 1);
960 goto bail_unlock_inode; 962 goto bail_unlock_inode;
961 } 963 }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
981 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 983 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
982 984
983bail_unlock_inode: 985bail_unlock_inode:
984 ocfs2_meta_unlock(inode, 1); 986 ocfs2_inode_unlock(inode, 1);
985 brelse(di_bh); 987 brelse(di_bh);
986bail_unblock: 988bail_unblock:
987 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 989 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
1008 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1010 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1009 "Inode=%lu\n", inode->i_ino); 1011 "Inode=%lu\n", inode->i_ino);
1010 1012
1011 /* For remove delete_inode vote, we hold open lock before, 1013 /* To preven remote deletes we hold open lock before, now it
1012 * now it is time to unlock PR and EX open locks. */ 1014 * is time to unlock PR and EX open locks. */
1013 ocfs2_open_unlock(inode); 1015 ocfs2_open_unlock(inode);
1014 1016
1015 /* Do these before all the other work so that we don't bounce 1017 /* Do these before all the other work so that we don't bounce
1016 * the vote thread while waiting to destroy the locks. */ 1018 * the downconvert thread while waiting to destroy the locks. */
1017 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1019 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1018 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1020 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1019 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1020 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1021 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1021 1022
1022 /* We very well may get a clear_inode before all an inodes 1023 /* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
1039 mlog_errno(status); 1040 mlog_errno(status);
1040 1041
1041 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1042 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1042 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1043 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1043 ocfs2_lock_res_free(&oi->ip_data_lockres);
1044 ocfs2_lock_res_free(&oi->ip_open_lockres); 1044 ocfs2_lock_res_free(&oi->ip_open_lockres);
1045 1045
1046 ocfs2_metadata_cache_purge(inode); 1046 ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1184 } 1184 }
1185 spin_unlock(&OCFS2_I(inode)->ip_lock); 1185 spin_unlock(&OCFS2_I(inode)->ip_lock);
1186 1186
1187 /* Let ocfs2_meta_lock do the work of updating our struct 1187 /* Let ocfs2_inode_lock do the work of updating our struct
1188 * inode for us. */ 1188 * inode for us. */
1189 status = ocfs2_meta_lock(inode, NULL, 0); 1189 status = ocfs2_inode_lock(inode, NULL, 0);
1190 if (status < 0) { 1190 if (status < 0) {
1191 if (status != -ENOENT) 1191 if (status != -ENOENT)
1192 mlog_errno(status); 1192 mlog_errno(status);
1193 goto bail; 1193 goto bail;
1194 } 1194 }
1195 ocfs2_meta_unlock(inode, 0); 1195 ocfs2_inode_unlock(inode, 0);
1196bail: 1196bail:
1197 mlog_exit(status); 1197 mlog_exit(status);
1198 1198
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
34 u64 ip_blkno; 34 u64 ip_blkno;
35 35
36 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
37 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_inode_lockres;
38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres; 38 struct ocfs2_lock_res ip_open_lockres;
40 39
41 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
121void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
122 121
123/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
124#define OCFS2_FI_FLAG_SYSFILE 0x4 123#define OCFS2_FI_FLAG_SYSFILE 0x1
125#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
126struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
126 int sysfile_type);
127int ocfs2_inode_init_private(struct inode *inode); 127int ocfs2_inode_init_private(struct inode *inode);
128int ocfs2_inode_revalidate(struct dentry *dentry); 128int ocfs2_inode_revalidate(struct dentry *dentry);
129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
20 20
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h"
23 24
24#include <linux/ext2_fs.h> 25#include <linux/ext2_fs.h>
25 26
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
27{ 28{
28 int status; 29 int status;
29 30
30 status = ocfs2_meta_lock(inode, NULL, 0); 31 status = ocfs2_inode_lock(inode, NULL, 0);
31 if (status < 0) { 32 if (status < 0) {
32 mlog_errno(status); 33 mlog_errno(status);
33 return status; 34 return status;
34 } 35 }
35 ocfs2_get_inode_flags(OCFS2_I(inode)); 36 ocfs2_get_inode_flags(OCFS2_I(inode));
36 *flags = OCFS2_I(inode)->ip_attr; 37 *flags = OCFS2_I(inode)->ip_attr;
37 ocfs2_meta_unlock(inode, 0); 38 ocfs2_inode_unlock(inode, 0);
38 39
39 mlog_exit(status); 40 mlog_exit(status);
40 return status; 41 return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
52 53
53 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
54 55
55 status = ocfs2_meta_lock(inode, &bh, 1); 56 status = ocfs2_inode_lock(inode, &bh, 1);
56 if (status < 0) { 57 if (status < 0) {
57 mlog_errno(status); 58 mlog_errno(status);
58 goto bail; 59 goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
100 101
101 ocfs2_commit_trans(osb, handle); 102 ocfs2_commit_trans(osb, handle);
102bail_unlock: 103bail_unlock:
103 ocfs2_meta_unlock(inode, 1); 104 ocfs2_inode_unlock(inode, 1);
104bail: 105bail:
105 mutex_unlock(&inode->i_mutex); 106 mutex_unlock(&inode->i_mutex);
106 107
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115 unsigned int cmd, unsigned long arg) 116 unsigned int cmd, unsigned long arg)
116{ 117{
117 unsigned int flags; 118 unsigned int flags;
119 int new_clusters;
118 int status; 120 int status;
119 struct ocfs2_space_resv sr; 121 struct ocfs2_space_resv sr;
122 struct ocfs2_new_group_input input;
120 123
121 switch (cmd) { 124 switch (cmd) {
122 case OCFS2_IOC_GETFLAGS: 125 case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
140 return -EFAULT; 143 return -EFAULT;
141 144
142 return ocfs2_change_file_space(filp, cmd, &sr); 145 return ocfs2_change_file_space(filp, cmd, &sr);
146 case OCFS2_IOC_GROUP_EXTEND:
147 if (!capable(CAP_SYS_RESOURCE))
148 return -EPERM;
149
150 if (get_user(new_clusters, (int __user *)arg))
151 return -EFAULT;
152
153 return ocfs2_group_extend(inode, new_clusters);
154 case OCFS2_IOC_GROUP_ADD:
155 case OCFS2_IOC_GROUP_ADD64:
156 if (!capable(CAP_SYS_RESOURCE))
157 return -EPERM;
158
159 if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
160 return -EFAULT;
161
162 return ocfs2_group_add(inode, &input);
143 default: 163 default:
144 return -ENOTTY; 164 return -ENOTTY;
145 } 165 }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
162 case OCFS2_IOC_RESVSP64: 182 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP: 183 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64: 184 case OCFS2_IOC_UNRESVSP64:
185 case OCFS2_IOC_GROUP_EXTEND:
186 case OCFS2_IOC_GROUP_ADD:
187 case OCFS2_IOC_GROUP_ADD64:
165 break; 188 break;
166 default: 189 default:
167 return -ENOIOCTLCMD; 190 return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f9d01e25298d..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
44#include "localalloc.h" 44#include "localalloc.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h"
48#include "sysfile.h" 47#include "sysfile.h"
49 48
50#include "buffer_head_io.h" 49#include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 102 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104 journal->j_trans_id, flushed); 103 journal->j_trans_id, flushed);
105 104
106 ocfs2_kick_vote_thread(osb); 105 ocfs2_wake_downconvert_thread(osb);
107 wake_up(&journal->j_checkpointed); 106 wake_up(&journal->j_checkpointed);
108finally: 107finally:
109 mlog_exit(status); 108 mlog_exit(status);
@@ -174,6 +173,12 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
174 * transaction. extend_trans will either extend the current handle by 173 * transaction. extend_trans will either extend the current handle by
175 * nblocks, or commit it and start a new one with nblocks credits. 174 * nblocks, or commit it and start a new one with nblocks credits.
176 * 175 *
176 * This might call journal_restart() which will commit dirty buffers
177 * and then restart the transaction. Before calling
178 * ocfs2_extend_trans(), any changed blocks should have been
179 * dirtied. After calling it, all blocks which need to be changed must
180 * go through another set of journal_access/journal_dirty calls.
181 *
177 * WARNING: This will not release any semaphores or disk locks taken 182 * WARNING: This will not release any semaphores or disk locks taken
178 * during the transaction, so make sure they were taken *before* 183 * during the transaction, so make sure they were taken *before*
179 * start_trans or we'll have ordering deadlocks. 184 * start_trans or we'll have ordering deadlocks.
@@ -193,11 +198,15 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
193 198
194 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 199 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
195 200
201#ifdef OCFS2_DEBUG_FS
202 status = 1;
203#else
196 status = journal_extend(handle, nblocks); 204 status = journal_extend(handle, nblocks);
197 if (status < 0) { 205 if (status < 0) {
198 mlog_errno(status); 206 mlog_errno(status);
199 goto bail; 207 goto bail;
200 } 208 }
209#endif
201 210
202 if (status > 0) { 211 if (status > 0) {
203 mlog(0, "journal_extend failed, trying journal_restart\n"); 212 mlog(0, "journal_extend failed, trying journal_restart\n");
@@ -304,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
304 return err; 313 return err;
305} 314}
306 315
307#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 316#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
308 317
309void ocfs2_set_journal_params(struct ocfs2_super *osb) 318void ocfs2_set_journal_params(struct ocfs2_super *osb)
310{ 319{
311 journal_t *journal = osb->journal->j_journal; 320 journal_t *journal = osb->journal->j_journal;
321 unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
322
323 if (osb->osb_commit_interval)
324 commit_interval = osb->osb_commit_interval;
312 325
313 spin_lock(&journal->j_state_lock); 326 spin_lock(&journal->j_state_lock);
314 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 327 journal->j_commit_interval = commit_interval;
315 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 328 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
316 journal->j_flags |= JFS_BARRIER; 329 journal->j_flags |= JFS_BARRIER;
317 else 330 else
@@ -327,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
327 struct ocfs2_dinode *di = NULL; 340 struct ocfs2_dinode *di = NULL;
328 struct buffer_head *bh = NULL; 341 struct buffer_head *bh = NULL;
329 struct ocfs2_super *osb; 342 struct ocfs2_super *osb;
330 int meta_lock = 0; 343 int inode_lock = 0;
331 344
332 mlog_entry_void(); 345 mlog_entry_void();
333 346
@@ -357,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
357 /* Skip recovery waits here - journal inode metadata never 370 /* Skip recovery waits here - journal inode metadata never
358 * changes in a live cluster so it can be considered an 371 * changes in a live cluster so it can be considered an
359 * exception to the rule. */ 372 * exception to the rule. */
360 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 373 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
361 if (status < 0) { 374 if (status < 0) {
362 if (status != -ERESTARTSYS) 375 if (status != -ERESTARTSYS)
363 mlog(ML_ERROR, "Could not get lock on journal!\n"); 376 mlog(ML_ERROR, "Could not get lock on journal!\n");
364 goto done; 377 goto done;
365 } 378 }
366 379
367 meta_lock = 1; 380 inode_lock = 1;
368 di = (struct ocfs2_dinode *)bh->b_data; 381 di = (struct ocfs2_dinode *)bh->b_data;
369 382
370 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 383 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@@ -404,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
404 status = 0; 417 status = 0;
405done: 418done:
406 if (status < 0) { 419 if (status < 0) {
407 if (meta_lock) 420 if (inode_lock)
408 ocfs2_meta_unlock(inode, 1); 421 ocfs2_inode_unlock(inode, 1);
409 if (bh != NULL) 422 if (bh != NULL)
410 brelse(bh); 423 brelse(bh);
411 if (inode) { 424 if (inode) {
@@ -534,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
534 OCFS2_I(inode)->ip_open_count--; 547 OCFS2_I(inode)->ip_open_count--;
535 548
536 /* unlock our journal */ 549 /* unlock our journal */
537 ocfs2_meta_unlock(inode, 1); 550 ocfs2_inode_unlock(inode, 1);
538 551
539 brelse(journal->j_bh); 552 brelse(journal->j_bh);
540 journal->j_bh = NULL; 553 journal->j_bh = NULL;
@@ -873,8 +886,8 @@ restart:
873 ocfs2_super_unlock(osb, 1); 886 ocfs2_super_unlock(osb, 1);
874 887
875 /* We always run recovery on our own orphan dir - the dead 888 /* We always run recovery on our own orphan dir - the dead
876 * node(s) may have voted "no" on an inode delete earlier. A 889 * node(s) may have disallowd a previos inode delete. Re-processing
877 * revote is therefore required. */ 890 * is therefore required. */
878 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 891 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
879 NULL); 892 NULL);
880 893
@@ -963,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
963 } 976 }
964 SET_INODE_JOURNAL(inode); 977 SET_INODE_JOURNAL(inode);
965 978
966 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 979 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
967 if (status < 0) { 980 if (status < 0) {
968 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 981 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
969 if (status != -ERESTARTSYS) 982 if (status != -ERESTARTSYS)
970 mlog(ML_ERROR, "Could not lock journal!\n"); 983 mlog(ML_ERROR, "Could not lock journal!\n");
971 goto done; 984 goto done;
@@ -1037,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1037done: 1050done:
1038 /* drop the lock on this nodes journal */ 1051 /* drop the lock on this nodes journal */
1039 if (got_lock) 1052 if (got_lock)
1040 ocfs2_meta_unlock(inode, 1); 1053 ocfs2_inode_unlock(inode, 1);
1041 1054
1042 if (inode) 1055 if (inode)
1043 iput(inode); 1056 iput(inode);
@@ -1152,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1152 SET_INODE_JOURNAL(inode); 1165 SET_INODE_JOURNAL(inode);
1153 1166
1154 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1167 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1155 status = ocfs2_meta_lock_full(inode, NULL, 1, flags); 1168 status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
1156 if (status < 0) { 1169 if (status < 0) {
1157 if (status != -EAGAIN) 1170 if (status != -EAGAIN)
1158 mlog_errno(status); 1171 mlog_errno(status);
1159 goto bail; 1172 goto bail;
1160 } 1173 }
1161 1174
1162 ocfs2_meta_unlock(inode, 1); 1175 ocfs2_inode_unlock(inode, 1);
1163bail: 1176bail:
1164 if (inode) 1177 if (inode)
1165 iput(inode); 1178 iput(inode);
@@ -1231,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1231 1244
1232 /* Skip bad inodes so that recovery can continue */ 1245 /* Skip bad inodes so that recovery can continue */
1233 iter = ocfs2_iget(p->osb, ino, 1246 iter = ocfs2_iget(p->osb, ino,
1234 OCFS2_FI_FLAG_ORPHAN_RECOVERY); 1247 OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
1235 if (IS_ERR(iter)) 1248 if (IS_ERR(iter))
1236 return 0; 1249 return 0;
1237 1250
@@ -1267,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1267 } 1280 }
1268 1281
1269 mutex_lock(&orphan_dir_inode->i_mutex); 1282 mutex_lock(&orphan_dir_inode->i_mutex);
1270 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); 1283 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
1271 if (status < 0) { 1284 if (status < 0) {
1272 mlog_errno(status); 1285 mlog_errno(status);
1273 goto out; 1286 goto out;
@@ -1277,12 +1290,13 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1277 ocfs2_orphan_filldir); 1290 ocfs2_orphan_filldir);
1278 if (status) { 1291 if (status) {
1279 mlog_errno(status); 1292 mlog_errno(status);
1280 goto out; 1293 goto out_cluster;
1281 } 1294 }
1282 1295
1283 *head = priv.head; 1296 *head = priv.head;
1284 1297
1285 ocfs2_meta_unlock(orphan_dir_inode, 0); 1298out_cluster:
1299 ocfs2_inode_unlock(orphan_dir_inode, 0);
1286out: 1300out:
1287 mutex_unlock(&orphan_dir_inode->i_mutex); 1301 mutex_unlock(&orphan_dir_inode->i_mutex);
1288 iput(orphan_dir_inode); 1302 iput(orphan_dir_inode);
@@ -1369,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1369 iter = oi->ip_next_orphan; 1383 iter = oi->ip_next_orphan;
1370 1384
1371 spin_lock(&oi->ip_lock); 1385 spin_lock(&oi->ip_lock);
1372 /* Delete voting may have set these on the assumption 1386 /* The remote delete code may have set these on the
1373 * that the other node would wipe them successfully. 1387 * assumption that the other node would wipe them
1374 * If they are still in the node's orphan dir, we need 1388 * successfully. If they are still in the node's
1375 * to reset that state. */ 1389 * orphan dir, we need to reset that state. */
1376 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1390 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1377 1391
1378 /* Set the proper information to get us going into 1392 /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
278/* simple file updates like chmod, etc. */ 278/* simple file updates like chmod, etc. */
279#define OCFS2_INODE_UPDATE_CREDITS 1 279#define OCFS2_INODE_UPDATE_CREDITS 1
280 280
281/* group extend. inode update and last group update. */
282#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
283
284/* group add. inode update and the new group update. */
285#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
286
281/* get one bit out of a suballocator: dinode + group descriptor + 287/* get one bit out of a suballocator: dinode + group descriptor +
282 * prev. group desc. if we relink. */ 288 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 289#define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 58ea88b5af36..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
77 77
78/*
79 * Determine how large our local alloc window should be, in bits.
80 *
81 * These values (and the behavior in ocfs2_alloc_should_use_local) have
82 * been chosen so that most allocations, including new block groups go
83 * through local alloc.
84 */
85static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
86{ 79{
87 BUG_ON(osb->s_clustersize_bits < 12); 80 BUG_ON(osb->s_clustersize_bits > 20);
88 81
89 return 2048 >> (osb->s_clustersize_bits - 12); 82 /* Size local alloc windows by the megabyte */
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
90} 84}
91 85
92/* 86/*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
96int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
97{ 91{
98 int la_bits = ocfs2_local_alloc_window_bits(osb); 92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0;
99 94
100 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 95 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
101 return 0; 96 goto bail;
102 97
103 /* la_bits should be at least twice the size (in clusters) of 98 /* la_bits should be at least twice the size (in clusters) of
104 * a new block group. We want to be sure block group 99 * a new block group. We want to be sure block group
105 * allocations go through the local alloc, so allow an 100 * allocations go through the local alloc, so allow an
106 * allocation to take up to half the bitmap. */ 101 * allocation to take up to half the bitmap. */
107 if (bits > (la_bits / 2)) 102 if (bits > (la_bits / 2))
108 return 0; 103 goto bail;
109 104
110 return 1; 105 ret = 1;
106bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
109 return ret;
111} 110}
112 111
113int ocfs2_load_local_alloc(struct ocfs2_super *osb) 112int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
121 120
122 mlog_entry_void(); 121 mlog_entry_void();
123 122
123 if (ocfs2_mount_local(osb))
124 goto bail;
125
126 if (osb->local_alloc_size == 0)
127 goto bail;
128
129 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
130 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
131 "than max possible %u. Using defaults.\n",
132 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
133 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
134 }
135
124 /* read the alloc off disk */ 136 /* read the alloc off disk */
125 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 137 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
126 osb->slot_num); 138 osb->slot_num);
@@ -181,6 +193,9 @@ bail:
181 if (inode) 193 if (inode)
182 iput(inode); 194 iput(inode);
183 195
196 mlog(0, "Local alloc window bits = %d\n",
197 ocfs2_local_alloc_window_bits(osb));
198
184 mlog_exit(status); 199 mlog_exit(status);
185 return status; 200 return status;
186} 201}
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
231 246
232 mutex_lock(&main_bm_inode->i_mutex); 247 mutex_lock(&main_bm_inode->i_mutex);
233 248
234 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 249 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
235 if (status < 0) { 250 if (status < 0) {
236 mlog_errno(status); 251 mlog_errno(status);
237 goto out_mutex; 252 goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
286 if (main_bm_bh) 301 if (main_bm_bh)
287 brelse(main_bm_bh); 302 brelse(main_bm_bh);
288 303
289 ocfs2_meta_unlock(main_bm_inode, 1); 304 ocfs2_inode_unlock(main_bm_inode, 1);
290 305
291out_mutex: 306out_mutex:
292 mutex_unlock(&main_bm_inode->i_mutex); 307 mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
399 414
400 mutex_lock(&main_bm_inode->i_mutex); 415 mutex_lock(&main_bm_inode->i_mutex);
401 416
402 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 417 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
403 if (status < 0) { 418 if (status < 0) {
404 mlog_errno(status); 419 mlog_errno(status);
405 goto out_mutex; 420 goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
424 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
425 440
426out_unlock: 441out_unlock:
427 ocfs2_meta_unlock(main_bm_inode, 1); 442 ocfs2_inode_unlock(main_bm_inode, 1);
428 443
429out_mutex: 444out_mutex:
430 mutex_unlock(&main_bm_inode->i_mutex); 445 mutex_unlock(&main_bm_inode->i_mutex);
@@ -521,6 +536,9 @@ bail:
521 iput(local_alloc_inode); 536 iput(local_alloc_inode);
522 } 537 }
523 538
539 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
540 status);
541
524 mlog_exit(status); 542 mlog_exit(status);
525 return status; 543 return status;
526} 544}
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * locks.c
5 *
6 * Userspace file locking support
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27
28#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "dlmglue.h"
34#include "file.h"
35#include "locks.h"
36
37static int ocfs2_do_flock(struct file *file, struct inode *inode,
38 int cmd, struct file_lock *fl)
39{
40 int ret = 0, level = 0, trylock = 0;
41 struct ocfs2_file_private *fp = file->private_data;
42 struct ocfs2_lock_res *lockres = &fp->fp_flock;
43
44 if (fl->fl_type == F_WRLCK)
45 level = 1;
46 if (!IS_SETLKW(cmd))
47 trylock = 1;
48
49 mutex_lock(&fp->fp_mutex);
50
51 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
52 lockres->l_level > LKM_NLMODE) {
53 int old_level = 0;
54
55 if (lockres->l_level == LKM_EXMODE)
56 old_level = 1;
57
58 if (level == old_level)
59 goto out;
60
61 /*
62 * Converting an existing lock is not guaranteed to be
63 * atomic, so we can get away with simply unlocking
64 * here and allowing the lock code to try at the new
65 * level.
66 */
67
68 flock_lock_file_wait(file,
69 &(struct file_lock){.fl_type = F_UNLCK});
70
71 ocfs2_file_unlock(file);
72 }
73
74 ret = ocfs2_file_lock(file, level, trylock);
75 if (ret) {
76 if (ret == -EAGAIN && trylock)
77 ret = -EWOULDBLOCK;
78 else
79 mlog_errno(ret);
80 goto out;
81 }
82
83 ret = flock_lock_file_wait(file, fl);
84
85out:
86 mutex_unlock(&fp->fp_mutex);
87
88 return ret;
89}
90
91static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
92{
93 int ret;
94 struct ocfs2_file_private *fp = file->private_data;
95
96 mutex_lock(&fp->fp_mutex);
97 ocfs2_file_unlock(file);
98 ret = flock_lock_file_wait(file, fl);
99 mutex_unlock(&fp->fp_mutex);
100
101 return ret;
102}
103
104/*
105 * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
106 */
107int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
108{
109 struct inode *inode = file->f_mapping->host;
110 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
111
112 if (!(fl->fl_flags & FL_FLOCK))
113 return -ENOLCK;
114 if (__mandatory_lock(inode))
115 return -ENOLCK;
116
117 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
118 ocfs2_mount_local(osb))
119 return flock_lock_file_wait(file, fl);
120
121 if (fl->fl_type == F_UNLCK)
122 return ocfs2_do_funlock(file, cmd, fl);
123 else
124 return ocfs2_do_flock(file, inode, cmd, fl);
125}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de31..9743ef2324ec 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
1/* -*- mode: c; c-basic-offset: 8; -*- 1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * vote.h 4 * locks.h
5 * 5 *
6 * description here 6 * Function prototypes for Userspace file locking support
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
@@ -23,26 +23,9 @@
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26#ifndef OCFS2_LOCKS_H
27#define OCFS2_LOCKS_H
26 28
27#ifndef VOTE_H 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
28#define VOTE_H
29 30
30int ocfs2_vote_thread(void *arg); 31#endif /* OCFS2_LOCKS_H */
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
45
46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
47 int node_num);
48#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
168 * node. Taking the data lock will also ensure that we don't 168 * node. Taking the data lock will also ensure that we don't
169 * attempt page truncation as part of a downconvert. 169 * attempt page truncation as part of a downconvert.
170 */ 170 */
171 ret = ocfs2_meta_lock(inode, &di_bh, 1); 171 ret = ocfs2_inode_lock(inode, &di_bh, 1);
172 if (ret < 0) { 172 if (ret < 0) {
173 mlog_errno(ret); 173 mlog_errno(ret);
174 goto out; 174 goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
181 */ 181 */
182 down_write(&OCFS2_I(inode)->ip_alloc_sem); 182 down_write(&OCFS2_I(inode)->ip_alloc_sem);
183 183
184 ret = ocfs2_data_lock(inode, 1);
185 if (ret < 0) {
186 mlog_errno(ret);
187 goto out_meta_unlock;
188 }
189
190 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 184 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
191 185
192 ocfs2_data_unlock(inode, 1);
193
194out_meta_unlock:
195 up_write(&OCFS2_I(inode)->ip_alloc_sem); 186 up_write(&OCFS2_I(inode)->ip_alloc_sem);
196 187
197 brelse(di_bh); 188 brelse(di_bh);
198 ocfs2_meta_unlock(inode, 1); 189 ocfs2_inode_unlock(inode, 1);
199 190
200out: 191out:
201 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 192 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
214{ 205{
215 int ret = 0, lock_level = 0; 206 int ret = 0, lock_level = 0;
216 207
217 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 208 ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
218 file->f_vfsmnt, &lock_level); 209 file->f_vfsmnt, &lock_level);
219 if (ret < 0) { 210 if (ret < 0) {
220 mlog_errno(ret); 211 mlog_errno(ret);
221 goto out; 212 goto out;
222 } 213 }
223 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 214 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
224out: 215out:
225 vma->vm_ops = &ocfs2_file_vm_ops; 216 vma->vm_ops = &ocfs2_file_vm_ops;
226 vma->vm_flags |= VM_CAN_NONLINEAR; 217 vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 989ac2718587..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "vote.h"
64 63
65#include "buffer_head_io.h" 64#include "buffer_head_io.h"
66 65
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
116 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 115 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
117 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 116 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
118 117
119 status = ocfs2_meta_lock(dir, NULL, 0); 118 status = ocfs2_inode_lock(dir, NULL, 0);
120 if (status < 0) { 119 if (status < 0) {
121 if (status != -ENOENT) 120 if (status != -ENOENT)
122 mlog_errno(status); 121 mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
129 if (status < 0) 128 if (status < 0)
130 goto bail_add; 129 goto bail_add;
131 130
132 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 131 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
133 if (IS_ERR(inode)) { 132 if (IS_ERR(inode)) {
134 ret = ERR_PTR(-EACCES); 133 ret = ERR_PTR(-EACCES);
135 goto bail_unlock; 134 goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
176 /* Don't drop the cluster lock until *after* the d_add -- 175 /* Don't drop the cluster lock until *after* the d_add --
177 * unlink on another node will message us to remove that 176 * unlink on another node will message us to remove that
178 * dentry under this lock so otherwise we can race this with 177 * dentry under this lock so otherwise we can race this with
179 * the vote thread and have a stale dentry. */ 178 * the downconvert thread and have a stale dentry. */
180 ocfs2_meta_unlock(dir, 0); 179 ocfs2_inode_unlock(dir, 0);
181 180
182bail: 181bail:
183 182
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
209 /* get our super block */ 208 /* get our super block */
210 osb = OCFS2_SB(dir->i_sb); 209 osb = OCFS2_SB(dir->i_sb);
211 210
212 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 211 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
213 if (status < 0) { 212 if (status < 0) {
214 if (status != -ENOENT) 213 if (status != -ENOENT)
215 mlog_errno(status); 214 mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
323 if (handle) 322 if (handle)
324 ocfs2_commit_trans(osb, handle); 323 ocfs2_commit_trans(osb, handle);
325 324
326 ocfs2_meta_unlock(dir, 1); 325 ocfs2_inode_unlock(dir, 1);
327 326
328 if (status == -ENOSPC) 327 if (status == -ENOSPC)
329 mlog(0, "Disk is full\n"); 328 mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
553 if (S_ISDIR(inode->i_mode)) 552 if (S_ISDIR(inode->i_mode))
554 return -EPERM; 553 return -EPERM;
555 554
556 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 555 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
557 if (err < 0) { 556 if (err < 0) {
558 if (err != -ENOENT) 557 if (err != -ENOENT)
559 mlog_errno(err); 558 mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
578 goto out; 577 goto out;
579 } 578 }
580 579
581 err = ocfs2_meta_lock(inode, &fe_bh, 1); 580 err = ocfs2_inode_lock(inode, &fe_bh, 1);
582 if (err < 0) { 581 if (err < 0) {
583 if (err != -ENOENT) 582 if (err != -ENOENT)
584 mlog_errno(err); 583 mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
643out_commit: 642out_commit:
644 ocfs2_commit_trans(osb, handle); 643 ocfs2_commit_trans(osb, handle);
645out_unlock_inode: 644out_unlock_inode:
646 ocfs2_meta_unlock(inode, 1); 645 ocfs2_inode_unlock(inode, 1);
647 646
648out: 647out:
649 ocfs2_meta_unlock(dir, 1); 648 ocfs2_inode_unlock(dir, 1);
650 649
651 if (de_bh) 650 if (de_bh)
652 brelse(de_bh); 651 brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
720 return -EPERM; 719 return -EPERM;
721 } 720 }
722 721
723 status = ocfs2_meta_lock(dir, &parent_node_bh, 1); 722 status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
724 if (status < 0) { 723 if (status < 0) {
725 if (status != -ENOENT) 724 if (status != -ENOENT)
726 mlog_errno(status); 725 mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
745 goto leave; 744 goto leave;
746 } 745 }
747 746
748 status = ocfs2_meta_lock(inode, &fe_bh, 1); 747 status = ocfs2_inode_lock(inode, &fe_bh, 1);
749 if (status < 0) { 748 if (status < 0) {
750 if (status != -ENOENT) 749 if (status != -ENOENT)
751 mlog_errno(status); 750 mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
765 764
766 status = ocfs2_remote_dentry_delete(dentry); 765 status = ocfs2_remote_dentry_delete(dentry);
767 if (status < 0) { 766 if (status < 0) {
768 /* This vote should succeed under all normal 767 /* This remote delete should succeed under all normal
769 * circumstances. */ 768 * circumstances. */
770 mlog_errno(status); 769 mlog_errno(status);
771 goto leave; 770 goto leave;
@@ -841,13 +840,13 @@ leave:
841 ocfs2_commit_trans(osb, handle); 840 ocfs2_commit_trans(osb, handle);
842 841
843 if (child_locked) 842 if (child_locked)
844 ocfs2_meta_unlock(inode, 1); 843 ocfs2_inode_unlock(inode, 1);
845 844
846 ocfs2_meta_unlock(dir, 1); 845 ocfs2_inode_unlock(dir, 1);
847 846
848 if (orphan_dir) { 847 if (orphan_dir) {
849 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 848 /* This was locked for us in ocfs2_prepare_orphan_dir() */
850 ocfs2_meta_unlock(orphan_dir, 1); 849 ocfs2_inode_unlock(orphan_dir, 1);
851 mutex_unlock(&orphan_dir->i_mutex); 850 mutex_unlock(&orphan_dir->i_mutex);
852 iput(orphan_dir); 851 iput(orphan_dir);
853 } 852 }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
908 inode1 = tmpinode; 907 inode1 = tmpinode;
909 } 908 }
910 /* lock id2 */ 909 /* lock id2 */
911 status = ocfs2_meta_lock(inode2, bh2, 1); 910 status = ocfs2_inode_lock(inode2, bh2, 1);
912 if (status < 0) { 911 if (status < 0) {
913 if (status != -ENOENT) 912 if (status != -ENOENT)
914 mlog_errno(status); 913 mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
917 } 916 }
918 917
919 /* lock id1 */ 918 /* lock id1 */
920 status = ocfs2_meta_lock(inode1, bh1, 1); 919 status = ocfs2_inode_lock(inode1, bh1, 1);
921 if (status < 0) { 920 if (status < 0) {
922 /* 921 /*
923 * An error return must mean that no cluster locks 922 * An error return must mean that no cluster locks
924 * were held on function exit. 923 * were held on function exit.
925 */ 924 */
926 if (oi1->ip_blkno != oi2->ip_blkno) 925 if (oi1->ip_blkno != oi2->ip_blkno)
927 ocfs2_meta_unlock(inode2, 1); 926 ocfs2_inode_unlock(inode2, 1);
928 927
929 if (status != -ENOENT) 928 if (status != -ENOENT)
930 mlog_errno(status); 929 mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
937 936
938static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) 937static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
939{ 938{
940 ocfs2_meta_unlock(inode1, 1); 939 ocfs2_inode_unlock(inode1, 1);
941 940
942 if (inode1 != inode2) 941 if (inode1 != inode2)
943 ocfs2_meta_unlock(inode2, 1); 942 ocfs2_inode_unlock(inode2, 1);
944} 943}
945 944
946static int ocfs2_rename(struct inode *old_dir, 945static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
1031 1030
1032 /* 1031 /*
1033 * Aside from allowing a meta data update, the locking here 1032 * Aside from allowing a meta data update, the locking here
1034 * also ensures that the vote thread on other nodes won't have 1033 * also ensures that the downconvert thread on other nodes
1035 * to concurrently downconvert the inode and the dentry locks. 1034 * won't have to concurrently downconvert the inode and the
1035 * dentry locks.
1036 */ 1036 */
1037 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); 1037 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
1038 if (status < 0) { 1038 if (status < 0) {
1039 if (status != -ENOENT) 1039 if (status != -ENOENT)
1040 mlog_errno(status); 1040 mlog_errno(status);
@@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
1143 goto bail; 1143 goto bail;
1144 } 1144 }
1145 1145
1146 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); 1146 status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
1147 if (status < 0) { 1147 if (status < 0) {
1148 if (status != -ENOENT) 1148 if (status != -ENOENT)
1149 mlog_errno(status); 1149 mlog_errno(status);
@@ -1355,14 +1355,14 @@ bail:
1355 ocfs2_double_unlock(old_dir, new_dir); 1355 ocfs2_double_unlock(old_dir, new_dir);
1356 1356
1357 if (old_child_locked) 1357 if (old_child_locked)
1358 ocfs2_meta_unlock(old_inode, 1); 1358 ocfs2_inode_unlock(old_inode, 1);
1359 1359
1360 if (new_child_locked) 1360 if (new_child_locked)
1361 ocfs2_meta_unlock(new_inode, 1); 1361 ocfs2_inode_unlock(new_inode, 1);
1362 1362
1363 if (orphan_dir) { 1363 if (orphan_dir) {
1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1365 ocfs2_meta_unlock(orphan_dir, 1); 1365 ocfs2_inode_unlock(orphan_dir, 1);
1366 mutex_unlock(&orphan_dir->i_mutex); 1366 mutex_unlock(&orphan_dir->i_mutex);
1367 iput(orphan_dir); 1367 iput(orphan_dir);
1368 } 1368 }
@@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
1530 credits = ocfs2_calc_symlink_credits(sb); 1530 credits = ocfs2_calc_symlink_credits(sb);
1531 1531
1532 /* lock the parent directory */ 1532 /* lock the parent directory */
1533 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 1533 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
1534 if (status < 0) { 1534 if (status < 0) {
1535 if (status != -ENOENT) 1535 if (status != -ENOENT)
1536 mlog_errno(status); 1536 mlog_errno(status);
@@ -1657,7 +1657,7 @@ bail:
1657 if (handle) 1657 if (handle)
1658 ocfs2_commit_trans(osb, handle); 1658 ocfs2_commit_trans(osb, handle);
1659 1659
1660 ocfs2_meta_unlock(dir, 1); 1660 ocfs2_inode_unlock(dir, 1);
1661 1661
1662 if (new_fe_bh) 1662 if (new_fe_bh)
1663 brelse(new_fe_bh); 1663 brelse(new_fe_bh);
@@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1735 1735
1736 mutex_lock(&orphan_dir_inode->i_mutex); 1736 mutex_lock(&orphan_dir_inode->i_mutex);
1737 1737
1738 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1738 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1739 if (status < 0) { 1739 if (status < 0) {
1740 mlog_errno(status); 1740 mlog_errno(status);
1741 goto leave; 1741 goto leave;
@@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1745 orphan_dir_bh, name, 1745 orphan_dir_bh, name,
1746 OCFS2_ORPHAN_NAMELEN, de_bh); 1746 OCFS2_ORPHAN_NAMELEN, de_bh);
1747 if (status < 0) { 1747 if (status < 0) {
1748 ocfs2_meta_unlock(orphan_dir_inode, 1); 1748 ocfs2_inode_unlock(orphan_dir_inode, 1);
1749 1749
1750 mlog_errno(status); 1750 mlog_errno(status);
1751 goto leave; 1751 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
101 * about to be 101 * about to be
102 * dropped. */ 102 * dropped. */
103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
104#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
104 105
105struct ocfs2_lock_res_ops; 106struct ocfs2_lock_res_ops;
106 107
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
170 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 171 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
171 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 172 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
172 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 173 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
174 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
173}; 175};
174 176
175#define OCFS2_OSB_SOFT_RO 0x0001 177#define OCFS2_OSB_SOFT_RO 0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
189 struct ocfs2_slot_info *slot_info; 191 struct ocfs2_slot_info *slot_info;
190 192
191 spinlock_t node_map_lock; 193 spinlock_t node_map_lock;
192 struct ocfs2_node_map mounted_map;
193 struct ocfs2_node_map recovery_map; 194 struct ocfs2_node_map recovery_map;
194 struct ocfs2_node_map umount_map;
195 195
196 u64 root_blkno; 196 u64 root_blkno;
197 u64 system_dir_blkno; 197 u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
231 wait_queue_head_t checkpoint_event; 231 wait_queue_head_t checkpoint_event;
232 atomic_t needs_checkpoint; 232 atomic_t needs_checkpoint;
233 struct ocfs2_journal *journal; 233 struct ocfs2_journal *journal;
234 unsigned long osb_commit_interval;
234 235
236 int local_alloc_size;
235 enum ocfs2_local_alloc_state local_alloc_state; 237 enum ocfs2_local_alloc_state local_alloc_state;
236 struct buffer_head *local_alloc_bh; 238 struct buffer_head *local_alloc_bh;
237 u64 la_last_gd; 239 u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
254 256
255 wait_queue_head_t recovery_event; 257 wait_queue_head_t recovery_event;
256 258
257 spinlock_t vote_task_lock; 259 spinlock_t dc_task_lock;
258 struct task_struct *vote_task; 260 struct task_struct *dc_task;
259 wait_queue_head_t vote_event; 261 wait_queue_head_t dc_event;
260 unsigned long vote_wake_sequence; 262 unsigned long dc_wake_sequence;
261 unsigned long vote_work_sequence; 263 unsigned long dc_work_sequence;
262 264
265 /*
266 * Any thread can add locks to the list, but the downconvert
267 * thread is the only one allowed to remove locks. Any change
268 * to this rule requires updating
269 * ocfs2_downconvert_thread_do_work().
270 */
263 struct list_head blocked_lock_list; 271 struct list_head blocked_lock_list;
264 unsigned long blocked_lock_count; 272 unsigned long blocked_lock_count;
265 273
266 struct list_head vote_list;
267 int vote_count;
268
269 u32 net_key;
270 spinlock_t net_response_lock;
271 unsigned int net_response_ids;
272 struct list_head net_response_list;
273
274 struct o2hb_callback_func osb_hb_up;
275 struct o2hb_callback_func osb_hb_down;
276
277 struct list_head osb_net_handlers;
278
279 wait_queue_head_t osb_mount_event; 274 wait_queue_head_t osb_mount_event;
280 275
281 /* Truncate log info */ 276 /* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) 231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) 232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
233 233
234/* Used to pass group descriptor data when online resize is done */
235struct ocfs2_new_group_input {
236 __u64 group; /* Group descriptor's blkno. */
237 __u32 clusters; /* Total number of clusters in this group */
238 __u32 frees; /* Total free clusters in this group */
239 __u16 chain; /* Chain for this group */
240 __u16 reserved1;
241 __u32 reserved2;
242};
243
244#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
245#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
246#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
247
234/* 248/*
235 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 249 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
236 */ 250 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
256/* Journal limits (in bytes) */ 270/* Journal limits (in bytes) */
257#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
258 272
273/*
274 * Default local alloc size (in megabytes)
275 *
276 * The value chosen should be such that most allocations, including new
277 * block groups, use local alloc.
278 */
279#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
280
259struct ocfs2_system_inode_info { 281struct ocfs2_system_inode_info {
260 char *si_name; 282 char *si_name;
261 int si_iflags; 283 int si_iflags;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK,
48 OCFS2_NUM_LOCK_TYPES 49 OCFS2_NUM_LOCK_TYPES
49}; 50};
50 51
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
73 case OCFS2_LOCK_TYPE_OPEN: 74 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O'; 75 c = 'O';
75 break; 76 break;
77 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F';
79 break;
76 default: 80 default:
77 c = '\0'; 81 c = '\0';
78 } 82 }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
90 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 94 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open", 96 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
93}; 98};
94 99
95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..37835ffcb039
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.c
5 *
6 * volume resize.
7 * Inspired by ext3/resize.c.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "dlmglue.h"
37#include "inode.h"
38#include "journal.h"
39#include "super.h"
40#include "sysfile.h"
41#include "uptodate.h"
42
43#include "buffer_head_io.h"
44#include "suballoc.h"
45#include "resize.h"
46
47/*
48 * Check whether there are new backup superblocks exist
49 * in the last group. If there are some, mark them or clear
50 * them in the bitmap.
51 *
52 * Return how many backups we find in the last group.
53 */
54static u16 ocfs2_calc_new_backup_super(struct inode *inode,
55 struct ocfs2_group_desc *gd,
56 int new_clusters,
57 u32 first_new_cluster,
58 u16 cl_cpg,
59 int set)
60{
61 int i;
62 u16 backups = 0;
63 u32 cluster;
64 u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
65
66 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
67 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
68 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
69
70 gd_blkno = ocfs2_which_cluster_group(inode, cluster);
71 if (gd_blkno < lgd_blkno)
72 continue;
73 else if (gd_blkno > lgd_blkno)
74 break;
75
76 if (set)
77 ocfs2_set_bit(cluster % cl_cpg,
78 (unsigned long *)gd->bg_bitmap);
79 else
80 ocfs2_clear_bit(cluster % cl_cpg,
81 (unsigned long *)gd->bg_bitmap);
82 backups++;
83 }
84
85 mlog_exit_void();
86 return backups;
87}
88
89static int ocfs2_update_last_group_and_inode(handle_t *handle,
90 struct inode *bm_inode,
91 struct buffer_head *bm_bh,
92 struct buffer_head *group_bh,
93 u32 first_new_cluster,
94 int new_clusters)
95{
96 int ret = 0;
97 struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
98 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
99 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
100 struct ocfs2_chain_rec *cr;
101 struct ocfs2_group_desc *group;
102 u16 chain, num_bits, backups = 0;
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster);
108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) {
112 mlog_errno(ret);
113 goto out;
114 }
115
116 group = (struct ocfs2_group_desc *)group_bh->b_data;
117
118 /* update the group first. */
119 num_bits = new_clusters * cl_bpc;
120 le16_add_cpu(&group->bg_bits, num_bits);
121 le16_add_cpu(&group->bg_free_bits_count, num_bits);
122
123 /*
124 * check whether there are some new backup superblocks exist in
125 * this group and update the group bitmap accordingly.
126 */
127 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
128 OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
129 backups = ocfs2_calc_new_backup_super(bm_inode,
130 group,
131 new_clusters,
132 first_new_cluster,
133 cl_cpg, 1);
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 }
136
137 ret = ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142
143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out_rollback;
149 }
150
151 chain = le16_to_cpu(group->bg_chain);
152 cr = (&cl->cl_recs[chain]);
153 le32_add_cpu(&cr->c_total, num_bits);
154 le32_add_cpu(&cr->c_free, num_bits);
155 le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
156 le32_add_cpu(&fe->i_clusters, new_clusters);
157
158 if (backups) {
159 le32_add_cpu(&cr->c_free, -1 * backups);
160 le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
161 }
162
163 spin_lock(&OCFS2_I(bm_inode)->ip_lock);
164 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
165 le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
166 spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
167 i_size_write(bm_inode, le64_to_cpu(fe->i_size));
168
169 ocfs2_journal_dirty(handle, bm_bh);
170
171out_rollback:
172 if (ret < 0) {
173 ocfs2_calc_new_backup_super(bm_inode,
174 group,
175 new_clusters,
176 first_new_cluster,
177 cl_cpg, 0);
178 le16_add_cpu(&group->bg_free_bits_count, backups);
179 le16_add_cpu(&group->bg_bits, -1 * num_bits);
180 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
181 }
182out:
183 mlog_exit(ret);
184 return ret;
185}
186
187static int update_backups(struct inode * inode, u32 clusters, char *data)
188{
189 int i, ret = 0;
190 u32 cluster;
191 u64 blkno;
192 struct buffer_head *backup = NULL;
193 struct ocfs2_dinode *backup_di = NULL;
194 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
195
196 /* calculate the real backups we need to update. */
197 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
198 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
199 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
200 if (cluster > clusters)
201 break;
202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
204 if (ret < 0) {
205 mlog_errno(ret);
206 break;
207 }
208
209 memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
210
211 backup_di = (struct ocfs2_dinode *)backup->b_data;
212 backup_di->i_blkno = cpu_to_le64(blkno);
213
214 ret = ocfs2_write_super_or_backup(osb, backup);
215 brelse(backup);
216 backup = NULL;
217 if (ret < 0) {
218 mlog_errno(ret);
219 break;
220 }
221 }
222
223 return ret;
224}
225
226static void ocfs2_update_super_and_backups(struct inode *inode,
227 int new_clusters)
228{
229 int ret;
230 u32 clusters = 0;
231 struct buffer_head *super_bh = NULL;
232 struct ocfs2_dinode *super_di = NULL;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234
235 /*
236 * update the superblock last.
237 * It doesn't matter if the write failed.
238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
240 &super_bh, 0, NULL);
241 if (ret < 0) {
242 mlog_errno(ret);
243 goto out;
244 }
245
246 super_di = (struct ocfs2_dinode *)super_bh->b_data;
247 le32_add_cpu(&super_di->i_clusters, new_clusters);
248 clusters = le32_to_cpu(super_di->i_clusters);
249
250 ret = ocfs2_write_super_or_backup(osb, super_bh);
251 if (ret < 0) {
252 mlog_errno(ret);
253 goto out;
254 }
255
256 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
257 ret = update_backups(inode, clusters, super_bh->b_data);
258
259out:
260 brelse(super_bh);
261 if (ret)
262 printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
263 " during fs resize. This condition is not fatal,"
264 " but fsck.ocfs2 should be run to fix it\n",
265 osb->dev_str);
266 return;
267}
268
269/*
270 * Extend the filesystem to the new number of clusters specified. This entry
271 * point is only used to extend the current filesystem to the end of the last
272 * existing group.
273 */
274int ocfs2_group_extend(struct inode * inode, int new_clusters)
275{
276 int ret;
277 handle_t *handle;
278 struct buffer_head *main_bm_bh = NULL;
279 struct buffer_head *group_bh = NULL;
280 struct inode *main_bm_inode = NULL;
281 struct ocfs2_dinode *fe = NULL;
282 struct ocfs2_group_desc *group = NULL;
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 u16 cl_bpc;
285 u32 first_new_cluster;
286 u64 lgd_blkno;
287
288 mlog_entry_void();
289
290 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
291 return -EROFS;
292
293 if (new_clusters < 0)
294 return -EINVAL;
295 else if (new_clusters == 0)
296 return 0;
297
298 main_bm_inode = ocfs2_get_system_file_inode(osb,
299 GLOBAL_BITMAP_SYSTEM_INODE,
300 OCFS2_INVALID_SLOT);
301 if (!main_bm_inode) {
302 ret = -EINVAL;
303 mlog_errno(ret);
304 goto out;
305 }
306
307 mutex_lock(&main_bm_inode->i_mutex);
308
309 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
310 if (ret < 0) {
311 mlog_errno(ret);
312 goto out_mutex;
313 }
314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. "
320 "Force to do offline resize.");
321 ret = -EINVAL;
322 goto out_unlock;
323 }
324
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1);
334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
336 main_bm_inode);
337 if (ret < 0) {
338 mlog_errno(ret);
339 goto out_unlock;
340 }
341
342 group = (struct ocfs2_group_desc *)group_bh->b_data;
343
344 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
345 if (ret) {
346 mlog_errno(ret);
347 goto out_unlock;
348 }
349
350 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
351 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
352 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
353 ret = -EINVAL;
354 goto out_unlock;
355 }
356
357 mlog(0, "extend the last group at %llu, new clusters = %d\n",
358 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
359
360 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
361 if (IS_ERR(handle)) {
362 mlog_errno(PTR_ERR(handle));
363 ret = -EINVAL;
364 goto out_unlock;
365 }
366
367 /* update the last group descriptor and inode. */
368 ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
369 main_bm_bh, group_bh,
370 first_new_cluster,
371 new_clusters);
372 if (ret) {
373 mlog_errno(ret);
374 goto out_commit;
375 }
376
377 ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
378
379out_commit:
380 ocfs2_commit_trans(osb, handle);
381out_unlock:
382 brelse(group_bh);
383 brelse(main_bm_bh);
384
385 ocfs2_inode_unlock(main_bm_inode, 1);
386
387out_mutex:
388 mutex_unlock(&main_bm_inode->i_mutex);
389 iput(main_bm_inode);
390
391out:
392 mlog_exit_void();
393 return ret;
394}
395
396static int ocfs2_check_new_group(struct inode *inode,
397 struct ocfs2_dinode *di,
398 struct ocfs2_new_group_input *input,
399 struct buffer_head *group_bh)
400{
401 int ret;
402 struct ocfs2_group_desc *gd;
403 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
404 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
405 le16_to_cpu(di->id2.i_chain.cl_bpc);
406
407
408 gd = (struct ocfs2_group_desc *)group_bh->b_data;
409
410 ret = -EIO;
411 if (!OCFS2_IS_VALID_GROUP_DESC(gd))
412 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
413 (unsigned long long)le64_to_cpu(gd->bg_blkno));
414 else if (di->i_blkno != gd->bg_parent_dinode)
415 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
416 "pointer (%llu, expected %llu)\n",
417 (unsigned long long)le64_to_cpu(gd->bg_blkno),
418 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
419 (unsigned long long)le64_to_cpu(di->i_blkno));
420 else if (le16_to_cpu(gd->bg_bits) > max_bits)
421 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
422 (unsigned long long)le64_to_cpu(gd->bg_blkno),
423 le16_to_cpu(gd->bg_bits));
424 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
425 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
426 "claims that %u are free\n",
427 (unsigned long long)le64_to_cpu(gd->bg_blkno),
428 le16_to_cpu(gd->bg_bits),
429 le16_to_cpu(gd->bg_free_bits_count));
430 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
431 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
432 "max bitmap bits of %u\n",
433 (unsigned long long)le64_to_cpu(gd->bg_blkno),
434 le16_to_cpu(gd->bg_bits),
435 8 * le16_to_cpu(gd->bg_size));
436 else if (le16_to_cpu(gd->bg_chain) != input->chain)
437 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
438 "while input has %u set.\n",
439 (unsigned long long)le64_to_cpu(gd->bg_blkno),
440 le16_to_cpu(gd->bg_chain), input->chain);
441 else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
442 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
443 "input has %u clusters set\n",
444 (unsigned long long)le64_to_cpu(gd->bg_blkno),
445 le16_to_cpu(gd->bg_bits), input->clusters);
446 else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
447 mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
448 "but it should have %u set\n",
449 (unsigned long long)le64_to_cpu(gd->bg_blkno),
450 le16_to_cpu(gd->bg_bits),
451 input->frees * cl_bpc);
452 else
453 ret = 0;
454
455 return ret;
456}
457
458static int ocfs2_verify_group_and_input(struct inode *inode,
459 struct ocfs2_dinode *di,
460 struct ocfs2_new_group_input *input,
461 struct buffer_head *group_bh)
462{
463 u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
464 u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
465 u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
466 u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
467 u32 total_clusters = le32_to_cpu(di->i_clusters);
468 int ret = -EINVAL;
469
470 if (cluster < total_clusters)
471 mlog(ML_ERROR, "add a group which is in the current volume.\n");
472 else if (input->chain >= cl_count)
473 mlog(ML_ERROR, "input chain exceeds the limit.\n");
474 else if (next_free != cl_count && next_free != input->chain)
475 mlog(ML_ERROR,
476 "the add group should be in chain %u\n", next_free);
477 else if (total_clusters + input->clusters < total_clusters)
478 mlog(ML_ERROR, "add group's clusters overflow.\n");
479 else if (input->clusters > cl_cpg)
480 mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
481 else if (input->frees > input->clusters)
482 mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
483 else if (total_clusters % cl_cpg != 0)
484 mlog(ML_ERROR,
485 "the last group isn't full. Use group extend first.\n");
486 else if (input->group != ocfs2_which_cluster_group(inode, cluster))
487 mlog(ML_ERROR, "group blkno is invalid\n");
488 else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
489 mlog(ML_ERROR, "group descriptor check failed.\n");
490 else
491 ret = 0;
492
493 return ret;
494}
495
496/* Add a new group descriptor to global_bitmap. */
497int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
498{
499 int ret;
500 handle_t *handle;
501 struct buffer_head *main_bm_bh = NULL;
502 struct inode *main_bm_inode = NULL;
503 struct ocfs2_dinode *fe = NULL;
504 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
505 struct buffer_head *group_bh = NULL;
506 struct ocfs2_group_desc *group = NULL;
507 struct ocfs2_chain_list *cl;
508 struct ocfs2_chain_rec *cr;
509 u16 cl_bpc;
510
511 mlog_entry_void();
512
513 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
514 return -EROFS;
515
516 main_bm_inode = ocfs2_get_system_file_inode(osb,
517 GLOBAL_BITMAP_SYSTEM_INODE,
518 OCFS2_INVALID_SLOT);
519 if (!main_bm_inode) {
520 ret = -EINVAL;
521 mlog_errno(ret);
522 goto out;
523 }
524
525 mutex_lock(&main_bm_inode->i_mutex);
526
527 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
528 if (ret < 0) {
529 mlog_errno(ret);
530 goto out_mutex;
531 }
532
533 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
534
535 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
536 ocfs2_group_bitmap_size(osb->sb) * 8) {
537 mlog(ML_ERROR, "The disk is too old and small."
538 " Force to do offline resize.");
539 ret = -EINVAL;
540 goto out_unlock;
541 }
542
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
544 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group);
547 goto out_unlock;
548 }
549
550 ocfs2_set_new_buffer_uptodate(inode, group_bh);
551
552 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
553 if (ret) {
554 mlog_errno(ret);
555 goto out_unlock;
556 }
557
558 mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
559 (unsigned long long)input->group, input->chain, input->clusters);
560
561 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
562 if (IS_ERR(handle)) {
563 mlog_errno(PTR_ERR(handle));
564 ret = -EINVAL;
565 goto out_unlock;
566 }
567
568 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
569 cl = &fe->id2.i_chain;
570 cr = &cl->cl_recs[input->chain];
571
572 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
573 OCFS2_JOURNAL_ACCESS_WRITE);
574 if (ret < 0) {
575 mlog_errno(ret);
576 goto out_commit;
577 }
578
579 group = (struct ocfs2_group_desc *)group_bh->b_data;
580 group->bg_next_group = cr->c_blkno;
581
582 ret = ocfs2_journal_dirty(handle, group_bh);
583 if (ret < 0) {
584 mlog_errno(ret);
585 goto out_commit;
586 }
587
588 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
589 OCFS2_JOURNAL_ACCESS_WRITE);
590 if (ret < 0) {
591 mlog_errno(ret);
592 goto out_commit;
593 }
594
595 if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
596 le16_add_cpu(&cl->cl_next_free_rec, 1);
597 memset(cr, 0, sizeof(struct ocfs2_chain_rec));
598 }
599
600 cr->c_blkno = le64_to_cpu(input->group);
601 le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
602 le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
603
604 le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
605 le32_add_cpu(&fe->id1.bitmap1.i_used,
606 (input->clusters - input->frees) * cl_bpc);
607 le32_add_cpu(&fe->i_clusters, input->clusters);
608
609 ocfs2_journal_dirty(handle, main_bm_bh);
610
611 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
612 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
613 le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
614 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
615 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
616
617 ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
618
619out_commit:
620 ocfs2_commit_trans(osb, handle);
621out_unlock:
622 brelse(group_bh);
623 brelse(main_bm_bh);
624
625 ocfs2_inode_unlock(main_bm_inode, 1);
626
627out_mutex:
628 mutex_unlock(&main_bm_inode->i_mutex);
629 iput(main_bm_inode);
630
631out:
632 mlog_exit_void();
633 return ret;
634}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..f38841abf10b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_RESIZE_H
27#define OCFS2_RESIZE_H
28
29int ocfs2_group_extend(struct inode * inode, int new_clusters);
30int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
31
32#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
48 s16 slot_num, 48 s16 slot_num,
49 s16 node_num); 49 s16 node_num);
50 50
51/* Use the slot information we've collected to create a map of mounted
52 * nodes. Should be holding an EX on super block. assumes slot info is
53 * up to date. Note that we call this *after* we find a slot, so our
54 * own node should be set in the map too... */
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
56{
57 int i;
58 struct ocfs2_slot_info *si = osb->slot_info;
59
60 spin_lock(&si->si_lock);
61
62 for (i = 0; i < si->si_size; i++)
63 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
64 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
65 si->si_global_node_nums[i]);
66
67 spin_unlock(&si->si_lock);
68}
69
70/* post the slot information on disk into our slot_info struct. */ 51/* post the slot information on disk into our slot_info struct. */
71void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
72{ 53{
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
52void ocfs2_clear_slot(struct ocfs2_slot_info *si, 52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num); 53 s16 slot_num);
54 54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num) 56 int slot_num)
59{ 57{
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
102 u64 bg_blkno, 102 u64 bg_blkno,
103 u16 bg_bit_off); 103 u16 bg_bit_off);
104static inline u64 ocfs2_which_cluster_group(struct inode *inode,
105 u32 cluster);
106static inline void ocfs2_block_to_cluster_group(struct inode *inode, 104static inline void ocfs2_block_to_cluster_group(struct inode *inode,
107 u64 data_blkno, 105 u64 data_blkno,
108 u64 *bg_blkno, 106 u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
114 112
115 if (inode) { 113 if (inode) {
116 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 114 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
117 ocfs2_meta_unlock(inode, 1); 115 ocfs2_inode_unlock(inode, 1);
118 116
119 mutex_unlock(&inode->i_mutex); 117 mutex_unlock(&inode->i_mutex);
120 118
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
131} 129}
132 130
133/* somewhat more expensive than our other checks, so use sparingly. */ 131/* somewhat more expensive than our other checks, so use sparingly. */
134static int ocfs2_check_group_descriptor(struct super_block *sb, 132int ocfs2_check_group_descriptor(struct super_block *sb,
135 struct ocfs2_dinode *di, 133 struct ocfs2_dinode *di,
136 struct ocfs2_group_desc *gd) 134 struct ocfs2_group_desc *gd)
137{ 135{
138 unsigned int max_bits; 136 unsigned int max_bits;
139 137
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
412 410
413 mutex_lock(&alloc_inode->i_mutex); 411 mutex_lock(&alloc_inode->i_mutex);
414 412
415 status = ocfs2_meta_lock(alloc_inode, &bh, 1); 413 status = ocfs2_inode_lock(alloc_inode, &bh, 1);
416 if (status < 0) { 414 if (status < 0) {
417 mutex_unlock(&alloc_inode->i_mutex); 415 mutex_unlock(&alloc_inode->i_mutex);
418 iput(alloc_inode); 416 iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1443 1441
1444/* given a cluster offset, calculate which block group it belongs to 1442/* given a cluster offset, calculate which block group it belongs to
1445 * and return that block offset. */ 1443 * and return that block offset. */
1446static inline u64 ocfs2_which_cluster_group(struct inode *inode, 1444u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1447 u32 cluster)
1448{ 1445{
1449 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1450 u32 group_no; 1447 u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1519 if (min_clusters > (osb->bitmap_cpg - 1)) { 1516 if (min_clusters > (osb->bitmap_cpg - 1)) {
1520 /* The only paths asking for contiguousness 1517 /* The only paths asking for contiguousness
1521 * should know about this already. */ 1518 * should know about this already. */
1522 mlog(ML_ERROR, "minimum allocation requested exceeds " 1519 mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1523 "group bitmap size!"); 1520 "group bitmap size %u!\n", min_clusters,
1521 osb->bitmap_cpg);
1524 status = -ENOSPC; 1522 status = -ENOSPC;
1525 goto bail; 1523 goto bail;
1526 } 1524 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
148 struct ocfs2_alloc_context *ac); 148 struct ocfs2_alloc_context *ac);
149 149
150/* given a cluster offset, calculate which block group it belongs to
151 * and return that block offset. */
152u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
153
154/* somewhat more expensive than our other checks, so use sparingly. */
155int ocfs2_check_group_descriptor(struct super_block *sb,
156 struct ocfs2_dinode *di,
157 struct ocfs2_group_desc *gd);
150#endif /* _CHAINALLOC_H_ */ 158#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5ee775420665..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
65#include "sysfile.h" 65#include "sysfile.h"
66#include "uptodate.h" 66#include "uptodate.h"
67#include "ver.h" 67#include "ver.h"
68#include "vote.h"
69 68
70#include "buffer_head_io.h" 69#include "buffer_head_io.h"
71 70
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
84 83
85struct mount_options 84struct mount_options
86{ 85{
86 unsigned long commit_interval;
87 unsigned long mount_opt; 87 unsigned long mount_opt;
88 unsigned int atime_quantum; 88 unsigned int atime_quantum;
89 signed short slot; 89 signed short slot;
90 unsigned int localalloc_opt;
90}; 91};
91 92
92static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
150 Opt_data_writeback, 151 Opt_data_writeback,
151 Opt_atime_quantum, 152 Opt_atime_quantum,
152 Opt_slot, 153 Opt_slot,
154 Opt_commit,
155 Opt_localalloc,
156 Opt_localflocks,
153 Opt_err, 157 Opt_err,
154}; 158};
155 159
@@ -165,6 +169,9 @@ static match_table_t tokens = {
165 {Opt_data_writeback, "data=writeback"}, 169 {Opt_data_writeback, "data=writeback"},
166 {Opt_atime_quantum, "atime_quantum=%u"}, 170 {Opt_atime_quantum, "atime_quantum=%u"},
167 {Opt_slot, "preferred_slot=%u"}, 171 {Opt_slot, "preferred_slot=%u"},
172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"},
168 {Opt_err, NULL} 175 {Opt_err, NULL}
169}; 176};
170 177
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
213 220
214 mlog_entry_void(); 221 mlog_entry_void();
215 222
216 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); 223 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
217 if (IS_ERR(new)) { 224 if (IS_ERR(new)) {
218 status = PTR_ERR(new); 225 status = PTR_ERR(new);
219 mlog_errno(status); 226 mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
221 } 228 }
222 osb->root_inode = new; 229 osb->root_inode = new;
223 230
224 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); 231 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
225 if (IS_ERR(new)) { 232 if (IS_ERR(new)) {
226 status = PTR_ERR(new); 233 status = PTR_ERR(new);
227 mlog_errno(status); 234 mlog_errno(status);
@@ -443,6 +450,8 @@ unlock_osb:
443 osb->s_mount_opt = parsed_options.mount_opt; 450 osb->s_mount_opt = parsed_options.mount_opt;
444 osb->s_atime_quantum = parsed_options.atime_quantum; 451 osb->s_atime_quantum = parsed_options.atime_quantum;
445 osb->preferred_slot = parsed_options.slot; 452 osb->preferred_slot = parsed_options.slot;
453 if (parsed_options.commit_interval)
454 osb->osb_commit_interval = parsed_options.commit_interval;
446 455
447 if (!ocfs2_is_hard_readonly(osb)) 456 if (!ocfs2_is_hard_readonly(osb))
448 ocfs2_set_journal_params(osb); 457 ocfs2_set_journal_params(osb);
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
597 osb->s_mount_opt = parsed_options.mount_opt; 606 osb->s_mount_opt = parsed_options.mount_opt;
598 osb->s_atime_quantum = parsed_options.atime_quantum; 607 osb->s_atime_quantum = parsed_options.atime_quantum;
599 osb->preferred_slot = parsed_options.slot; 608 osb->preferred_slot = parsed_options.slot;
609 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt;
600 611
601 sb->s_magic = OCFS2_SUPER_MAGIC; 612 sb->s_magic = OCFS2_SUPER_MAGIC;
602 613
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
747 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 758 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
748 options ? options : "(none)"); 759 options ? options : "(none)");
749 760
761 mopt->commit_interval = 0;
750 mopt->mount_opt = 0; 762 mopt->mount_opt = 0;
751 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
752 mopt->slot = OCFS2_INVALID_SLOT; 764 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
753 766
754 if (!options) { 767 if (!options) {
755 status = 1; 768 status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
816 if (option) 829 if (option)
817 mopt->slot = (s16)option; 830 mopt->slot = (s16)option;
818 break; 831 break;
832 case Opt_commit:
833 option = 0;
834 if (match_int(&args[0], &option)) {
835 status = 0;
836 goto bail;
837 }
838 if (option < 0)
839 return 0;
840 if (option == 0)
841 option = JBD_DEFAULT_MAX_COMMIT_AGE;
842 mopt->commit_interval = HZ * option;
843 break;
844 case Opt_localalloc:
845 option = 0;
846 if (match_int(&args[0], &option)) {
847 status = 0;
848 goto bail;
849 }
850 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
851 mopt->localalloc_opt = option;
852 break;
853 case Opt_localflocks:
854 /*
855 * Changing this during remount could race
856 * flock() requests, or "unbalance" existing
857 * ones (e.g., a lock is taken in one mode but
858 * dropped in the other). If users care enough
859 * to flip locking modes during remount, we
860 * could add a "local" flag to individual
861 * flock structures for proper tracking of
862 * state.
863 */
864 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break;
819 default: 867 default:
820 mlog(ML_ERROR, 868 mlog(ML_ERROR,
821 "Unrecognized mount option \"%s\" " 869 "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 912 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 913 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866 914
915 if (osb->osb_commit_interval)
916 seq_printf(s, ",commit=%u",
917 (unsigned) (osb->osb_commit_interval / HZ));
918
919 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
920 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
921
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,");
924
867 return 0; 925 return 0;
868} 926}
869 927
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
965 goto bail; 1023 goto bail;
966 } 1024 }
967 1025
968 status = ocfs2_meta_lock(inode, &bh, 0); 1026 status = ocfs2_inode_lock(inode, &bh, 0);
969 if (status < 0) { 1027 if (status < 0) {
970 mlog_errno(status); 1028 mlog_errno(status);
971 goto bail; 1029 goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
989 1047
990 brelse(bh); 1048 brelse(bh);
991 1049
992 ocfs2_meta_unlock(inode, 0); 1050 ocfs2_inode_unlock(inode, 0);
993 status = 0; 1051 status = 0;
994bail: 1052bail:
995 if (inode) 1053 if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
1020 oi->ip_clusters = 0; 1078 oi->ip_clusters = 0;
1021 1079
1022 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1080 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1023 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 1081 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1024 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
1025 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1082 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1026 1083
1027 ocfs2_metadata_cache_init(&oi->vfs_inode); 1084 ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
1117 goto leave; 1174 goto leave;
1118 } 1175 }
1119 1176
1120 status = ocfs2_register_hb_callbacks(osb);
1121 if (status < 0) {
1122 mlog_errno(status);
1123 goto leave;
1124 }
1125
1126 status = ocfs2_dlm_init(osb); 1177 status = ocfs2_dlm_init(osb);
1127 if (status < 0) { 1178 if (status < 0) {
1128 mlog_errno(status); 1179 mlog_errno(status);
1129 goto leave; 1180 goto leave;
1130 } 1181 }
1131 1182
1132 /* requires vote_thread to be running. */
1133 status = ocfs2_register_net_handlers(osb);
1134 if (status < 0) {
1135 mlog_errno(status);
1136 goto leave;
1137 }
1138
1139 status = ocfs2_super_lock(osb, 1); 1183 status = ocfs2_super_lock(osb, 1);
1140 if (status < 0) { 1184 if (status < 0) {
1141 mlog_errno(status); 1185 mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1150 goto leave; 1194 goto leave;
1151 } 1195 }
1152 1196
1153 ocfs2_populate_mounted_map(osb);
1154
1155 /* load all node-local system inodes */ 1197 /* load all node-local system inodes */
1156 status = ocfs2_init_local_system_inodes(osb); 1198 status = ocfs2_init_local_system_inodes(osb);
1157 if (status < 0) { 1199 if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1174 if (ocfs2_mount_local(osb)) 1216 if (ocfs2_mount_local(osb))
1175 goto leave; 1217 goto leave;
1176 1218
1177 /* This should be sent *after* we recovered our journal as it
1178 * will cause other nodes to unmark us as needing
1179 * recovery. However, we need to send it *before* dropping the
1180 * super block lock as otherwise their recovery threads might
1181 * try to clean us up while we're live! */
1182 status = ocfs2_request_mount_vote(osb);
1183 if (status < 0)
1184 mlog_errno(status);
1185
1186leave: 1219leave:
1187 if (unlock_super) 1220 if (unlock_super)
1188 ocfs2_super_unlock(osb, 1); 1221 ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1240 mlog_errno(tmp); 1273 mlog_errno(tmp);
1241 return; 1274 return;
1242 } 1275 }
1243
1244 tmp = ocfs2_request_umount_vote(osb);
1245 if (tmp < 0)
1246 mlog_errno(tmp);
1247 } 1276 }
1248 1277
1249 if (osb->slot_num != OCFS2_INVALID_SLOT) 1278 if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1254 1283
1255 ocfs2_release_system_inodes(osb); 1284 ocfs2_release_system_inodes(osb);
1256 1285
1257 if (osb->dlm) { 1286 if (osb->dlm)
1258 ocfs2_unregister_net_handlers(osb);
1259
1260 ocfs2_dlm_shutdown(osb); 1287 ocfs2_dlm_shutdown(osb);
1261 }
1262
1263 ocfs2_clear_hb_callbacks(osb);
1264 1288
1265 debugfs_remove(osb->osb_debug_root); 1289 debugfs_remove(osb->osb_debug_root);
1266 1290
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1315 int i, cbits, bbits; 1339 int i, cbits, bbits;
1316 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1340 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1317 struct inode *inode = NULL; 1341 struct inode *inode = NULL;
1318 struct buffer_head *bitmap_bh = NULL;
1319 struct ocfs2_journal *journal; 1342 struct ocfs2_journal *journal;
1320 __le32 uuid_net_key; 1343 __le32 uuid_net_key;
1321 struct ocfs2_super *osb; 1344 struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
1344 osb->s_sectsize_bits = blksize_bits(sector_size); 1367 osb->s_sectsize_bits = blksize_bits(sector_size);
1345 BUG_ON(!osb->s_sectsize_bits); 1368 BUG_ON(!osb->s_sectsize_bits);
1346 1369
1347 osb->net_response_ids = 0;
1348 spin_lock_init(&osb->net_response_lock);
1349 INIT_LIST_HEAD(&osb->net_response_list);
1350
1351 INIT_LIST_HEAD(&osb->osb_net_handlers);
1352 init_waitqueue_head(&osb->recovery_event); 1370 init_waitqueue_head(&osb->recovery_event);
1353 spin_lock_init(&osb->vote_task_lock); 1371 spin_lock_init(&osb->dc_task_lock);
1354 init_waitqueue_head(&osb->vote_event); 1372 init_waitqueue_head(&osb->dc_event);
1355 osb->vote_work_sequence = 0; 1373 osb->dc_work_sequence = 0;
1356 osb->vote_wake_sequence = 0; 1374 osb->dc_wake_sequence = 0;
1357 INIT_LIST_HEAD(&osb->blocked_lock_list); 1375 INIT_LIST_HEAD(&osb->blocked_lock_list);
1358 osb->blocked_lock_count = 0; 1376 osb->blocked_lock_count = 0;
1359 INIT_LIST_HEAD(&osb->vote_list);
1360 spin_lock_init(&osb->osb_lock); 1377 spin_lock_init(&osb->osb_lock);
1361 1378
1362 atomic_set(&osb->alloc_stats.moves, 0); 1379 atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1496 } 1513 }
1497 1514
1498 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); 1515 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
1499 osb->net_key = le32_to_cpu(uuid_net_key);
1500 1516
1501 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 1517 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1502 osb->vol_label[63] = '\0'; 1518 osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1539 } 1555 }
1540 1556
1541 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1557 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1542
1543 /* We don't have a cluster lock on the bitmap here because
1544 * we're only interested in static information and the extra
1545 * complexity at mount time isn't worht it. Don't pass the
1546 * inode in to the read function though as we don't want it to
1547 * be put in the cache. */
1548 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1549 NULL);
1550 iput(inode); 1558 iput(inode);
1551 if (status < 0) {
1552 mlog_errno(status);
1553 goto bail;
1554 }
1555 1559
1556 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1560 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
1557 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1558 brelse(bitmap_bh);
1559 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
1560 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
1561 1561
1562 status = ocfs2_init_slot_info(osb); 1562 status = ocfs2_init_slot_info(osb);
1563 if (status < 0) { 1563 if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
112 goto bail; 112 goto bail;
113 } 113 }
114 114
115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); 115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
116 if (IS_ERR(inode)) { 116 if (IS_ERR(inode)) {
117 mlog_errno(PTR_ERR(inode)); 117 mlog_errno(PTR_ERR(inode));
118 inode = NULL; 118 inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
29 29
30#include "ver.h" 30#include "ver.h"
31 31
32#define OCFS2_BUILD_VERSION "1.3.3" 32#define OCFS2_BUILD_VERSION "1.5.0"
33 33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION 34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35 35
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * description here
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/kthread.h>
30
31#include <cluster/heartbeat.h>
32#include <cluster/nodemanager.h>
33#include <cluster/tcp.h>
34
35#include <dlm/dlmapi.h>
36
37#define MLOG_MASK_PREFIX ML_VOTE
38#include <cluster/masklog.h>
39
40#include "ocfs2.h"
41
42#include "alloc.h"
43#include "dlmglue.h"
44#include "extent_map.h"
45#include "heartbeat.h"
46#include "inode.h"
47#include "journal.h"
48#include "slot_map.h"
49#include "vote.h"
50
51#include "buffer_head_io.h"
52
53#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
54#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
55struct ocfs2_msg_hdr
56{
57 __be32 h_response_id; /* used to lookup message handle on sending
58 * node. */
59 __be32 h_request;
60 __be64 h_blkno;
61 __be32 h_generation;
62 __be32 h_node_num; /* node sending this particular message. */
63};
64
65struct ocfs2_vote_msg
66{
67 struct ocfs2_msg_hdr v_hdr;
68 __be32 v_reserved1;
69} __attribute__ ((packed));
70
71/* Responses are given these values to maintain backwards
72 * compatibility with older ocfs2 versions */
73#define OCFS2_RESPONSE_OK (0)
74#define OCFS2_RESPONSE_BUSY (-16)
75#define OCFS2_RESPONSE_BAD_MSG (-22)
76
77struct ocfs2_response_msg
78{
79 struct ocfs2_msg_hdr r_hdr;
80 __be32 r_response;
81} __attribute__ ((packed));
82
83struct ocfs2_vote_work {
84 struct list_head w_list;
85 struct ocfs2_vote_msg w_msg;
86};
87
88enum ocfs2_vote_request {
89 OCFS2_VOTE_REQ_INVALID = 0,
90 OCFS2_VOTE_REQ_MOUNT,
91 OCFS2_VOTE_REQ_UMOUNT,
92 OCFS2_VOTE_REQ_LAST
93};
94
95static inline int ocfs2_is_valid_vote_request(int request)
96{
97 return OCFS2_VOTE_REQ_INVALID < request &&
98 request < OCFS2_VOTE_REQ_LAST;
99}
100
101typedef void (*ocfs2_net_response_callback)(void *priv,
102 struct ocfs2_response_msg *resp);
103struct ocfs2_net_response_cb {
104 ocfs2_net_response_callback rc_cb;
105 void *rc_priv;
106};
107
108struct ocfs2_net_wait_ctxt {
109 struct list_head n_list;
110 u32 n_response_id;
111 wait_queue_head_t n_event;
112 struct ocfs2_node_map n_node_map;
113 int n_response; /* an agreggate response. 0 if
114 * all nodes are go, < 0 on any
115 * negative response from any
116 * node or network error. */
117 struct ocfs2_net_response_cb *n_callback;
118};
119
120static void ocfs2_process_mount_request(struct ocfs2_super *osb,
121 unsigned int node_num)
122{
123 mlog(0, "MOUNT vote from node %u\n", node_num);
124 /* The other node only sends us this message when he has an EX
125 * on the superblock, so our recovery threads (if having been
126 * launched) are waiting on it.*/
127 ocfs2_recovery_map_clear(osb, node_num);
128 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
129
130 /* We clear the umount map here because a node may have been
131 * previously mounted, safely unmounted but never stopped
132 * heartbeating - in which case we'd have a stale entry. */
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136static void ocfs2_process_umount_request(struct ocfs2_super *osb,
137 unsigned int node_num)
138{
139 mlog(0, "UMOUNT vote from node %u\n", node_num);
140 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
141 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
142}
143
144static void ocfs2_process_vote(struct ocfs2_super *osb,
145 struct ocfs2_vote_msg *msg)
146{
147 int net_status, vote_response;
148 unsigned int node_num;
149 u64 blkno;
150 enum ocfs2_vote_request request;
151 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
152 struct ocfs2_response_msg response;
153
154 /* decode the network mumbo jumbo into local variables. */
155 request = be32_to_cpu(hdr->h_request);
156 blkno = be64_to_cpu(hdr->h_blkno);
157 node_num = be32_to_cpu(hdr->h_node_num);
158
159 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
160 request, (unsigned long long)blkno, node_num);
161
162 if (!ocfs2_is_valid_vote_request(request)) {
163 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
164 request, node_num);
165 vote_response = OCFS2_RESPONSE_BAD_MSG;
166 goto respond;
167 }
168
169 vote_response = OCFS2_RESPONSE_OK;
170
171 switch (request) {
172 case OCFS2_VOTE_REQ_UMOUNT:
173 ocfs2_process_umount_request(osb, node_num);
174 goto respond;
175 case OCFS2_VOTE_REQ_MOUNT:
176 ocfs2_process_mount_request(osb, node_num);
177 goto respond;
178 default:
179 /* avoids a gcc warning */
180 break;
181 }
182
183respond:
184 /* Response struture is small so we just put it on the stack
185 * and stuff it inline. */
186 memset(&response, 0, sizeof(struct ocfs2_response_msg));
187 response.r_hdr.h_response_id = hdr->h_response_id;
188 response.r_hdr.h_blkno = hdr->h_blkno;
189 response.r_hdr.h_generation = hdr->h_generation;
190 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
191 response.r_response = cpu_to_be32(vote_response);
192
193 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
194 osb->net_key,
195 &response,
196 sizeof(struct ocfs2_response_msg),
197 node_num,
198 NULL);
199 /* We still want to error print for ENOPROTOOPT here. The
200 * sending node shouldn't have unregistered his net handler
201 * without sending an unmount vote 1st */
202 if (net_status < 0
203 && net_status != -ETIMEDOUT
204 && net_status != -ENOTCONN)
205 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
206 node_num, net_status);
207}
208
209static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
210{
211 unsigned long processed;
212 struct ocfs2_lock_res *lockres;
213 struct ocfs2_vote_work *work;
214
215 mlog_entry_void();
216
217 spin_lock(&osb->vote_task_lock);
218 /* grab this early so we know to try again if a state change and
219 * wake happens part-way through our work */
220 osb->vote_work_sequence = osb->vote_wake_sequence;
221
222 processed = osb->blocked_lock_count;
223 while (processed) {
224 BUG_ON(list_empty(&osb->blocked_lock_list));
225
226 lockres = list_entry(osb->blocked_lock_list.next,
227 struct ocfs2_lock_res, l_blocked_list);
228 list_del_init(&lockres->l_blocked_list);
229 osb->blocked_lock_count--;
230 spin_unlock(&osb->vote_task_lock);
231
232 BUG_ON(!processed);
233 processed--;
234
235 ocfs2_process_blocked_lock(osb, lockres);
236
237 spin_lock(&osb->vote_task_lock);
238 }
239
240 while (osb->vote_count) {
241 BUG_ON(list_empty(&osb->vote_list));
242 work = list_entry(osb->vote_list.next,
243 struct ocfs2_vote_work, w_list);
244 list_del(&work->w_list);
245 osb->vote_count--;
246 spin_unlock(&osb->vote_task_lock);
247
248 ocfs2_process_vote(osb, &work->w_msg);
249 kfree(work);
250
251 spin_lock(&osb->vote_task_lock);
252 }
253 spin_unlock(&osb->vote_task_lock);
254
255 mlog_exit_void();
256}
257
258static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
259{
260 int empty = 0;
261
262 spin_lock(&osb->vote_task_lock);
263 if (list_empty(&osb->blocked_lock_list) &&
264 list_empty(&osb->vote_list))
265 empty = 1;
266
267 spin_unlock(&osb->vote_task_lock);
268 return empty;
269}
270
271static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
272{
273 int should_wake = 0;
274
275 spin_lock(&osb->vote_task_lock);
276 if (osb->vote_work_sequence != osb->vote_wake_sequence)
277 should_wake = 1;
278 spin_unlock(&osb->vote_task_lock);
279
280 return should_wake;
281}
282
283int ocfs2_vote_thread(void *arg)
284{
285 int status = 0;
286 struct ocfs2_super *osb = arg;
287
288 /* only quit once we've been asked to stop and there is no more
289 * work available */
290 while (!(kthread_should_stop() &&
291 ocfs2_vote_thread_lists_empty(osb))) {
292
293 wait_event_interruptible(osb->vote_event,
294 ocfs2_vote_thread_should_wake(osb) ||
295 kthread_should_stop());
296
297 mlog(0, "vote_thread: awoken\n");
298
299 ocfs2_vote_thread_do_work(osb);
300 }
301
302 osb->vote_task = NULL;
303 return status;
304}
305
306static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
307{
308 struct ocfs2_net_wait_ctxt *w;
309
310 w = kzalloc(sizeof(*w), GFP_NOFS);
311 if (!w) {
312 mlog_errno(-ENOMEM);
313 goto bail;
314 }
315
316 INIT_LIST_HEAD(&w->n_list);
317 init_waitqueue_head(&w->n_event);
318 ocfs2_node_map_init(&w->n_node_map);
319 w->n_response_id = response_id;
320 w->n_callback = NULL;
321bail:
322 return w;
323}
324
325static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
326{
327 unsigned int ret;
328
329 spin_lock(&osb->net_response_lock);
330 ret = ++osb->net_response_ids;
331 spin_unlock(&osb->net_response_lock);
332
333 return ret;
334}
335
336static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
337 struct ocfs2_net_wait_ctxt *w)
338{
339 spin_lock(&osb->net_response_lock);
340 list_del(&w->n_list);
341 spin_unlock(&osb->net_response_lock);
342}
343
344static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
345 struct ocfs2_net_wait_ctxt *w)
346{
347 spin_lock(&osb->net_response_lock);
348 list_add_tail(&w->n_list,
349 &osb->net_response_list);
350 spin_unlock(&osb->net_response_lock);
351}
352
353static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
354 struct ocfs2_net_wait_ctxt *w,
355 int node_num)
356{
357 assert_spin_locked(&osb->net_response_lock);
358
359 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
360 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
361 wake_up(&w->n_event);
362}
363
364/* Intended to be called from the node down callback, we fake remove
365 * the node from all our response contexts */
366void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
367 int node_num)
368{
369 struct list_head *p;
370 struct ocfs2_net_wait_ctxt *w = NULL;
371
372 spin_lock(&osb->net_response_lock);
373
374 list_for_each(p, &osb->net_response_list) {
375 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
376
377 __ocfs2_mark_node_responded(osb, w, node_num);
378 }
379
380 spin_unlock(&osb->net_response_lock);
381}
382
383static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
384 struct ocfs2_vote_msg *request,
385 unsigned int response_id,
386 int *response,
387 struct ocfs2_net_response_cb *callback)
388{
389 int status, i, remote_err;
390 struct ocfs2_net_wait_ctxt *w = NULL;
391 int dequeued = 0;
392
393 mlog_entry_void();
394
395 w = ocfs2_new_net_wait_ctxt(response_id);
396 if (!w) {
397 status = -ENOMEM;
398 mlog_errno(status);
399 goto bail;
400 }
401 w->n_callback = callback;
402
403 /* we're pretty much ready to go at this point, and this fills
404 * in n_response which we need anyway... */
405 ocfs2_queue_net_wait_ctxt(osb, w);
406
407 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
408
409 while (i != O2NM_INVALID_NODE_NUM) {
410 if (i != osb->node_num) {
411 mlog(0, "trying to send request to node %i\n", i);
412 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
413
414 remote_err = 0;
415 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
416 osb->net_key,
417 request,
418 sizeof(*request),
419 i,
420 &remote_err);
421 if (status == -ETIMEDOUT) {
422 mlog(0, "remote node %d timed out!\n", i);
423 status = -EAGAIN;
424 goto bail;
425 }
426 if (remote_err < 0) {
427 status = remote_err;
428 mlog(0, "remote error %d on node %d!\n",
429 remote_err, i);
430 mlog_errno(status);
431 goto bail;
432 }
433 if (status < 0) {
434 mlog_errno(status);
435 goto bail;
436 }
437 }
438 i++;
439 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
440 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
441 }
442 mlog(0, "done sending, now waiting on responses...\n");
443
444 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
445
446 ocfs2_dequeue_net_wait_ctxt(osb, w);
447 dequeued = 1;
448
449 *response = w->n_response;
450 status = 0;
451bail:
452 if (w) {
453 if (!dequeued)
454 ocfs2_dequeue_net_wait_ctxt(osb, w);
455 kfree(w);
456 }
457
458 mlog_exit(status);
459 return status;
460}
461
462static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
463 u64 blkno,
464 unsigned int generation,
465 enum ocfs2_vote_request type)
466{
467 struct ocfs2_vote_msg *request;
468 struct ocfs2_msg_hdr *hdr;
469
470 BUG_ON(!ocfs2_is_valid_vote_request(type));
471
472 request = kzalloc(sizeof(*request), GFP_NOFS);
473 if (!request) {
474 mlog_errno(-ENOMEM);
475 } else {
476 hdr = &request->v_hdr;
477 hdr->h_node_num = cpu_to_be32(osb->node_num);
478 hdr->h_request = cpu_to_be32(type);
479 hdr->h_blkno = cpu_to_be64(blkno);
480 hdr->h_generation = cpu_to_be32(generation);
481 }
482
483 return request;
484}
485
486/* Complete the buildup of a new vote request and process the
487 * broadcast return value. */
488static int ocfs2_do_request_vote(struct ocfs2_super *osb,
489 struct ocfs2_vote_msg *request,
490 struct ocfs2_net_response_cb *callback)
491{
492 int status, response = -EBUSY;
493 unsigned int response_id;
494 struct ocfs2_msg_hdr *hdr;
495
496 response_id = ocfs2_new_response_id(osb);
497
498 hdr = &request->v_hdr;
499 hdr->h_response_id = cpu_to_be32(response_id);
500
501 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
502 callback);
503 if (status < 0) {
504 mlog_errno(status);
505 goto bail;
506 }
507
508 status = response;
509bail:
510
511 return status;
512}
513
514int ocfs2_request_mount_vote(struct ocfs2_super *osb)
515{
516 int status;
517 struct ocfs2_vote_msg *request = NULL;
518
519 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
520 if (!request) {
521 status = -ENOMEM;
522 goto bail;
523 }
524
525 status = -EAGAIN;
526 while (status == -EAGAIN) {
527 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
528 signal_pending(current)) {
529 status = -ERESTARTSYS;
530 goto bail;
531 }
532
533 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
534 osb->node_num)) {
535 status = 0;
536 goto bail;
537 }
538
539 status = ocfs2_do_request_vote(osb, request, NULL);
540 }
541
542bail:
543 kfree(request);
544 return status;
545}
546
547int ocfs2_request_umount_vote(struct ocfs2_super *osb)
548{
549 int status;
550 struct ocfs2_vote_msg *request = NULL;
551
552 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
553 if (!request) {
554 status = -ENOMEM;
555 goto bail;
556 }
557
558 status = -EAGAIN;
559 while (status == -EAGAIN) {
560 /* Do not check signals on this vote... We really want
561 * this one to go all the way through. */
562
563 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
564 osb->node_num)) {
565 status = 0;
566 goto bail;
567 }
568
569 status = ocfs2_do_request_vote(osb, request, NULL);
570 }
571
572bail:
573 kfree(request);
574 return status;
575}
576
577/* TODO: This should eventually be a hash table! */
578static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
579 u32 response_id)
580{
581 struct list_head *p;
582 struct ocfs2_net_wait_ctxt *w = NULL;
583
584 list_for_each(p, &osb->net_response_list) {
585 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
586 if (response_id == w->n_response_id)
587 break;
588 w = NULL;
589 }
590
591 return w;
592}
593
594/* Translate response codes into local node errno values */
595static inline int ocfs2_translate_response(int response)
596{
597 int ret;
598
599 switch (response) {
600 case OCFS2_RESPONSE_OK:
601 ret = 0;
602 break;
603
604 case OCFS2_RESPONSE_BUSY:
605 ret = -EBUSY;
606 break;
607
608 default:
609 ret = -EINVAL;
610 }
611
612 return ret;
613}
614
615static int ocfs2_handle_response_message(struct o2net_msg *msg,
616 u32 len,
617 void *data, void **ret_data)
618{
619 unsigned int response_id, node_num;
620 int response_status;
621 struct ocfs2_super *osb = data;
622 struct ocfs2_response_msg *resp;
623 struct ocfs2_net_wait_ctxt * w;
624 struct ocfs2_net_response_cb *resp_cb;
625
626 resp = (struct ocfs2_response_msg *) msg->buf;
627
628 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
629 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
630 response_status =
631 ocfs2_translate_response(be32_to_cpu(resp->r_response));
632
633 mlog(0, "received response message:\n");
634 mlog(0, "h_response_id = %u\n", response_id);
635 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
636 mlog(0, "h_blkno = %llu\n",
637 (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
638 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
639 mlog(0, "h_node_num = %u\n", node_num);
640 mlog(0, "r_response = %d\n", response_status);
641
642 spin_lock(&osb->net_response_lock);
643 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
644 if (!w) {
645 mlog(0, "request not found!\n");
646 goto bail;
647 }
648 resp_cb = w->n_callback;
649
650 if (response_status && (!w->n_response)) {
651 /* we only really need one negative response so don't
652 * set it twice. */
653 w->n_response = response_status;
654 }
655
656 if (resp_cb) {
657 spin_unlock(&osb->net_response_lock);
658
659 resp_cb->rc_cb(resp_cb->rc_priv, resp);
660
661 spin_lock(&osb->net_response_lock);
662 }
663
664 __ocfs2_mark_node_responded(osb, w, node_num);
665bail:
666 spin_unlock(&osb->net_response_lock);
667
668 return 0;
669}
670
671static int ocfs2_handle_vote_message(struct o2net_msg *msg,
672 u32 len,
673 void *data, void **ret_data)
674{
675 int status;
676 struct ocfs2_super *osb = data;
677 struct ocfs2_vote_work *work;
678
679 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
680 if (!work) {
681 status = -ENOMEM;
682 mlog_errno(status);
683 goto bail;
684 }
685
686 INIT_LIST_HEAD(&work->w_list);
687 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
688
689 mlog(0, "scheduling vote request:\n");
690 mlog(0, "h_response_id = %u\n",
691 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
692 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
693 mlog(0, "h_blkno = %llu\n",
694 (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
695 mlog(0, "h_generation = %u\n",
696 be32_to_cpu(work->w_msg.v_hdr.h_generation));
697 mlog(0, "h_node_num = %u\n",
698 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
699
700 spin_lock(&osb->vote_task_lock);
701 list_add_tail(&work->w_list, &osb->vote_list);
702 osb->vote_count++;
703 spin_unlock(&osb->vote_task_lock);
704
705 ocfs2_kick_vote_thread(osb);
706
707 status = 0;
708bail:
709 return status;
710}
711
712void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
713{
714 if (!osb->net_key)
715 return;
716
717 o2net_unregister_handler_list(&osb->osb_net_handlers);
718
719 if (!list_empty(&osb->net_response_list))
720 mlog(ML_ERROR, "net response list not empty!\n");
721
722 osb->net_key = 0;
723}
724
725int ocfs2_register_net_handlers(struct ocfs2_super *osb)
726{
727 int status = 0;
728
729 if (ocfs2_mount_local(osb))
730 return 0;
731
732 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
733 osb->net_key,
734 sizeof(struct ocfs2_response_msg),
735 ocfs2_handle_response_message,
736 osb, NULL, &osb->osb_net_handlers);
737 if (status) {
738 mlog_errno(status);
739 goto bail;
740 }
741
742 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
743 osb->net_key,
744 sizeof(struct ocfs2_vote_msg),
745 ocfs2_handle_vote_message,
746 osb, NULL, &osb->osb_net_handlers);
747 if (status) {
748 mlog_errno(status);
749 goto bail;
750 }
751bail:
752 if (status < 0)
753 ocfs2_unregister_net_handlers(osb);
754
755 return status;
756}
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d88173840082..6b7ff1618945 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -131,7 +131,7 @@ static void property_stop(struct seq_file *f, void *v)
131 /* Nothing to do */ 131 /* Nothing to do */
132} 132}
133 133
134static struct seq_operations property_op = { 134static const struct seq_operations property_op = {
135 .start = property_start, 135 .start = property_start,
136 .next = property_next, 136 .next = property_next,
137 .stop = property_stop, 137 .stop = property_stop,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 722e12e5acc7..739da701ae7b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,96 +195,45 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
195 return ERR_PTR(res); 195 return ERR_PTR(res);
196} 196}
197 197
198/* 198static ssize_t part_start_show(struct device *dev,
199 * sysfs bindings for partitions 199 struct device_attribute *attr, char *buf)
200 */
201
202struct part_attribute {
203 struct attribute attr;
204 ssize_t (*show)(struct hd_struct *,char *);
205 ssize_t (*store)(struct hd_struct *,const char *, size_t);
206};
207
208static ssize_t
209part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
210{ 200{
211 struct hd_struct * p = container_of(kobj,struct hd_struct,kobj); 201 struct hd_struct *p = dev_to_part(dev);
212 struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
213 ssize_t ret = 0;
214 if (part_attr->show)
215 ret = part_attr->show(p, page);
216 return ret;
217}
218static ssize_t
219part_attr_store(struct kobject * kobj, struct attribute * attr,
220 const char *page, size_t count)
221{
222 struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
223 struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
224 ssize_t ret = 0;
225 202
226 if (part_attr->store) 203 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
227 ret = part_attr->store(p, page, count);
228 return ret;
229} 204}
230 205
231static struct sysfs_ops part_sysfs_ops = { 206static ssize_t part_size_show(struct device *dev,
232 .show = part_attr_show, 207 struct device_attribute *attr, char *buf)
233 .store = part_attr_store,
234};
235
236static ssize_t part_uevent_store(struct hd_struct * p,
237 const char *page, size_t count)
238{ 208{
239 kobject_uevent(&p->kobj, KOBJ_ADD); 209 struct hd_struct *p = dev_to_part(dev);
240 return count; 210 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
241} 211}
242static ssize_t part_dev_read(struct hd_struct * p, char *page) 212
243{ 213static ssize_t part_stat_show(struct device *dev,
244 struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj); 214 struct device_attribute *attr, char *buf)
245 dev_t dev = MKDEV(disk->major, disk->first_minor + p->partno);
246 return print_dev_t(page, dev);
247}
248static ssize_t part_start_read(struct hd_struct * p, char *page)
249{
250 return sprintf(page, "%llu\n",(unsigned long long)p->start_sect);
251}
252static ssize_t part_size_read(struct hd_struct * p, char *page)
253{
254 return sprintf(page, "%llu\n",(unsigned long long)p->nr_sects);
255}
256static ssize_t part_stat_read(struct hd_struct * p, char *page)
257{ 215{
258 return sprintf(page, "%8u %8llu %8u %8llu\n", 216 struct hd_struct *p = dev_to_part(dev);
217
218 return sprintf(buf, "%8u %8llu %8u %8llu\n",
259 p->ios[0], (unsigned long long)p->sectors[0], 219 p->ios[0], (unsigned long long)p->sectors[0],
260 p->ios[1], (unsigned long long)p->sectors[1]); 220 p->ios[1], (unsigned long long)p->sectors[1]);
261} 221}
262static struct part_attribute part_attr_uevent = {
263 .attr = {.name = "uevent", .mode = S_IWUSR },
264 .store = part_uevent_store
265};
266static struct part_attribute part_attr_dev = {
267 .attr = {.name = "dev", .mode = S_IRUGO },
268 .show = part_dev_read
269};
270static struct part_attribute part_attr_start = {
271 .attr = {.name = "start", .mode = S_IRUGO },
272 .show = part_start_read
273};
274static struct part_attribute part_attr_size = {
275 .attr = {.name = "size", .mode = S_IRUGO },
276 .show = part_size_read
277};
278static struct part_attribute part_attr_stat = {
279 .attr = {.name = "stat", .mode = S_IRUGO },
280 .show = part_stat_read
281};
282 222
283#ifdef CONFIG_FAIL_MAKE_REQUEST 223#ifdef CONFIG_FAIL_MAKE_REQUEST
224static ssize_t part_fail_show(struct device *dev,
225 struct device_attribute *attr, char *buf)
226{
227 struct hd_struct *p = dev_to_part(dev);
284 228
285static ssize_t part_fail_store(struct hd_struct * p, 229 return sprintf(buf, "%d\n", p->make_it_fail);
230}
231
232static ssize_t part_fail_store(struct device *dev,
233 struct device_attribute *attr,
286 const char *buf, size_t count) 234 const char *buf, size_t count)
287{ 235{
236 struct hd_struct *p = dev_to_part(dev);
288 int i; 237 int i;
289 238
290 if (count > 0 && sscanf(buf, "%d", &i) > 0) 239 if (count > 0 && sscanf(buf, "%d", &i) > 0)
@@ -292,50 +241,53 @@ static ssize_t part_fail_store(struct hd_struct * p,
292 241
293 return count; 242 return count;
294} 243}
295static ssize_t part_fail_read(struct hd_struct * p, char *page) 244#endif
296{
297 return sprintf(page, "%d\n", p->make_it_fail);
298}
299static struct part_attribute part_attr_fail = {
300 .attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR },
301 .store = part_fail_store,
302 .show = part_fail_read
303};
304 245
246static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
247static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
248static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
249#ifdef CONFIG_FAIL_MAKE_REQUEST
250static struct device_attribute dev_attr_fail =
251 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
305#endif 252#endif
306 253
307static struct attribute * default_attrs[] = { 254static struct attribute *part_attrs[] = {
308 &part_attr_uevent.attr, 255 &dev_attr_start.attr,
309 &part_attr_dev.attr, 256 &dev_attr_size.attr,
310 &part_attr_start.attr, 257 &dev_attr_stat.attr,
311 &part_attr_size.attr,
312 &part_attr_stat.attr,
313#ifdef CONFIG_FAIL_MAKE_REQUEST 258#ifdef CONFIG_FAIL_MAKE_REQUEST
314 &part_attr_fail.attr, 259 &dev_attr_fail.attr,
315#endif 260#endif
316 NULL, 261 NULL
317}; 262};
318 263
319extern struct kset block_subsys; 264static struct attribute_group part_attr_group = {
265 .attrs = part_attrs,
266};
320 267
321static void part_release(struct kobject *kobj) 268static struct attribute_group *part_attr_groups[] = {
269 &part_attr_group,
270 NULL
271};
272
273static void part_release(struct device *dev)
322{ 274{
323 struct hd_struct * p = container_of(kobj,struct hd_struct,kobj); 275 struct hd_struct *p = dev_to_part(dev);
324 kfree(p); 276 kfree(p);
325} 277}
326 278
327struct kobj_type ktype_part = { 279struct device_type part_type = {
280 .name = "partition",
281 .groups = part_attr_groups,
328 .release = part_release, 282 .release = part_release,
329 .default_attrs = default_attrs,
330 .sysfs_ops = &part_sysfs_ops,
331}; 283};
332 284
333static inline void partition_sysfs_add_subdir(struct hd_struct *p) 285static inline void partition_sysfs_add_subdir(struct hd_struct *p)
334{ 286{
335 struct kobject *k; 287 struct kobject *k;
336 288
337 k = kobject_get(&p->kobj); 289 k = kobject_get(&p->dev.kobj);
338 p->holder_dir = kobject_add_dir(k, "holders"); 290 p->holder_dir = kobject_create_and_add("holders", k);
339 kobject_put(k); 291 kobject_put(k);
340} 292}
341 293
@@ -343,15 +295,16 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
343{ 295{
344 struct kobject *k; 296 struct kobject *k;
345 297
346 k = kobject_get(&disk->kobj); 298 k = kobject_get(&disk->dev.kobj);
347 disk->holder_dir = kobject_add_dir(k, "holders"); 299 disk->holder_dir = kobject_create_and_add("holders", k);
348 disk->slave_dir = kobject_add_dir(k, "slaves"); 300 disk->slave_dir = kobject_create_and_add("slaves", k);
349 kobject_put(k); 301 kobject_put(k);
350} 302}
351 303
352void delete_partition(struct gendisk *disk, int part) 304void delete_partition(struct gendisk *disk, int part)
353{ 305{
354 struct hd_struct *p = disk->part[part-1]; 306 struct hd_struct *p = disk->part[part-1];
307
355 if (!p) 308 if (!p)
356 return; 309 return;
357 if (!p->nr_sects) 310 if (!p->nr_sects)
@@ -361,113 +314,55 @@ void delete_partition(struct gendisk *disk, int part)
361 p->nr_sects = 0; 314 p->nr_sects = 0;
362 p->ios[0] = p->ios[1] = 0; 315 p->ios[0] = p->ios[1] = 0;
363 p->sectors[0] = p->sectors[1] = 0; 316 p->sectors[0] = p->sectors[1] = 0;
364 sysfs_remove_link(&p->kobj, "subsystem"); 317 kobject_put(p->holder_dir);
365 kobject_unregister(p->holder_dir); 318 device_del(&p->dev);
366 kobject_uevent(&p->kobj, KOBJ_REMOVE); 319 put_device(&p->dev);
367 kobject_del(&p->kobj);
368 kobject_put(&p->kobj);
369} 320}
370 321
371void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) 322void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
372{ 323{
373 struct hd_struct *p; 324 struct hd_struct *p;
325 int err;
374 326
375 p = kzalloc(sizeof(*p), GFP_KERNEL); 327 p = kzalloc(sizeof(*p), GFP_KERNEL);
376 if (!p) 328 if (!p)
377 return; 329 return;
378 330
379 p->start_sect = start; 331 p->start_sect = start;
380 p->nr_sects = len; 332 p->nr_sects = len;
381 p->partno = part; 333 p->partno = part;
382 p->policy = disk->policy; 334 p->policy = disk->policy;
383 335
384 if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1])) 336 if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
385 kobject_set_name(&p->kobj, "%sp%d", 337 snprintf(p->dev.bus_id, BUS_ID_SIZE,
386 kobject_name(&disk->kobj), part); 338 "%sp%d", disk->dev.bus_id, part);
387 else 339 else
388 kobject_set_name(&p->kobj, "%s%d", 340 snprintf(p->dev.bus_id, BUS_ID_SIZE,
389 kobject_name(&disk->kobj),part); 341 "%s%d", disk->dev.bus_id, part);
390 p->kobj.parent = &disk->kobj; 342
391 p->kobj.ktype = &ktype_part; 343 device_initialize(&p->dev);
392 kobject_init(&p->kobj); 344 p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
393 kobject_add(&p->kobj); 345 p->dev.class = &block_class;
394 if (!disk->part_uevent_suppress) 346 p->dev.type = &part_type;
395 kobject_uevent(&p->kobj, KOBJ_ADD); 347 p->dev.parent = &disk->dev;
396 sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem"); 348 disk->part[part-1] = p;
349
350 /* delay uevent until 'holders' subdir is created */
351 p->dev.uevent_suppress = 1;
352 device_add(&p->dev);
353 partition_sysfs_add_subdir(p);
354 p->dev.uevent_suppress = 0;
397 if (flags & ADDPART_FLAG_WHOLEDISK) { 355 if (flags & ADDPART_FLAG_WHOLEDISK) {
398 static struct attribute addpartattr = { 356 static struct attribute addpartattr = {
399 .name = "whole_disk", 357 .name = "whole_disk",
400 .mode = S_IRUSR | S_IRGRP | S_IROTH, 358 .mode = S_IRUSR | S_IRGRP | S_IROTH,
401 }; 359 };
402 360 err = sysfs_create_file(&p->dev.kobj, &addpartattr);
403 sysfs_create_file(&p->kobj, &addpartattr);
404 } 361 }
405 partition_sysfs_add_subdir(p);
406 disk->part[part-1] = p;
407}
408 362
409static char *make_block_name(struct gendisk *disk) 363 /* suppress uevent if the disk supresses it */
410{ 364 if (!disk->dev.uevent_suppress)
411 char *name; 365 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
412 static char *block_str = "block:";
413 int size;
414 char *s;
415
416 size = strlen(block_str) + strlen(disk->disk_name) + 1;
417 name = kmalloc(size, GFP_KERNEL);
418 if (!name)
419 return NULL;
420 strcpy(name, block_str);
421 strcat(name, disk->disk_name);
422 /* ewww... some of these buggers have / in name... */
423 s = strchr(name, '/');
424 if (s)
425 *s = '!';
426 return name;
427}
428
429static int disk_sysfs_symlinks(struct gendisk *disk)
430{
431 struct device *target = get_device(disk->driverfs_dev);
432 int err;
433 char *disk_name = NULL;
434
435 if (target) {
436 disk_name = make_block_name(disk);
437 if (!disk_name) {
438 err = -ENOMEM;
439 goto err_out;
440 }
441
442 err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
443 if (err)
444 goto err_out_disk_name;
445
446 err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
447 if (err)
448 goto err_out_dev_link;
449 }
450
451 err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
452 "subsystem");
453 if (err)
454 goto err_out_disk_name_lnk;
455
456 kfree(disk_name);
457
458 return 0;
459
460err_out_disk_name_lnk:
461 if (target) {
462 sysfs_remove_link(&target->kobj, disk_name);
463err_out_dev_link:
464 sysfs_remove_link(&disk->kobj, "device");
465err_out_disk_name:
466 kfree(disk_name);
467err_out:
468 put_device(target);
469 }
470 return err;
471} 366}
472 367
473/* Not exported, helper to add_disk(). */ 368/* Not exported, helper to add_disk(). */
@@ -479,19 +374,29 @@ void register_disk(struct gendisk *disk)
479 struct hd_struct *p; 374 struct hd_struct *p;
480 int err; 375 int err;
481 376
482 kobject_set_name(&disk->kobj, "%s", disk->disk_name); 377 disk->dev.parent = disk->driverfs_dev;
483 /* ewww... some of these buggers have / in name... */ 378 disk->dev.devt = MKDEV(disk->major, disk->first_minor);
484 s = strchr(disk->kobj.k_name, '/'); 379
380 strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
381 /* ewww... some of these buggers have / in the name... */
382 s = strchr(disk->dev.bus_id, '/');
485 if (s) 383 if (s)
486 *s = '!'; 384 *s = '!';
487 if ((err = kobject_add(&disk->kobj))) 385
386 /* delay uevents, until we scanned partition table */
387 disk->dev.uevent_suppress = 1;
388
389 if (device_add(&disk->dev))
488 return; 390 return;
489 err = disk_sysfs_symlinks(disk); 391#ifndef CONFIG_SYSFS_DEPRECATED
392 err = sysfs_create_link(block_depr, &disk->dev.kobj,
393 kobject_name(&disk->dev.kobj));
490 if (err) { 394 if (err) {
491 kobject_del(&disk->kobj); 395 device_del(&disk->dev);
492 return; 396 return;
493 } 397 }
494 disk_sysfs_add_subdirs(disk); 398#endif
399 disk_sysfs_add_subdirs(disk);
495 400
496 /* No minors to use for partitions */ 401 /* No minors to use for partitions */
497 if (disk->minors == 1) 402 if (disk->minors == 1)
@@ -505,25 +410,23 @@ void register_disk(struct gendisk *disk)
505 if (!bdev) 410 if (!bdev)
506 goto exit; 411 goto exit;
507 412
508 /* scan partition table, but suppress uevents */
509 bdev->bd_invalidated = 1; 413 bdev->bd_invalidated = 1;
510 disk->part_uevent_suppress = 1;
511 err = blkdev_get(bdev, FMODE_READ, 0); 414 err = blkdev_get(bdev, FMODE_READ, 0);
512 disk->part_uevent_suppress = 0;
513 if (err < 0) 415 if (err < 0)
514 goto exit; 416 goto exit;
515 blkdev_put(bdev); 417 blkdev_put(bdev);
516 418
517exit: 419exit:
518 /* announce disk after possible partitions are already created */ 420 /* announce disk after possible partitions are created */
519 kobject_uevent(&disk->kobj, KOBJ_ADD); 421 disk->dev.uevent_suppress = 0;
422 kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
520 423
521 /* announce possible partitions */ 424 /* announce possible partitions */
522 for (i = 1; i < disk->minors; i++) { 425 for (i = 1; i < disk->minors; i++) {
523 p = disk->part[i-1]; 426 p = disk->part[i-1];
524 if (!p || !p->nr_sects) 427 if (!p || !p->nr_sects)
525 continue; 428 continue;
526 kobject_uevent(&p->kobj, KOBJ_ADD); 429 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
527 } 430 }
528} 431}
529 432
@@ -602,19 +505,11 @@ void del_gendisk(struct gendisk *disk)
602 disk_stat_set_all(disk, 0); 505 disk_stat_set_all(disk, 0);
603 disk->stamp = 0; 506 disk->stamp = 0;
604 507
605 kobject_uevent(&disk->kobj, KOBJ_REMOVE); 508 kobject_put(disk->holder_dir);
606 kobject_unregister(disk->holder_dir); 509 kobject_put(disk->slave_dir);
607 kobject_unregister(disk->slave_dir); 510 disk->driverfs_dev = NULL;
608 if (disk->driverfs_dev) { 511#ifndef CONFIG_SYSFS_DEPRECATED
609 char *disk_name = make_block_name(disk); 512 sysfs_remove_link(block_depr, disk->dev.bus_id);
610 sysfs_remove_link(&disk->kobj, "device"); 513#endif
611 if (disk_name) { 514 device_del(&disk->dev);
612 sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name);
613 kfree(disk_name);
614 }
615 put_device(disk->driverfs_dev);
616 disk->driverfs_dev = NULL;
617 }
618 sysfs_remove_link(&disk->kobj, "subsystem");
619 kobject_del(&disk->kobj);
620} 515}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5be663e5dad1..b380313092bd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -164,7 +164,7 @@ static inline char *task_state(struct task_struct *p, char *buffer)
164 ppid = pid_alive(p) ? 164 ppid = pid_alive(p) ?
165 task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; 165 task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
166 tpid = pid_alive(p) && p->ptrace ? 166 tpid = pid_alive(p) && p->ptrace ?
167 task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0; 167 task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
168 buffer += sprintf(buffer, 168 buffer += sprintf(buffer,
169 "State:\t%s\n" 169 "State:\t%s\n"
170 "Tgid:\t%d\n" 170 "Tgid:\t%d\n"
@@ -459,8 +459,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
459 } 459 }
460 460
461 sid = task_session_nr_ns(task, ns); 461 sid = task_session_nr_ns(task, ns);
462 ppid = task_tgid_nr_ns(task->real_parent, ns);
462 pgid = task_pgrp_nr_ns(task, ns); 463 pgid = task_pgrp_nr_ns(task, ns);
463 ppid = task_ppid_nr_ns(task, ns);
464 464
465 unlock_task_sighand(task, &flags); 465 unlock_task_sighand(task, &flags);
466 } 466 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e88ee1a0323a..9fa9708cc715 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -202,6 +202,26 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
202 (task_is_stopped_or_traced(task)) && \ 202 (task_is_stopped_or_traced(task)) && \
203 security_ptrace(current,task) == 0)) 203 security_ptrace(current,task) == 0))
204 204
205struct mm_struct *mm_for_maps(struct task_struct *task)
206{
207 struct mm_struct *mm = get_task_mm(task);
208 if (!mm)
209 return NULL;
210 down_read(&mm->mmap_sem);
211 task_lock(task);
212 if (task->mm != mm)
213 goto out;
214 if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
215 goto out;
216 task_unlock(task);
217 return mm;
218out:
219 task_unlock(task);
220 up_read(&mm->mmap_sem);
221 mmput(mm);
222 return NULL;
223}
224
205static int proc_pid_cmdline(struct task_struct *task, char * buffer) 225static int proc_pid_cmdline(struct task_struct *task, char * buffer)
206{ 226{
207 int res = 0; 227 int res = 0;
@@ -290,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
290} 310}
291#endif 311#endif
292 312
313#ifdef CONFIG_LATENCYTOP
314static int lstats_show_proc(struct seq_file *m, void *v)
315{
316 int i;
317 struct task_struct *task = m->private;
318 seq_puts(m, "Latency Top version : v0.1\n");
319
320 for (i = 0; i < 32; i++) {
321 if (task->latency_record[i].backtrace[0]) {
322 int q;
323 seq_printf(m, "%i %li %li ",
324 task->latency_record[i].count,
325 task->latency_record[i].time,
326 task->latency_record[i].max);
327 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
328 char sym[KSYM_NAME_LEN];
329 char *c;
330 if (!task->latency_record[i].backtrace[q])
331 break;
332 if (task->latency_record[i].backtrace[q] == ULONG_MAX)
333 break;
334 sprint_symbol(sym, task->latency_record[i].backtrace[q]);
335 c = strchr(sym, '+');
336 if (c)
337 *c = 0;
338 seq_printf(m, "%s ", sym);
339 }
340 seq_printf(m, "\n");
341 }
342
343 }
344 return 0;
345}
346
347static int lstats_open(struct inode *inode, struct file *file)
348{
349 int ret;
350 struct seq_file *m;
351 struct task_struct *task = get_proc_task(inode);
352
353 ret = single_open(file, lstats_show_proc, NULL);
354 if (!ret) {
355 m = file->private_data;
356 m->private = task;
357 }
358 return ret;
359}
360
361static ssize_t lstats_write(struct file *file, const char __user *buf,
362 size_t count, loff_t *offs)
363{
364 struct seq_file *m;
365 struct task_struct *task;
366
367 m = file->private_data;
368 task = m->private;
369 clear_all_latency_tracing(task);
370
371 return count;
372}
373
374static const struct file_operations proc_lstats_operations = {
375 .open = lstats_open,
376 .read = seq_read,
377 .write = lstats_write,
378 .llseek = seq_lseek,
379 .release = single_release,
380};
381
382#endif
383
293/* The badness from the OOM killer */ 384/* The badness from the OOM killer */
294unsigned long badness(struct task_struct *p, unsigned long uptime); 385unsigned long badness(struct task_struct *p, unsigned long uptime);
295static int proc_oom_score(struct task_struct *task, char *buffer) 386static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1000,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
1000}; 1091};
1001#endif 1092#endif
1002 1093
1094
1003#ifdef CONFIG_SCHED_DEBUG 1095#ifdef CONFIG_SCHED_DEBUG
1004/* 1096/*
1005 * Print out various scheduling related per-task fields: 1097 * Print out various scheduling related per-task fields:
@@ -2210,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2210#ifdef CONFIG_SCHEDSTATS 2302#ifdef CONFIG_SCHEDSTATS
2211 INF("schedstat", S_IRUGO, pid_schedstat), 2303 INF("schedstat", S_IRUGO, pid_schedstat),
2212#endif 2304#endif
2305#ifdef CONFIG_LATENCYTOP
2306 REG("latency", S_IRUGO, lstats),
2307#endif
2213#ifdef CONFIG_PROC_PID_CPUSET 2308#ifdef CONFIG_PROC_PID_CPUSET
2214 REG("cpuset", S_IRUGO, cpuset), 2309 REG("cpuset", S_IRUGO, cpuset),
2215#endif 2310#endif
@@ -2535,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
2535#ifdef CONFIG_SCHEDSTATS 2630#ifdef CONFIG_SCHEDSTATS
2536 INF("schedstat", S_IRUGO, pid_schedstat), 2631 INF("schedstat", S_IRUGO, pid_schedstat),
2537#endif 2632#endif
2633#ifdef CONFIG_LATENCYTOP
2634 REG("latency", S_IRUGO, lstats),
2635#endif
2538#ifdef CONFIG_PROC_PID_CPUSET 2636#ifdef CONFIG_PROC_PID_CPUSET
2539 REG("cpuset", S_IRUGO, cpuset), 2637 REG("cpuset", S_IRUGO, cpuset),
2540#endif 2638#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8d49838e5554..6a2fe5187b62 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -374,16 +374,9 @@ static int proc_delete_dentry(struct dentry * dentry)
374 return 1; 374 return 1;
375} 375}
376 376
377static int proc_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
378{
379 d_drop(dentry);
380 return 0;
381}
382
383static struct dentry_operations proc_dentry_operations = 377static struct dentry_operations proc_dentry_operations =
384{ 378{
385 .d_delete = proc_delete_dentry, 379 .d_delete = proc_delete_dentry,
386 .d_revalidate = proc_revalidate_dentry,
387}; 380};
388 381
389/* 382/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1820eb2ef762..05b3e9006262 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -27,6 +27,8 @@ struct vmalloc_info {
27 unsigned long largest_chunk; 27 unsigned long largest_chunk;
28}; 28};
29 29
30extern struct mm_struct *mm_for_maps(struct task_struct *);
31
30#ifdef CONFIG_MMU 32#ifdef CONFIG_MMU
31#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) 33#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
32extern void get_vmalloc_info(struct vmalloc_info *vmi); 34extern void get_vmalloc_info(struct vmalloc_info *vmi);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e0d064e9764e..3462bfde89f6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -410,7 +410,7 @@ static const struct file_operations proc_modules_operations = {
410}; 410};
411#endif 411#endif
412 412
413#ifdef CONFIG_SLAB 413#ifdef CONFIG_SLABINFO
414static int slabinfo_open(struct inode *inode, struct file *file) 414static int slabinfo_open(struct inode *inode, struct file *file)
415{ 415{
416 return seq_open(file, &slabinfo_op); 416 return seq_open(file, &slabinfo_op);
@@ -728,7 +728,7 @@ void __init proc_misc_init(void)
728#endif 728#endif
729 create_seq_entry("stat", 0, &proc_stat_operations); 729 create_seq_entry("stat", 0, &proc_stat_operations);
730 create_seq_entry("interrupts", 0, &proc_interrupts_operations); 730 create_seq_entry("interrupts", 0, &proc_interrupts_operations);
731#ifdef CONFIG_SLAB 731#ifdef CONFIG_SLABINFO
732 create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); 732 create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
733#ifdef CONFIG_DEBUG_SLAB_LEAK 733#ifdef CONFIG_DEBUG_SLAB_LEAK
734 create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations); 734 create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 0afe21ee0607..4823c9677fac 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -22,10 +22,48 @@
22#include <linux/mount.h> 22#include <linux/mount.h>
23#include <linux/nsproxy.h> 23#include <linux/nsproxy.h>
24#include <net/net_namespace.h> 24#include <net/net_namespace.h>
25#include <linux/seq_file.h>
25 26
26#include "internal.h" 27#include "internal.h"
27 28
28 29
30int seq_open_net(struct inode *ino, struct file *f,
31 const struct seq_operations *ops, int size)
32{
33 struct net *net;
34 struct seq_net_private *p;
35
36 BUG_ON(size < sizeof(*p));
37
38 net = get_proc_net(ino);
39 if (net == NULL)
40 return -ENXIO;
41
42 p = __seq_open_private(f, ops, size);
43 if (p == NULL) {
44 put_net(net);
45 return -ENOMEM;
46 }
47 p->net = net;
48 return 0;
49}
50EXPORT_SYMBOL_GPL(seq_open_net);
51
52int seq_release_net(struct inode *ino, struct file *f)
53{
54 struct seq_file *seq;
55 struct seq_net_private *p;
56
57 seq = f->private_data;
58 p = seq->private;
59
60 put_net(p->net);
61 seq_release_private(ino, f);
62 return 0;
63}
64EXPORT_SYMBOL_GPL(seq_release_net);
65
66
29struct proc_dir_entry *proc_net_fops_create(struct net *net, 67struct proc_dir_entry *proc_net_fops_create(struct net *net,
30 const char *name, mode_t mode, const struct file_operations *fops) 68 const char *name, mode_t mode, const struct file_operations *fops)
31{ 69{
@@ -58,6 +96,17 @@ static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
58 return task->nsproxy->net_ns->proc_net; 96 return task->nsproxy->net_ns->proc_net;
59} 97}
60 98
99struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
100 struct proc_dir_entry *parent)
101{
102 struct proc_dir_entry *pde;
103 pde = proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
104 if (pde != NULL)
105 pde->data = net;
106 return pde;
107}
108EXPORT_SYMBOL_GPL(proc_net_mkdir);
109
61static __net_init int proc_net_ns_init(struct net *net) 110static __net_init int proc_net_ns_init(struct net *net)
62{ 111{
63 struct proc_dir_entry *root, *netd, *net_statd; 112 struct proc_dir_entry *root, *netd, *net_statd;
@@ -69,18 +118,16 @@ static __net_init int proc_net_ns_init(struct net *net)
69 goto out; 118 goto out;
70 119
71 err = -EEXIST; 120 err = -EEXIST;
72 netd = proc_mkdir("net", root); 121 netd = proc_net_mkdir(net, "net", root);
73 if (!netd) 122 if (!netd)
74 goto free_root; 123 goto free_root;
75 124
76 err = -EEXIST; 125 err = -EEXIST;
77 net_statd = proc_mkdir("stat", netd); 126 net_statd = proc_net_mkdir(net, "stat", netd);
78 if (!net_statd) 127 if (!net_statd)
79 goto free_net; 128 goto free_net;
80 129
81 root->data = net; 130 root->data = net;
82 netd->data = net;
83 net_statd->data = net;
84 131
85 net->proc_net_root = root; 132 net->proc_net_root = root;
86 net->proc_net = netd; 133 net->proc_net = netd;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c24d81a5a040..8043a3eab52c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,12 +397,11 @@ static void *m_start(struct seq_file *m, loff_t *pos)
397 if (!priv->task) 397 if (!priv->task)
398 return NULL; 398 return NULL;
399 399
400 mm = get_task_mm(priv->task); 400 mm = mm_for_maps(priv->task);
401 if (!mm) 401 if (!mm)
402 return NULL; 402 return NULL;
403 403
404 priv->tail_vma = tail_vma = get_gate_vma(priv->task); 404 priv->tail_vma = tail_vma = get_gate_vma(priv->task);
405 down_read(&mm->mmap_sem);
406 405
407 /* Start with last addr hint */ 406 /* Start with last addr hint */
408 if (last_addr && (vma = find_vma(mm, last_addr))) { 407 if (last_addr && (vma = find_vma(mm, last_addr))) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index d8b8c7183c24..1932c2ca3457 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -165,15 +165,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
165 if (!priv->task) 165 if (!priv->task)
166 return NULL; 166 return NULL;
167 167
168 mm = get_task_mm(priv->task); 168 mm = mm_for_maps(priv->task);
169 if (!mm) { 169 if (!mm) {
170 put_task_struct(priv->task); 170 put_task_struct(priv->task);
171 priv->task = NULL; 171 priv->task = NULL;
172 return NULL; 172 return NULL;
173 } 173 }
174 174
175 down_read(&mm->mmap_sem);
176
177 /* start from the Nth VMA */ 175 /* start from the Nth VMA */
178 for (vml = mm->context.vmlist; vml; vml = vml->next) 176 for (vml = mm->context.vmlist; vml; vml = vml->next)
179 if (n-- == 0) 177 if (n-- == 0)
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94cc722e..1c177f29e1b7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -197,25 +197,27 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
197{ 197{
198 struct inode *inode; 198 struct inode *inode;
199 loff_t pos; 199 loff_t pos;
200 int retval = -EINVAL;
200 201
201 inode = file->f_path.dentry->d_inode; 202 inode = file->f_path.dentry->d_inode;
202 if (unlikely((ssize_t) count < 0)) 203 if (unlikely((ssize_t) count < 0))
203 goto Einval; 204 return retval;
204 pos = *ppos; 205 pos = *ppos;
205 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) 206 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
206 goto Einval; 207 return retval;
207 208
208 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 209 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
209 int retval = locks_mandatory_area( 210 retval = locks_mandatory_area(
210 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, 211 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
211 inode, file, pos, count); 212 inode, file, pos, count);
212 if (retval < 0) 213 if (retval < 0)
213 return retval; 214 return retval;
214 } 215 }
216 retval = security_file_permission(file,
217 read_write == READ ? MAY_READ : MAY_WRITE);
218 if (retval)
219 return retval;
215 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; 220 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
216
217Einval:
218 return -EINVAL;
219} 221}
220 222
221static void wait_on_retry_sync_kiocb(struct kiocb *iocb) 223static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
@@ -267,18 +269,15 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
267 ret = rw_verify_area(READ, file, pos, count); 269 ret = rw_verify_area(READ, file, pos, count);
268 if (ret >= 0) { 270 if (ret >= 0) {
269 count = ret; 271 count = ret;
270 ret = security_file_permission (file, MAY_READ); 272 if (file->f_op->read)
271 if (!ret) { 273 ret = file->f_op->read(file, buf, count, pos);
272 if (file->f_op->read) 274 else
273 ret = file->f_op->read(file, buf, count, pos); 275 ret = do_sync_read(file, buf, count, pos);
274 else 276 if (ret > 0) {
275 ret = do_sync_read(file, buf, count, pos); 277 fsnotify_access(file->f_path.dentry);
276 if (ret > 0) { 278 add_rchar(current, ret);
277 fsnotify_access(file->f_path.dentry);
278 add_rchar(current, ret);
279 }
280 inc_syscr(current);
281 } 279 }
280 inc_syscr(current);
282 } 281 }
283 282
284 return ret; 283 return ret;
@@ -325,18 +324,15 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
325 ret = rw_verify_area(WRITE, file, pos, count); 324 ret = rw_verify_area(WRITE, file, pos, count);
326 if (ret >= 0) { 325 if (ret >= 0) {
327 count = ret; 326 count = ret;
328 ret = security_file_permission (file, MAY_WRITE); 327 if (file->f_op->write)
329 if (!ret) { 328 ret = file->f_op->write(file, buf, count, pos);
330 if (file->f_op->write) 329 else
331 ret = file->f_op->write(file, buf, count, pos); 330 ret = do_sync_write(file, buf, count, pos);
332 else 331 if (ret > 0) {
333 ret = do_sync_write(file, buf, count, pos); 332 fsnotify_modify(file->f_path.dentry);
334 if (ret > 0) { 333 add_wchar(current, ret);
335 fsnotify_modify(file->f_path.dentry);
336 add_wchar(current, ret);
337 }
338 inc_syscw(current);
339 } 334 }
335 inc_syscw(current);
340 } 336 }
341 337
342 return ret; 338 return ret;
@@ -450,6 +446,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
450 } 446 }
451 return seg; 447 return seg;
452} 448}
449EXPORT_SYMBOL(iov_shorten);
453 450
454ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 451ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
455 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 452 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
@@ -603,9 +600,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
603 ret = rw_verify_area(type, file, pos, tot_len); 600 ret = rw_verify_area(type, file, pos, tot_len);
604 if (ret < 0) 601 if (ret < 0)
605 goto out; 602 goto out;
606 ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
607 if (ret)
608 goto out;
609 603
610 fnv = NULL; 604 fnv = NULL;
611 if (type == READ) { 605 if (type == READ) {
@@ -737,10 +731,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
737 goto fput_in; 731 goto fput_in;
738 count = retval; 732 count = retval;
739 733
740 retval = security_file_permission (in_file, MAY_READ);
741 if (retval)
742 goto fput_in;
743
744 /* 734 /*
745 * Get output file, and verify that it is ok.. 735 * Get output file, and verify that it is ok..
746 */ 736 */
@@ -759,10 +749,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
759 goto fput_out; 749 goto fput_out;
760 count = retval; 750 count = retval;
761 751
762 retval = security_file_permission (out_file, MAY_WRITE);
763 if (retval)
764 goto fput_out;
765
766 if (!max) 752 if (!max)
767 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 753 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
768 754
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
index 6673ee82cb4c..4faf8c4722c3 100644
--- a/fs/smbfs/Makefile
+++ b/fs/smbfs/Makefile
@@ -16,23 +16,3 @@ EXTRA_CFLAGS += -DSMBFS_PARANOIA
16#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP 16#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
17#EXTRA_CFLAGS += -Werror 17#EXTRA_CFLAGS += -Werror
18 18
19#
20# Maintainer rules
21#
22
23# getopt.c not included. It is intentionally separate
24SRC = proc.c dir.c cache.c sock.c inode.c file.c ioctl.c smbiod.c request.c \
25 symlink.c
26
27proto:
28 -rm -f proto.h
29 @echo > proto2.h "/*"
30 @echo >> proto2.h " * Autogenerated with cproto on: " `date`
31 @echo >> proto2.h " */"
32 @echo >> proto2.h ""
33 @echo >> proto2.h "struct smb_request;"
34 @echo >> proto2.h "struct sock;"
35 @echo >> proto2.h "struct statfs;"
36 @echo >> proto2.h ""
37 cproto -E "gcc -E" -e -v -I $(TOPDIR)/include -DMAKING_PROTO -D__KERNEL__ $(SRC) >> proto2.h
38 mv proto2.h proto.h
diff --git a/fs/splice.c b/fs/splice.c
index 6bdcb6107bc3..1577a7391d23 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -254,11 +254,16 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
254 } 254 }
255 255
256 while (page_nr < spd_pages) 256 while (page_nr < spd_pages)
257 page_cache_release(spd->pages[page_nr++]); 257 spd->spd_release(spd, page_nr++);
258 258
259 return ret; 259 return ret;
260} 260}
261 261
262static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
263{
264 page_cache_release(spd->pages[i]);
265}
266
262static int 267static int
263__generic_file_splice_read(struct file *in, loff_t *ppos, 268__generic_file_splice_read(struct file *in, loff_t *ppos,
264 struct pipe_inode_info *pipe, size_t len, 269 struct pipe_inode_info *pipe, size_t len,
@@ -277,6 +282,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
277 .partial = partial, 282 .partial = partial,
278 .flags = flags, 283 .flags = flags,
279 .ops = &page_cache_pipe_buf_ops, 284 .ops = &page_cache_pipe_buf_ops,
285 .spd_release = spd_release_page,
280 }; 286 };
281 287
282 index = *ppos >> PAGE_CACHE_SHIFT; 288 index = *ppos >> PAGE_CACHE_SHIFT;
@@ -908,10 +914,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
908 if (unlikely(ret < 0)) 914 if (unlikely(ret < 0))
909 return ret; 915 return ret;
910 916
911 ret = security_file_permission(out, MAY_WRITE);
912 if (unlikely(ret < 0))
913 return ret;
914
915 return out->f_op->splice_write(pipe, out, ppos, len, flags); 917 return out->f_op->splice_write(pipe, out, ppos, len, flags);
916} 918}
917 919
@@ -934,10 +936,6 @@ static long do_splice_to(struct file *in, loff_t *ppos,
934 if (unlikely(ret < 0)) 936 if (unlikely(ret < 0))
935 return ret; 937 return ret;
936 938
937 ret = security_file_permission(in, MAY_READ);
938 if (unlikely(ret < 0))
939 return ret;
940
941 return in->f_op->splice_read(in, ppos, pipe, len, flags); 939 return in->f_op->splice_read(in, ppos, pipe, len, flags);
942} 940}
943 941
@@ -1033,7 +1031,11 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1033 goto out_release; 1031 goto out_release;
1034 } 1032 }
1035 1033
1034done:
1036 pipe->nrbufs = pipe->curbuf = 0; 1035 pipe->nrbufs = pipe->curbuf = 0;
1036 if (bytes > 0)
1037 file_accessed(in);
1038
1037 return bytes; 1039 return bytes;
1038 1040
1039out_release: 1041out_release:
@@ -1049,16 +1051,11 @@ out_release:
1049 buf->ops = NULL; 1051 buf->ops = NULL;
1050 } 1052 }
1051 } 1053 }
1052 pipe->nrbufs = pipe->curbuf = 0;
1053
1054 /*
1055 * If we transferred some data, return the number of bytes:
1056 */
1057 if (bytes > 0)
1058 return bytes;
1059 1054
1060 return ret; 1055 if (!bytes)
1056 bytes = ret;
1061 1057
1058 goto done;
1062} 1059}
1063EXPORT_SYMBOL(splice_direct_to_actor); 1060EXPORT_SYMBOL(splice_direct_to_actor);
1064 1061
@@ -1440,6 +1437,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1440 .partial = partial, 1437 .partial = partial,
1441 .flags = flags, 1438 .flags = flags,
1442 .ops = &user_page_pipe_buf_ops, 1439 .ops = &user_page_pipe_buf_ops,
1440 .spd_release = spd_release_page,
1443 }; 1441 };
1444 1442
1445 pipe = pipe_info(file->f_path.dentry->d_inode); 1443 pipe = pipe_info(file->f_path.dentry->d_inode);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 337162935d21..4948d9bc405d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -440,7 +440,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
440/** 440/**
441 * sysfs_remove_one - remove sysfs_dirent from parent 441 * sysfs_remove_one - remove sysfs_dirent from parent
442 * @acxt: addrm context to use 442 * @acxt: addrm context to use
443 * @sd: sysfs_dirent to be added 443 * @sd: sysfs_dirent to be removed
444 * 444 *
445 * Mark @sd removed and drop nlink of parent inode if @sd is a 445 * Mark @sd removed and drop nlink of parent inode if @sd is a
446 * directory. @sd is unlinked from the children list. 446 * directory. @sd is unlinked from the children list.
@@ -678,8 +678,10 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
678 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); 678 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
679 679
680 /* no such entry */ 680 /* no such entry */
681 if (!sd) 681 if (!sd) {
682 ret = ERR_PTR(-ENOENT);
682 goto out_unlock; 683 goto out_unlock;
684 }
683 685
684 /* attach dentry and inode */ 686 /* attach dentry and inode */
685 inode = sysfs_get_inode(sd); 687 inode = sysfs_get_inode(sd);
@@ -781,6 +783,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
781 old_dentry = sysfs_get_dentry(sd); 783 old_dentry = sysfs_get_dentry(sd);
782 if (IS_ERR(old_dentry)) { 784 if (IS_ERR(old_dentry)) {
783 error = PTR_ERR(old_dentry); 785 error = PTR_ERR(old_dentry);
786 old_dentry = NULL;
784 goto out; 787 goto out;
785 } 788 }
786 789
@@ -848,6 +851,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
848 old_dentry = sysfs_get_dentry(sd); 851 old_dentry = sysfs_get_dentry(sd);
849 if (IS_ERR(old_dentry)) { 852 if (IS_ERR(old_dentry)) {
850 error = PTR_ERR(old_dentry); 853 error = PTR_ERR(old_dentry);
854 old_dentry = NULL;
851 goto out; 855 goto out;
852 } 856 }
853 old_parent = old_dentry->d_parent; 857 old_parent = old_dentry->d_parent;
@@ -855,6 +859,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
855 new_parent = sysfs_get_dentry(new_parent_sd); 859 new_parent = sysfs_get_dentry(new_parent_sd);
856 if (IS_ERR(new_parent)) { 860 if (IS_ERR(new_parent)) {
857 error = PTR_ERR(new_parent); 861 error = PTR_ERR(new_parent);
862 new_parent = NULL;
858 goto out; 863 goto out;
859 } 864 }
860 865
@@ -878,7 +883,6 @@ again:
878 error = 0; 883 error = 0;
879 d_add(new_dentry, NULL); 884 d_add(new_dentry, NULL);
880 d_move(old_dentry, new_dentry); 885 d_move(old_dentry, new_dentry);
881 dput(new_dentry);
882 886
883 /* Remove from old parent's list and insert into new parent's list. */ 887 /* Remove from old parent's list and insert into new parent's list. */
884 sysfs_unlink_sibling(sd); 888 sysfs_unlink_sibling(sd);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4045bdcc4b33..a271c87c4472 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,43 +20,6 @@
20 20
21#include "sysfs.h" 21#include "sysfs.h"
22 22
23#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
24
25/*
26 * Subsystem file operations.
27 * These operations allow subsystems to have files that can be
28 * read/written.
29 */
30static ssize_t
31subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
32{
33 struct kset *kset = to_kset(kobj);
34 struct subsys_attribute * sattr = to_sattr(attr);
35 ssize_t ret = -EIO;
36
37 if (sattr->show)
38 ret = sattr->show(kset, page);
39 return ret;
40}
41
42static ssize_t
43subsys_attr_store(struct kobject * kobj, struct attribute * attr,
44 const char * page, size_t count)
45{
46 struct kset *kset = to_kset(kobj);
47 struct subsys_attribute * sattr = to_sattr(attr);
48 ssize_t ret = -EIO;
49
50 if (sattr->store)
51 ret = sattr->store(kset, page, count);
52 return ret;
53}
54
55static struct sysfs_ops subsys_sysfs_ops = {
56 .show = subsys_attr_show,
57 .store = subsys_attr_store,
58};
59
60/* 23/*
61 * There's one sysfs_buffer for each open file and one 24 * There's one sysfs_buffer for each open file and one
62 * sysfs_open_dirent for each sysfs_dirent with one or more open 25 * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -66,7 +29,7 @@ static struct sysfs_ops subsys_sysfs_ops = {
66 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open 29 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open
67 * is protected by sysfs_open_dirent_lock. 30 * is protected by sysfs_open_dirent_lock.
68 */ 31 */
69static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED; 32static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
70 33
71struct sysfs_open_dirent { 34struct sysfs_open_dirent {
72 atomic_t refcnt; 35 atomic_t refcnt;
@@ -354,31 +317,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
354{ 317{
355 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 318 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
356 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 319 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
357 struct sysfs_buffer * buffer; 320 struct sysfs_buffer *buffer;
358 struct sysfs_ops * ops = NULL; 321 struct sysfs_ops *ops;
359 int error; 322 int error = -EACCES;
360 323
361 /* need attr_sd for attr and ops, its parent for kobj */ 324 /* need attr_sd for attr and ops, its parent for kobj */
362 if (!sysfs_get_active_two(attr_sd)) 325 if (!sysfs_get_active_two(attr_sd))
363 return -ENODEV; 326 return -ENODEV;
364 327
365 /* if the kobject has no ktype, then we assume that it is a subsystem 328 /* every kobject with an attribute needs a ktype assigned */
366 * itself, and use ops for it. 329 if (kobj->ktype && kobj->ktype->sysfs_ops)
367 */
368 if (kobj->kset && kobj->kset->ktype)
369 ops = kobj->kset->ktype->sysfs_ops;
370 else if (kobj->ktype)
371 ops = kobj->ktype->sysfs_ops; 330 ops = kobj->ktype->sysfs_ops;
372 else 331 else {
373 ops = &subsys_sysfs_ops; 332 printk(KERN_ERR "missing sysfs attribute operations for "
374 333 "kobject: %s\n", kobject_name(kobj));
375 error = -EACCES; 334 WARN_ON(1);
376
377 /* No sysfs operations, either from having no subsystem,
378 * or the subsystem have no operations.
379 */
380 if (!ops)
381 goto err_out; 335 goto err_out;
336 }
382 337
383 /* File needs write support. 338 /* File needs write support.
384 * The inode's perms must say it's ok, 339 * The inode's perms must say it's ok,
@@ -568,7 +523,11 @@ int sysfs_add_file_to_group(struct kobject *kobj,
568 struct sysfs_dirent *dir_sd; 523 struct sysfs_dirent *dir_sd;
569 int error; 524 int error;
570 525
571 dir_sd = sysfs_get_dirent(kobj->sd, group); 526 if (group)
527 dir_sd = sysfs_get_dirent(kobj->sd, group);
528 else
529 dir_sd = sysfs_get(kobj->sd);
530
572 if (!dir_sd) 531 if (!dir_sd)
573 return -ENOENT; 532 return -ENOENT;
574 533
@@ -656,7 +615,10 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
656{ 615{
657 struct sysfs_dirent *dir_sd; 616 struct sysfs_dirent *dir_sd;
658 617
659 dir_sd = sysfs_get_dirent(kobj->sd, group); 618 if (group)
619 dir_sd = sysfs_get_dirent(kobj->sd, group);
620 else
621 dir_sd = sysfs_get(kobj->sd);
660 if (dir_sd) { 622 if (dir_sd) {
661 sysfs_hash_and_remove(dir_sd, attr->name); 623 sysfs_hash_and_remove(dir_sd, attr->name);
662 sysfs_put(dir_sd); 624 sysfs_put(dir_sd);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d1972374655a..0871c3dadce1 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -16,25 +16,31 @@
16#include "sysfs.h" 16#include "sysfs.h"
17 17
18 18
19static void remove_files(struct sysfs_dirent *dir_sd, 19static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
20 const struct attribute_group *grp) 20 const struct attribute_group *grp)
21{ 21{
22 struct attribute *const* attr; 22 struct attribute *const* attr;
23 int i;
23 24
24 for (attr = grp->attrs; *attr; attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
25 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 if (!grp->is_visible ||
27 grp->is_visible(kobj, *attr, i))
28 sysfs_hash_and_remove(dir_sd, (*attr)->name);
26} 29}
27 30
28static int create_files(struct sysfs_dirent *dir_sd, 31static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
29 const struct attribute_group *grp) 32 const struct attribute_group *grp)
30{ 33{
31 struct attribute *const* attr; 34 struct attribute *const* attr;
32 int error = 0; 35 int error = 0, i;
33 36
34 for (attr = grp->attrs; *attr && !error; attr++) 37 for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++)
35 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); 38 if (!grp->is_visible ||
39 grp->is_visible(kobj, *attr, i))
40 error |=
41 sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
36 if (error) 42 if (error)
37 remove_files(dir_sd, grp); 43 remove_files(dir_sd, kobj, grp);
38 return error; 44 return error;
39} 45}
40 46
@@ -54,7 +60,7 @@ int sysfs_create_group(struct kobject * kobj,
54 } else 60 } else
55 sd = kobj->sd; 61 sd = kobj->sd;
56 sysfs_get(sd); 62 sysfs_get(sd);
57 error = create_files(sd, grp); 63 error = create_files(sd, kobj, grp);
58 if (error) { 64 if (error) {
59 if (grp->name) 65 if (grp->name)
60 sysfs_remove_subdir(sd); 66 sysfs_remove_subdir(sd);
@@ -75,7 +81,7 @@ void sysfs_remove_group(struct kobject * kobj,
75 } else 81 } else
76 sd = sysfs_get(dir_sd); 82 sd = sysfs_get(dir_sd);
77 83
78 remove_files(sd, grp); 84 remove_files(sd, kobj, grp);
79 if (grp->name) 85 if (grp->name)
80 sysfs_remove_subdir(sd); 86 sysfs_remove_subdir(sd);
81 87
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3eac20c63c41..5f66c4466151 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,39 +19,6 @@
19 19
20#include "sysfs.h" 20#include "sysfs.h"
21 21
22static int object_depth(struct sysfs_dirent *sd)
23{
24 int depth = 0;
25
26 for (; sd->s_parent; sd = sd->s_parent)
27 depth++;
28
29 return depth;
30}
31
32static int object_path_length(struct sysfs_dirent * sd)
33{
34 int length = 1;
35
36 for (; sd->s_parent; sd = sd->s_parent)
37 length += strlen(sd->s_name) + 1;
38
39 return length;
40}
41
42static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length)
43{
44 --length;
45 for (; sd->s_parent; sd = sd->s_parent) {
46 int cur = strlen(sd->s_name);
47
48 /* back up enough to print this bus id with '/' */
49 length -= cur;
50 strncpy(buffer + length, sd->s_name, cur);
51 *(buffer + --length) = '/';
52 }
53}
54
55/** 22/**
56 * sysfs_create_link - create symlink between two objects. 23 * sysfs_create_link - create symlink between two objects.
57 * @kobj: object whose directory we're creating the link in. 24 * @kobj: object whose directory we're creating the link in.
@@ -112,7 +79,6 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
112 return error; 79 return error;
113} 80}
114 81
115
116/** 82/**
117 * sysfs_remove_link - remove symlink in object's directory. 83 * sysfs_remove_link - remove symlink in object's directory.
118 * @kobj: object we're acting for. 84 * @kobj: object we're acting for.
@@ -124,24 +90,54 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
124 sysfs_hash_and_remove(kobj->sd, name); 90 sysfs_hash_and_remove(kobj->sd, name);
125} 91}
126 92
127static int sysfs_get_target_path(struct sysfs_dirent * parent_sd, 93static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
128 struct sysfs_dirent * target_sd, char *path) 94 struct sysfs_dirent *target_sd, char *path)
129{ 95{
130 char * s; 96 struct sysfs_dirent *base, *sd;
131 int depth, size; 97 char *s = path;
98 int len = 0;
99
100 /* go up to the root, stop at the base */
101 base = parent_sd;
102 while (base->s_parent) {
103 sd = target_sd->s_parent;
104 while (sd->s_parent && base != sd)
105 sd = sd->s_parent;
106
107 if (base == sd)
108 break;
109
110 strcpy(s, "../");
111 s += 3;
112 base = base->s_parent;
113 }
114
115 /* determine end of target string for reverse fillup */
116 sd = target_sd;
117 while (sd->s_parent && sd != base) {
118 len += strlen(sd->s_name) + 1;
119 sd = sd->s_parent;
120 }
132 121
133 depth = object_depth(parent_sd); 122 /* check limits */
134 size = object_path_length(target_sd) + depth * 3 - 1; 123 if (len < 2)
135 if (size > PATH_MAX) 124 return -EINVAL;
125 len--;
126 if ((s - path) + len > PATH_MAX)
136 return -ENAMETOOLONG; 127 return -ENAMETOOLONG;
137 128
138 pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size); 129 /* reverse fillup of target string from target to base */
130 sd = target_sd;
131 while (sd->s_parent && sd != base) {
132 int slen = strlen(sd->s_name);
139 133
140 for (s = path; depth--; s += 3) 134 len -= slen;
141 strcpy(s,"../"); 135 strncpy(s + len, sd->s_name, slen);
136 if (len)
137 s[--len] = '/';
142 138
143 fill_object_path(target_sd, path, size); 139 sd = sd->s_parent;
144 pr_debug("%s: path = '%s'\n", __FUNCTION__, path); 140 }
145 141
146 return 0; 142 return 0;
147} 143}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b9c8589e05c2..a49dd8d4b069 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -725,15 +725,15 @@ xfs_buf_associate_memory(
725{ 725{
726 int rval; 726 int rval;
727 int i = 0; 727 int i = 0;
728 size_t ptr; 728 unsigned long pageaddr;
729 size_t end, end_cur; 729 unsigned long offset;
730 off_t offset; 730 size_t buflen;
731 int page_count; 731 int page_count;
732 732
733 page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; 733 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
734 offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); 734 offset = (unsigned long)mem - pageaddr;
735 if (offset && (len > PAGE_CACHE_SIZE)) 735 buflen = PAGE_CACHE_ALIGN(len + offset);
736 page_count++; 736 page_count = buflen >> PAGE_CACHE_SHIFT;
737 737
738 /* Free any previous set of page pointers */ 738 /* Free any previous set of page pointers */
739 if (bp->b_pages) 739 if (bp->b_pages)
@@ -747,22 +747,15 @@ xfs_buf_associate_memory(
747 return rval; 747 return rval;
748 748
749 bp->b_offset = offset; 749 bp->b_offset = offset;
750 ptr = (size_t) mem & PAGE_CACHE_MASK; 750
751 end = PAGE_CACHE_ALIGN((size_t) mem + len); 751 for (i = 0; i < bp->b_page_count; i++) {
752 end_cur = end; 752 bp->b_pages[i] = mem_to_page((void *)pageaddr);
753 /* set up first page */ 753 pageaddr += PAGE_CACHE_SIZE;
754 bp->b_pages[0] = mem_to_page(mem);
755
756 ptr += PAGE_CACHE_SIZE;
757 bp->b_page_count = ++i;
758 while (ptr < end) {
759 bp->b_pages[i] = mem_to_page((void *)ptr);
760 bp->b_page_count = ++i;
761 ptr += PAGE_CACHE_SIZE;
762 } 754 }
763 bp->b_locked = 0; 755 bp->b_locked = 0;
764 756
765 bp->b_count_desired = bp->b_buffer_length = len; 757 bp->b_count_desired = len;
758 bp->b_buffer_length = buflen;
766 bp->b_flags |= XBF_MAPPED; 759 bp->b_flags |= XBF_MAPPED;
767 760
768 return 0; 761 return 0;
@@ -1032,7 +1025,7 @@ xfs_buf_ioend(
1032 xfs_buf_t *bp, 1025 xfs_buf_t *bp,
1033 int schedule) 1026 int schedule)
1034{ 1027{
1035 bp->b_flags &= ~(XBF_READ | XBF_WRITE); 1028 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1036 if (bp->b_error == 0) 1029 if (bp->b_error == 0)
1037 bp->b_flags |= XBF_DONE; 1030 bp->b_flags |= XBF_DONE;
1038 1031
@@ -1750,6 +1743,8 @@ xfsbufd(
1750 1743
1751 current->flags |= PF_MEMALLOC; 1744 current->flags |= PF_MEMALLOC;
1752 1745
1746 set_freezable();
1747
1753 do { 1748 do {
1754 if (unlikely(freezing(current))) { 1749 if (unlikely(freezing(current))) {
1755 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1750 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index fb8dd34041eb..21a1c2b1c5fc 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -218,6 +218,15 @@ xfs_vm_fault(
218} 218}
219#endif /* CONFIG_XFS_DMAPI */ 219#endif /* CONFIG_XFS_DMAPI */
220 220
221/*
222 * Unfortunately we can't just use the clean and simple readdir implementation
223 * below, because nfs might call back into ->lookup from the filldir callback
224 * and that will deadlock the low-level btree code.
225 *
226 * Hopefully we'll find a better workaround that allows to use the optimal
227 * version at least for local readdirs for 2.6.25.
228 */
229#if 0
221STATIC int 230STATIC int
222xfs_file_readdir( 231xfs_file_readdir(
223 struct file *filp, 232 struct file *filp,
@@ -249,6 +258,126 @@ xfs_file_readdir(
249 return -error; 258 return -error;
250 return 0; 259 return 0;
251} 260}
261#else
262
263struct hack_dirent {
264 u64 ino;
265 loff_t offset;
266 int namlen;
267 unsigned int d_type;
268 char name[];
269};
270
271struct hack_callback {
272 char *dirent;
273 size_t len;
274 size_t used;
275};
276
277STATIC int
278xfs_hack_filldir(
279 void *__buf,
280 const char *name,
281 int namlen,
282 loff_t offset,
283 u64 ino,
284 unsigned int d_type)
285{
286 struct hack_callback *buf = __buf;
287 struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used);
288 unsigned int reclen;
289
290 reclen = ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64));
291 if (buf->used + reclen > buf->len)
292 return -EINVAL;
293
294 de->namlen = namlen;
295 de->offset = offset;
296 de->ino = ino;
297 de->d_type = d_type;
298 memcpy(de->name, name, namlen);
299 buf->used += reclen;
300 return 0;
301}
302
303STATIC int
304xfs_file_readdir(
305 struct file *filp,
306 void *dirent,
307 filldir_t filldir)
308{
309 struct inode *inode = filp->f_path.dentry->d_inode;
310 xfs_inode_t *ip = XFS_I(inode);
311 struct hack_callback buf;
312 struct hack_dirent *de;
313 int error;
314 loff_t size;
315 int eof = 0;
316 xfs_off_t start_offset, curr_offset, offset;
317
318 /*
319 * Try fairly hard to get memory
320 */
321 buf.len = PAGE_CACHE_SIZE;
322 do {
323 buf.dirent = kmalloc(buf.len, GFP_KERNEL);
324 if (buf.dirent)
325 break;
326 buf.len >>= 1;
327 } while (buf.len >= 1024);
328
329 if (!buf.dirent)
330 return -ENOMEM;
331
332 curr_offset = filp->f_pos;
333 if (curr_offset == 0x7fffffff)
334 offset = 0xffffffff;
335 else
336 offset = filp->f_pos;
337
338 while (!eof) {
339 unsigned int reclen;
340
341 start_offset = offset;
342
343 buf.used = 0;
344 error = -xfs_readdir(ip, &buf, buf.len, &offset,
345 xfs_hack_filldir);
346 if (error || offset == start_offset) {
347 size = 0;
348 break;
349 }
350
351 size = buf.used;
352 de = (struct hack_dirent *)buf.dirent;
353 curr_offset = de->offset /* & 0x7fffffff */;
354 while (size > 0) {
355 if (filldir(dirent, de->name, de->namlen,
356 curr_offset & 0x7fffffff,
357 de->ino, de->d_type)) {
358 goto done;
359 }
360
361 reclen = ALIGN(sizeof(struct hack_dirent) + de->namlen,
362 sizeof(u64));
363 size -= reclen;
364 de = (struct hack_dirent *)((char *)de + reclen);
365 curr_offset = de->offset /* & 0x7fffffff */;
366 }
367 }
368
369 done:
370 if (!error) {
371 if (size == 0)
372 filp->f_pos = offset & 0x7fffffff;
373 else if (de)
374 filp->f_pos = curr_offset;
375 }
376
377 kfree(buf.dirent);
378 return error;
379}
380#endif
252 381
253STATIC int 382STATIC int
254xfs_file_mmap( 383xfs_file_mmap(
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2b34bad48b07..98a56568bb24 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1047,24 +1047,20 @@ xfs_ioc_bulkstat(
1047 if ((count = bulkreq.icount) <= 0) 1047 if ((count = bulkreq.icount) <= 0)
1048 return -XFS_ERROR(EINVAL); 1048 return -XFS_ERROR(EINVAL);
1049 1049
1050 if (bulkreq.ubuffer == NULL)
1051 return -XFS_ERROR(EINVAL);
1052
1050 if (cmd == XFS_IOC_FSINUMBERS) 1053 if (cmd == XFS_IOC_FSINUMBERS)
1051 error = xfs_inumbers(mp, &inlast, &count, 1054 error = xfs_inumbers(mp, &inlast, &count,
1052 bulkreq.ubuffer, xfs_inumbers_fmt); 1055 bulkreq.ubuffer, xfs_inumbers_fmt);
1053 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) 1056 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
1054 error = xfs_bulkstat_single(mp, &inlast, 1057 error = xfs_bulkstat_single(mp, &inlast,
1055 bulkreq.ubuffer, &done); 1058 bulkreq.ubuffer, &done);
1056 else { /* XFS_IOC_FSBULKSTAT */ 1059 else /* XFS_IOC_FSBULKSTAT */
1057 if (count == 1 && inlast != 0) { 1060 error = xfs_bulkstat(mp, &inlast, &count,
1058 inlast++; 1061 (bulkstat_one_pf)xfs_bulkstat_one, NULL,
1059 error = xfs_bulkstat_single(mp, &inlast, 1062 sizeof(xfs_bstat_t), bulkreq.ubuffer,
1060 bulkreq.ubuffer, &done); 1063 BULKSTAT_FG_QUICK, &done);
1061 } else {
1062 error = xfs_bulkstat(mp, &inlast, &count,
1063 (bulkstat_one_pf)xfs_bulkstat_one, NULL,
1064 sizeof(xfs_bstat_t), bulkreq.ubuffer,
1065 BULKSTAT_FG_QUICK, &done);
1066 }
1067 }
1068 1064
1069 if (error) 1065 if (error)
1070 return -error; 1066 return -error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0046bdd5b7f1..bf2a956b63c2 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -291,6 +291,9 @@ xfs_ioc_bulkstat_compat(
291 if ((count = bulkreq.icount) <= 0) 291 if ((count = bulkreq.icount) <= 0)
292 return -XFS_ERROR(EINVAL); 292 return -XFS_ERROR(EINVAL);
293 293
294 if (bulkreq.ubuffer == NULL)
295 return -XFS_ERROR(EINVAL);
296
294 if (cmd == XFS_IOC_FSINUMBERS) 297 if (cmd == XFS_IOC_FSINUMBERS)
295 error = xfs_inumbers(mp, &inlast, &count, 298 error = xfs_inumbers(mp, &inlast, &count,
296 bulkreq.ubuffer, xfs_inumbers_fmt_compat); 299 bulkreq.ubuffer, xfs_inumbers_fmt_compat);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ac50f8a37582..5e8bb7f71b5a 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -117,7 +117,7 @@ xfs_ichgtime(
117 */ 117 */
118 SYNCHRONIZE(); 118 SYNCHRONIZE();
119 ip->i_update_core = 1; 119 ip->i_update_core = 1;
120 if (!(inode->i_state & I_SYNC)) 120 if (!(inode->i_state & I_NEW))
121 mark_inode_dirty_sync(inode); 121 mark_inode_dirty_sync(inode);
122} 122}
123 123
@@ -169,7 +169,7 @@ xfs_ichgtime_fast(
169 */ 169 */
170 SYNCHRONIZE(); 170 SYNCHRONIZE();
171 ip->i_update_core = 1; 171 ip->i_update_core = 1;
172 if (!(inode->i_state & I_SYNC)) 172 if (!(inode->i_state & I_NEW))
173 mark_inode_dirty_sync(inode); 173 mark_inode_dirty_sync(inode);
174} 174}
175 175
@@ -332,9 +332,7 @@ xfs_vn_mknod(
332 ASSERT(vp); 332 ASSERT(vp);
333 ip = vn_to_inode(vp); 333 ip = vn_to_inode(vp);
334 334
335 if (S_ISCHR(mode) || S_ISBLK(mode)) 335 if (S_ISDIR(mode))
336 ip->i_rdev = rdev;
337 else if (S_ISDIR(mode))
338 xfs_validate_fields(ip); 336 xfs_validate_fields(ip);
339 d_instantiate(dentry, ip); 337 d_instantiate(dentry, ip);
340 xfs_validate_fields(dir); 338 xfs_validate_fields(dir);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index b5f91281b707..d488645f833d 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1008,6 +1008,9 @@ xfs_qm_sync(
1008 boolean_t nowait; 1008 boolean_t nowait;
1009 int error; 1009 int error;
1010 1010
1011 if (! XFS_IS_QUOTA_ON(mp))
1012 return 0;
1013
1011 restarts = 0; 1014 restarts = 0;
1012 /* 1015 /*
1013 * We won't block unless we are asked to. 1016 * We won't block unless we are asked to.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index c171767e242a..a5f4f4fb8868 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -508,7 +508,7 @@ xfs_dir2_block_getdents(
508 continue; 508 continue;
509 509
510 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 510 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
511 ptr - (char *)block); 511 (char *)dep - (char *)block);
512 ino = be64_to_cpu(dep->inumber); 512 ino = be64_to_cpu(dep->inumber);
513#if XFS_BIG_INUMS 513#if XFS_BIG_INUMS
514 ino += mp->m_inoadd; 514 ino += mp->m_inoadd;
@@ -519,9 +519,7 @@ xfs_dir2_block_getdents(
519 */ 519 */
520 if (filldir(dirent, dep->name, dep->namelen, cook, 520 if (filldir(dirent, dep->name, dep->namelen, cook,
521 ino, DT_UNKNOWN)) { 521 ino, DT_UNKNOWN)) {
522 *offset = xfs_dir2_db_off_to_dataptr(mp, 522 *offset = cook;
523 mp->m_dirdatablk,
524 (char *)dep - (char *)block);
525 xfs_da_brelse(NULL, bp); 523 xfs_da_brelse(NULL, bp);
526 return 0; 524 return 0;
527 } 525 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e7c12fa1303e..0ca0020ba09f 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1091,7 +1091,7 @@ xfs_dir2_leaf_getdents(
1091 * Won't fit. Return to caller. 1091 * Won't fit. Return to caller.
1092 */ 1092 */
1093 if (filldir(dirent, dep->name, dep->namelen, 1093 if (filldir(dirent, dep->name, dep->namelen,
1094 xfs_dir2_byte_to_dataptr(mp, curoff + length), 1094 xfs_dir2_byte_to_dataptr(mp, curoff),
1095 ino, DT_UNKNOWN)) 1095 ino, DT_UNKNOWN))
1096 break; 1096 break;
1097 1097
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 182c70315ad1..919d275a1cef 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,7 +752,7 @@ xfs_dir2_sf_getdents(
752#if XFS_BIG_INUMS 752#if XFS_BIG_INUMS
753 ino += mp->m_inoadd; 753 ino += mp->m_inoadd;
754#endif 754#endif
755 if (filldir(dirent, ".", 1, dotdot_offset, ino, DT_DIR)) { 755 if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) {
756 *offset = dot_offset; 756 *offset = dot_offset;
757 return 0; 757 return 0;
758 } 758 }
@@ -762,13 +762,11 @@ xfs_dir2_sf_getdents(
762 * Put .. entry unless we're starting past it. 762 * Put .. entry unless we're starting past it.
763 */ 763 */
764 if (*offset <= dotdot_offset) { 764 if (*offset <= dotdot_offset) {
765 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
766 XFS_DIR2_DATA_FIRST_OFFSET);
767 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); 765 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
768#if XFS_BIG_INUMS 766#if XFS_BIG_INUMS
769 ino += mp->m_inoadd; 767 ino += mp->m_inoadd;
770#endif 768#endif
771 if (filldir(dirent, "..", 2, off, ino, DT_DIR)) { 769 if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) {
772 *offset = dotdot_offset; 770 *offset = dotdot_offset;
773 return 0; 771 return 0;
774 } 772 }
@@ -793,8 +791,7 @@ xfs_dir2_sf_getdents(
793#endif 791#endif
794 792
795 if (filldir(dirent, sfep->name, sfep->namelen, 793 if (filldir(dirent, sfep->name, sfep->namelen,
796 off + xfs_dir2_data_entsize(sfep->namelen), 794 off, ino, DT_UNKNOWN)) {
797 ino, DT_UNKNOWN)) {
798 *offset = off; 795 *offset = off;
799 return 0; 796 return 0;
800 } 797 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 488836e204a3..fb69ef180b27 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -267,7 +267,7 @@ finish_inode:
267 icl = NULL; 267 icl = NULL;
268 if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq, 268 if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
269 first_index, 1)) { 269 first_index, 1)) {
270 if ((iq->i_ino & mask) == first_index) 270 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
271 icl = iq->i_cluster; 271 icl = iq->i_cluster;
272 } 272 }
273 273
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index abf509a88915..344948082819 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1459,8 +1459,10 @@ xfs_itruncate_start(
1459 mp = ip->i_mount; 1459 mp = ip->i_mount;
1460 vp = XFS_ITOV(ip); 1460 vp = XFS_ITOV(ip);
1461 1461
1462 vn_iowait(ip); /* wait for the completion of any pending DIOs */ 1462 /* wait for the completion of any pending DIOs */
1463 1463 if (new_size < ip->i_size)
1464 vn_iowait(ip);
1465
1464 /* 1466 /*
1465 * Call toss_pages or flushinval_pages to get rid of pages 1467 * Call toss_pages or flushinval_pages to get rid of pages
1466 * overlapping the region being removed. We have to use 1468 * overlapping the region being removed. We have to use
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 9972992fd3c3..9fc4c2886529 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -316,6 +316,8 @@ xfs_bulkstat_use_dinode(
316 return 1; 316 return 1;
317} 317}
318 318
319#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
320
319/* 321/*
320 * Return stat information in bulk (by-inode) for the filesystem. 322 * Return stat information in bulk (by-inode) for the filesystem.
321 */ 323 */
@@ -353,7 +355,7 @@ xfs_bulkstat(
353 xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ 355 xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
354 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ 356 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
355 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ 357 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
356 xfs_ino_t lastino=0; /* last inode number returned */ 358 xfs_ino_t lastino; /* last inode number returned */
357 int nbcluster; /* # of blocks in a cluster */ 359 int nbcluster; /* # of blocks in a cluster */
358 int nicluster; /* # of inodes in a cluster */ 360 int nicluster; /* # of inodes in a cluster */
359 int nimask; /* mask for inode clusters */ 361 int nimask; /* mask for inode clusters */
@@ -373,6 +375,7 @@ xfs_bulkstat(
373 * Get the last inode value, see if there's nothing to do. 375 * Get the last inode value, see if there's nothing to do.
374 */ 376 */
375 ino = (xfs_ino_t)*lastinop; 377 ino = (xfs_ino_t)*lastinop;
378 lastino = ino;
376 dip = NULL; 379 dip = NULL;
377 agno = XFS_INO_TO_AGNO(mp, ino); 380 agno = XFS_INO_TO_AGNO(mp, ino);
378 agino = XFS_INO_TO_AGINO(mp, ino); 381 agino = XFS_INO_TO_AGINO(mp, ino);
@@ -382,6 +385,9 @@ xfs_bulkstat(
382 *ubcountp = 0; 385 *ubcountp = 0;
383 return 0; 386 return 0;
384 } 387 }
388 if (!ubcountp || *ubcountp <= 0) {
389 return EINVAL;
390 }
385 ubcount = *ubcountp; /* statstruct's */ 391 ubcount = *ubcountp; /* statstruct's */
386 ubleft = ubcount * statstruct_size; /* bytes */ 392 ubleft = ubcount * statstruct_size; /* bytes */
387 *ubcountp = ubelem = 0; 393 *ubcountp = ubelem = 0;
@@ -402,7 +408,8 @@ xfs_bulkstat(
402 * inode returned; 0 means start of the allocation group. 408 * inode returned; 0 means start of the allocation group.
403 */ 409 */
404 rval = 0; 410 rval = 0;
405 while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) { 411 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
412 cond_resched();
406 bp = NULL; 413 bp = NULL;
407 down_read(&mp->m_peraglock); 414 down_read(&mp->m_peraglock);
408 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 415 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
@@ -499,6 +506,7 @@ xfs_bulkstat(
499 break; 506 break;
500 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, 507 error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
501 &tmp); 508 &tmp);
509 cond_resched();
502 } 510 }
503 /* 511 /*
504 * If ran off the end of the ag either with an error, 512 * If ran off the end of the ag either with an error,
@@ -542,6 +550,7 @@ xfs_bulkstat(
542 */ 550 */
543 agino = gino + XFS_INODES_PER_CHUNK; 551 agino = gino + XFS_INODES_PER_CHUNK;
544 error = xfs_inobt_increment(cur, 0, &tmp); 552 error = xfs_inobt_increment(cur, 0, &tmp);
553 cond_resched();
545 } 554 }
546 /* 555 /*
547 * Drop the btree buffers and the agi buffer. 556 * Drop the btree buffers and the agi buffer.
@@ -555,12 +564,12 @@ xfs_bulkstat(
555 */ 564 */
556 irbufend = irbp; 565 irbufend = irbp;
557 for (irbp = irbuf; 566 for (irbp = irbuf;
558 irbp < irbufend && ubleft >= statstruct_size; irbp++) { 567 irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
559 /* 568 /*
560 * Now process this chunk of inodes. 569 * Now process this chunk of inodes.
561 */ 570 */
562 for (agino = irbp->ir_startino, chunkidx = clustidx = 0; 571 for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
563 ubleft > 0 && 572 XFS_BULKSTAT_UBLEFT(ubleft) &&
564 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 573 irbp->ir_freecount < XFS_INODES_PER_CHUNK;
565 chunkidx++, clustidx++, agino++) { 574 chunkidx++, clustidx++, agino++) {
566 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 575 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
@@ -663,15 +672,13 @@ xfs_bulkstat(
663 ubleft, private_data, 672 ubleft, private_data,
664 bno, &ubused, dip, &fmterror); 673 bno, &ubused, dip, &fmterror);
665 if (fmterror == BULKSTAT_RV_NOTHING) { 674 if (fmterror == BULKSTAT_RV_NOTHING) {
666 if (error == EFAULT) { 675 if (error && error != ENOENT &&
667 ubleft = 0; 676 error != EINVAL) {
668 rval = error;
669 break;
670 }
671 else if (error == ENOMEM)
672 ubleft = 0; 677 ubleft = 0;
673 else 678 rval = error;
674 lastino = ino; 679 break;
680 }
681 lastino = ino;
675 continue; 682 continue;
676 } 683 }
677 if (fmterror == BULKSTAT_RV_GIVEUP) { 684 if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -686,6 +693,8 @@ xfs_bulkstat(
686 ubelem++; 693 ubelem++;
687 lastino = ino; 694 lastino = ino;
688 } 695 }
696
697 cond_resched();
689 } 698 }
690 699
691 if (bp) 700 if (bp)
@@ -694,11 +703,12 @@ xfs_bulkstat(
694 /* 703 /*
695 * Set up for the next loop iteration. 704 * Set up for the next loop iteration.
696 */ 705 */
697 if (ubleft > 0) { 706 if (XFS_BULKSTAT_UBLEFT(ubleft)) {
698 if (end_of_ag) { 707 if (end_of_ag) {
699 agno++; 708 agno++;
700 agino = 0; 709 agino = 0;
701 } 710 } else
711 agino = XFS_INO_TO_AGINO(mp, lastino);
702 } else 712 } else
703 break; 713 break;
704 } 714 }
@@ -707,6 +717,11 @@ xfs_bulkstat(
707 */ 717 */
708 kmem_free(irbuf, irbsize); 718 kmem_free(irbuf, irbsize);
709 *ubcountp = ubelem; 719 *ubcountp = ubelem;
720 /*
721 * Found some inodes, return them now and return the error next time.
722 */
723 if (ubelem)
724 rval = 0;
710 if (agno >= mp->m_sb.sb_agcount) { 725 if (agno >= mp->m_sb.sb_agcount) {
711 /* 726 /*
712 * If we ran out of filesystem, mark lastino as off 727 * If we ran out of filesystem, mark lastino as off