aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorHaavard Skinnemoen <haavard.skinnemoen@atmel.com>2009-03-27 11:14:38 -0400
committerHaavard Skinnemoen <haavard.skinnemoen@atmel.com>2009-03-27 11:14:38 -0400
commitb92efa9abffc4a634cd2e7a0f81f8aa6310d67c9 (patch)
tree9847508d9b8d4e585f90db4a453bfbc3700c997e /fs
parenta16fffdd8eb95ebab7dc22414896fe6493951e0e (diff)
parentbe0ea69674ed95e1e98cb3687a241badc756d228 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into avr32-arch
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig10
-rw-r--r--fs/Kconfig1401
-rw-r--r--fs/Makefile6
-rw-r--r--fs/adfs/Kconfig27
-rw-r--r--fs/affs/Kconfig21
-rw-r--r--fs/afs/Kconfig21
-rw-r--r--fs/aio.c42
-rw-r--r--fs/autofs/Kconfig21
-rw-r--r--fs/autofs4/Kconfig20
-rw-r--r--fs/befs/Kconfig26
-rw-r--r--fs/bfs/Kconfig19
-rw-r--r--fs/binfmt_elf.c14
-rw-r--r--fs/bio-integrity.c112
-rw-r--r--fs/bio.c92
-rw-r--r--fs/btrfs/Kconfig31
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c325
-rw-r--r--fs/btrfs/ctree.h86
-rw-r--r--fs/btrfs/disk-io.c172
-rw-r--r--fs/btrfs/disk-io.h12
-rw-r--r--fs/btrfs/extent-tree.c818
-rw-r--r--fs/btrfs/extent_io.c134
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file.c29
-rw-r--r--fs/btrfs/inode-map.c1
-rw-r--r--fs/btrfs/inode.c142
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/ioctl.h14
-rw-r--r--fs/btrfs/locking.c207
-rw-r--r--fs/btrfs/locking.h8
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
-rw-r--r--fs/btrfs/super.c14
-rw-r--r--fs/btrfs/transaction.c6
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c356
-rw-r--r--fs/btrfs/volumes.c62
-rw-r--r--fs/btrfs/xattr.c48
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/buffer.c28
-rw-r--r--fs/cifs/CHANGES26
-rw-r--r--fs/cifs/Kconfig21
-rw-r--r--fs/cifs/README22
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c36
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsencrypt.c18
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/cifspdu.h76
-rw-r--r--fs/cifs/cifsproto.h17
-rw-r--r--fs/cifs/cifssmb.c34
-rw-r--r--fs/cifs/connect.c84
-rw-r--r--fs/cifs/dir.c339
-rw-r--r--fs/cifs/file.c199
-rw-r--r--fs/cifs/inode.c112
-rw-r--r--fs/cifs/md5.c38
-rw-r--r--fs/cifs/md5.h6
-rw-r--r--fs/cifs/readdir.c58
-rw-r--r--fs/cifs/sess.c91
-rw-r--r--fs/cifs/smbfsctl.h84
-rw-r--r--fs/cifs/transport.c127
-rw-r--r--fs/coda/Kconfig21
-rw-r--r--fs/compat.c5
-rw-r--r--fs/compat_ioctl.c22
-rw-r--r--fs/configfs/Kconfig11
-rw-r--r--fs/cramfs/Kconfig19
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/devpts/inode.c5
-rw-r--r--fs/dlm/plock.c6
-rw-r--r--fs/dquot.c223
-rw-r--r--fs/ecryptfs/Kconfig11
-rw-r--r--fs/ecryptfs/crypto.c55
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h4
-rw-r--r--fs/ecryptfs/inode.c32
-rw-r--r--fs/ecryptfs/keystore.c3
-rw-r--r--fs/ecryptfs/main.c5
-rw-r--r--fs/efs/Kconfig14
-rw-r--r--fs/eventpoll.c34
-rw-r--r--fs/exec.c41
-rw-r--r--fs/ext2/super.c9
-rw-r--r--fs/ext3/inode.c18
-rw-r--r--fs/ext3/namei.c20
-rw-r--r--fs/ext3/super.c11
-rw-r--r--fs/ext4/balloc.c10
-rw-r--r--fs/ext4/ext4.h9
-rw-r--r--fs/ext4/extents.c8
-rw-r--r--fs/ext4/ialloc.c17
-rw-r--r--fs/ext4/inode.c47
-rw-r--r--fs/ext4/mballoc.c47
-rw-r--r--fs/ext4/migrate.c8
-rw-r--r--fs/ext4/namei.c21
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c12
-rw-r--r--fs/fat/Kconfig97
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fcntl.c33
-rw-r--r--fs/file_table.c3
-rw-r--r--fs/freevxfs/Kconfig16
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/fuse/Kconfig15
-rw-r--r--fs/fuse/dev.c16
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/fuse/inode.c30
-rw-r--r--fs/gfs2/Kconfig17
-rw-r--r--fs/gfs2/Makefile4
-rw-r--r--fs/gfs2/acl.c1
-rw-r--r--fs/gfs2/bmap.c1
-rw-r--r--fs/gfs2/dir.c1
-rw-r--r--fs/gfs2/eaops.c1
-rw-r--r--fs/gfs2/eattr.c1
-rw-r--r--fs/gfs2/glock.c268
-rw-r--r--fs/gfs2/glock.h127
-rw-r--r--fs/gfs2/glops.c160
-rw-r--r--fs/gfs2/glops.h1
-rw-r--r--fs/gfs2/incore.h71
-rw-r--r--fs/gfs2/inode.c13
-rw-r--r--fs/gfs2/inode.h22
-rw-r--r--fs/gfs2/lock_dlm.c241
-rw-r--r--fs/gfs2/locking.c232
-rw-r--r--fs/gfs2/locking/dlm/Makefile3
-rw-r--r--fs/gfs2/locking/dlm/lock.c708
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h166
-rw-r--r--fs/gfs2/locking/dlm/main.c48
-rw-r--r--fs/gfs2/locking/dlm/mount.c276
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c226
-rw-r--r--fs/gfs2/locking/dlm/thread.c68
-rw-r--r--fs/gfs2/log.c1
-rw-r--r--fs/gfs2/lops.c1
-rw-r--r--fs/gfs2/main.c13
-rw-r--r--fs/gfs2/meta_io.c22
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/mount.c128
-rw-r--r--fs/gfs2/mount.h17
-rw-r--r--fs/gfs2/ops_address.c5
-rw-r--r--fs/gfs2/ops_dentry.c1
-rw-r--r--fs/gfs2/ops_export.c1
-rw-r--r--fs/gfs2/ops_file.c76
-rw-r--r--fs/gfs2/ops_fstype.c156
-rw-r--r--fs/gfs2/ops_inode.c1
-rw-r--r--fs/gfs2/ops_super.c44
-rw-r--r--fs/gfs2/quota.c203
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/recovery.c28
-rw-r--r--fs/gfs2/rgrp.c189
-rw-r--r--fs/gfs2/super.c3
-rw-r--r--fs/gfs2/super.h26
-rw-r--r--fs/gfs2/sys.c236
-rw-r--r--fs/gfs2/trans.c19
-rw-r--r--fs/gfs2/util.c11
-rw-r--r--fs/hfs/Kconfig12
-rw-r--r--fs/hfsplus/Kconfig13
-rw-r--r--fs/hpfs/Kconfig14
-rw-r--r--fs/hugetlbfs/inode.c8
-rw-r--r--fs/inode.c78
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c18
-rw-r--r--fs/isofs/Kconfig39
-rw-r--r--fs/jbd/journal.c17
-rw-r--r--fs/jbd2/journal.c23
-rw-r--r--fs/jbd2/transaction.c42
-rw-r--r--fs/jffs2/background.c18
-rw-r--r--fs/jffs2/readinode.c42
-rw-r--r--fs/jfs/Kconfig49
-rw-r--r--fs/lockd/clntlock.c51
-rw-r--r--fs/lockd/svclock.c6
-rw-r--r--fs/minix/Kconfig17
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/namei.c8
-rw-r--r--fs/namespace.c17
-rw-r--r--fs/ncpfs/Kconfig21
-rw-r--r--fs/nfs/Kconfig86
-rw-r--r--fs/nfs/client.c73
-rw-r--r--fs/nfs/dir.c8
-rw-r--r--fs/nfs/nfs3acl.c27
-rw-r--r--fs/nfs/nfs3xdr.c34
-rw-r--r--fs/nfs/nfs4namespace.c15
-rw-r--r--fs/nfsd/Kconfig80
-rw-r--r--fs/nfsd/auth.c3
-rw-r--r--fs/nfsd/nfs4state.c1
-rw-r--r--fs/nfsd/nfs4xdr.c1
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--fs/notify/inotify/inotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c135
-rw-r--r--fs/ntfs/Kconfig78
-rw-r--r--fs/ocfs2/Kconfig85
-rw-r--r--fs/ocfs2/alloc.c33
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/dcache.c42
-rw-r--r--fs/ocfs2/dcache.h9
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c12
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/dlmglue.c15
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/ocfs2.h9
-rw-r--r--fs/ocfs2/ocfs2_fs.h6
-rw-r--r--fs/ocfs2/quota_global.c173
-rw-r--r--fs/ocfs2/super.c11
-rw-r--r--fs/ocfs2/xattr.c74
-rw-r--r--fs/omfs/Kconfig13
-rw-r--r--fs/partitions/check.c10
-rw-r--r--fs/partitions/ibm.c101
-rw-r--r--fs/pipe.c24
-rw-r--r--fs/proc/base.c16
-rw-r--r--fs/proc/inode.c4
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/qnx4/Kconfig25
-rw-r--r--fs/ramfs/file-nommu.c4
-rw-r--r--fs/reiserfs/Kconfig85
-rw-r--r--fs/romfs/Kconfig16
-rw-r--r--fs/seq_file.c151
-rw-r--r--fs/smbfs/Kconfig55
-rw-r--r--fs/squashfs/Kconfig51
-rw-r--r--fs/squashfs/block.c18
-rw-r--r--fs/squashfs/cache.c4
-rw-r--r--fs/squashfs/inode.c6
-rw-r--r--fs/squashfs/squashfs.h2
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/super.c37
-rw-r--r--fs/sync.c14
-rw-r--r--fs/sysfs/Kconfig23
-rw-r--r--fs/sysfs/bin.c259
-rw-r--r--fs/sysfs/dir.c33
-rw-r--r--fs/sysfs/file.c26
-rw-r--r--fs/sysfs/inode.c17
-rw-r--r--fs/sysfs/mount.c6
-rw-r--r--fs/sysfs/sysfs.h3
-rw-r--r--fs/sysv/Kconfig36
-rw-r--r--fs/timerfd.c12
-rw-r--r--fs/ubifs/budget.c35
-rw-r--r--fs/ubifs/debug.c122
-rw-r--r--fs/ubifs/debug.h36
-rw-r--r--fs/ubifs/dir.c96
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/ubifs/gc.c28
-rw-r--r--fs/ubifs/io.c22
-rw-r--r--fs/ubifs/journal.c2
-rw-r--r--fs/ubifs/lprops.c12
-rw-r--r--fs/ubifs/lpt_commit.c44
-rw-r--r--fs/ubifs/master.c2
-rw-r--r--fs/ubifs/orphan.c38
-rw-r--r--fs/ubifs/super.c195
-rw-r--r--fs/ubifs/tnc.c12
-rw-r--r--fs/ubifs/ubifs.h26
-rw-r--r--fs/udf/Kconfig18
-rw-r--r--fs/ufs/Kconfig43
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c91
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c305
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c184
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c27
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c6
-rw-r--r--fs/xfs/quota/xfs_dquot.c38
-rw-r--r--fs/xfs/quota/xfs_dquot.h10
-rw-r--r--fs/xfs/quota/xfs_qm.c9
-rw-r--r--fs/xfs/xfs_ag.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_attr.c26
-rw-r--r--fs/xfs/xfs_bmap.c166
-rw-r--r--fs/xfs/xfs_bmap.h2
-rw-r--r--fs/xfs/xfs_bmap_btree.c10
-rw-r--r--fs/xfs/xfs_bmap_btree.h4
-rw-r--r--fs/xfs/xfs_btree.c16
-rw-r--r--fs/xfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/xfs_dfrag.c10
-rw-r--r--fs/xfs/xfs_ialloc.c6
-rw-r--r--fs/xfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/xfs_ialloc_btree.h1
-rw-r--r--fs/xfs/xfs_iget.c15
-rw-r--r--fs/xfs/xfs_inode.c19
-rw-r--r--fs/xfs/xfs_inode_item.h4
-rw-r--r--fs/xfs/xfs_iomap.c10
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_log_recover.c48
-rw-r--r--fs/xfs/xfs_mount.c26
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_rename.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c2
-rw-r--r--fs/xfs/xfs_rw.h1
-rw-r--r--fs/xfs/xfs_sb.h2
-rw-r--r--fs/xfs/xfs_vnodeops.c20
290 files changed, 8342 insertions, 6754 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 000000000000..74e0723e90bc
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,10 @@
1config 9P_FS
2 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
3 depends on INET && NET_9P && EXPERIMENTAL
4 help
5 If you say Y here, you will get experimental support for
6 Plan 9 resource sharing via the 9P2000 protocol.
7
8 See <http://v9fs.sf.net> for more information.
9
10 If unsure, say N.
diff --git a/fs/Kconfig b/fs/Kconfig
index 51307b0fdf0f..93945dd0b1ae 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -27,141 +27,8 @@ config FS_MBCACHE
27 default y if EXT4_FS=y && EXT4_FS_XATTR 27 default y if EXT4_FS=y && EXT4_FS_XATTR
28 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR 28 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
29 29
30config REISERFS_FS 30source "fs/reiserfs/Kconfig"
31 tristate "Reiserfs support" 31source "fs/jfs/Kconfig"
32 help
33 Stores not just filenames but the files themselves in a balanced
34 tree. Uses journalling.
35
36 Balanced trees are more efficient than traditional file system
37 architectural foundations.
38
39 In general, ReiserFS is as fast as ext2, but is very efficient with
40 large directories and small files. Additional patches are needed
41 for NFS and quotas, please see <http://www.namesys.com/> for links.
42
43 It is more easily extended to have features currently found in
44 database and keyword search systems than block allocation based file
45 systems are. The next version will be so extended, and will support
46 plugins consistent with our motto ``It takes more than a license to
47 make source code open.''
48
49 Read <http://www.namesys.com/> to learn more about reiserfs.
50
51 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
52
53 If you like it, you can pay us to add new features to it that you
54 need, buy a support contract, or pay us to port it to another OS.
55
56config REISERFS_CHECK
57 bool "Enable reiserfs debug mode"
58 depends on REISERFS_FS
59 help
60 If you set this to Y, then ReiserFS will perform every check it can
61 possibly imagine of its internal consistency throughout its
62 operation. It will also go substantially slower. More than once we
63 have forgotten that this was on, and then gone despondent over the
64 latest benchmarks.:-) Use of this option allows our team to go all
65 out in checking for consistency when debugging without fear of its
66 effect on end users. If you are on the verge of sending in a bug
67 report, say Y and you might get a useful error message. Almost
68 everyone should say N.
69
70config REISERFS_PROC_INFO
71 bool "Stats in /proc/fs/reiserfs"
72 depends on REISERFS_FS && PROC_FS
73 help
74 Create under /proc/fs/reiserfs a hierarchy of files, displaying
75 various ReiserFS statistics and internal data at the expense of
76 making your kernel or module slightly larger (+8 KB). This also
77 increases the amount of kernel memory required for each mount.
78 Almost everyone but ReiserFS developers and people fine-tuning
79 reiserfs or tracing problems should say N.
80
81config REISERFS_FS_XATTR
82 bool "ReiserFS extended attributes"
83 depends on REISERFS_FS
84 help
85 Extended attributes are name:value pairs associated with inodes by
86 the kernel or by users (see the attr(5) manual page, or visit
87 <http://acl.bestbits.at/> for details).
88
89 If unsure, say N.
90
91config REISERFS_FS_POSIX_ACL
92 bool "ReiserFS POSIX Access Control Lists"
93 depends on REISERFS_FS_XATTR
94 select FS_POSIX_ACL
95 help
96 Posix Access Control Lists (ACLs) support permissions for users and
97 groups beyond the owner/group/world scheme.
98
99 To learn more about Access Control Lists, visit the Posix ACLs for
100 Linux website <http://acl.bestbits.at/>.
101
102 If you don't know what Access Control Lists are, say N
103
104config REISERFS_FS_SECURITY
105 bool "ReiserFS Security Labels"
106 depends on REISERFS_FS_XATTR
107 help
108 Security labels support alternative access control models
109 implemented by security modules like SELinux. This option
110 enables an extended attribute handler for file security
111 labels in the ReiserFS filesystem.
112
113 If you are not using a security module that requires using
114 extended attributes for file security labels, say N.
115
116config JFS_FS
117 tristate "JFS filesystem support"
118 select NLS
119 help
120 This is a port of IBM's Journaled Filesystem . More information is
121 available in the file <file:Documentation/filesystems/jfs.txt>.
122
123 If you do not intend to use the JFS filesystem, say N.
124
125config JFS_POSIX_ACL
126 bool "JFS POSIX Access Control Lists"
127 depends on JFS_FS
128 select FS_POSIX_ACL
129 help
130 Posix Access Control Lists (ACLs) support permissions for users and
131 groups beyond the owner/group/world scheme.
132
133 To learn more about Access Control Lists, visit the Posix ACLs for
134 Linux website <http://acl.bestbits.at/>.
135
136 If you don't know what Access Control Lists are, say N
137
138config JFS_SECURITY
139 bool "JFS Security Labels"
140 depends on JFS_FS
141 help
142 Security labels support alternative access control models
143 implemented by security modules like SELinux. This option
144 enables an extended attribute handler for file security
145 labels in the jfs filesystem.
146
147 If you are not using a security module that requires using
148 extended attributes for file security labels, say N.
149
150config JFS_DEBUG
151 bool "JFS debugging"
152 depends on JFS_FS
153 help
154 If you are experiencing any problems with the JFS filesystem, say
155 Y here. This will result in additional debugging messages to be
156 written to the system log. Under normal circumstances, this
157 results in very little overhead.
158
159config JFS_STATISTICS
160 bool "JFS statistics"
161 depends on JFS_FS
162 help
163 Enabling this option will cause statistics from the JFS file system
164 to be made available to the user in the /proc/fs/jfs/ directory.
165 32
166config FS_POSIX_ACL 33config FS_POSIX_ACL
167# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) 34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
@@ -182,111 +49,8 @@ config FILE_LOCKING
182 49
183source "fs/xfs/Kconfig" 50source "fs/xfs/Kconfig"
184source "fs/gfs2/Kconfig" 51source "fs/gfs2/Kconfig"
185 52source "fs/ocfs2/Kconfig"
186config OCFS2_FS 53source "fs/btrfs/Kconfig"
187 tristate "OCFS2 file system support"
188 depends on NET && SYSFS
189 select CONFIGFS_FS
190 select JBD2
191 select CRC32
192 select QUOTA
193 select QUOTA_TREE
194 help
195 OCFS2 is a general purpose extent based shared disk cluster file
196 system with many similarities to ext3. It supports 64 bit inode
197 numbers, and has automatically extending metadata groups which may
198 also make it attractive for non-clustered use.
199
200 You'll want to install the ocfs2-tools package in order to at least
201 get "mount.ocfs2".
202
203 Project web page: http://oss.oracle.com/projects/ocfs2
204 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
205 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
206
207 For more information on OCFS2, see the file
208 <file:Documentation/filesystems/ocfs2.txt>.
209
210config OCFS2_FS_O2CB
211 tristate "O2CB Kernelspace Clustering"
212 depends on OCFS2_FS
213 default y
214 help
215 OCFS2 includes a simple kernelspace clustering package, the OCFS2
216 Cluster Base. It only requires a very small userspace component
217 to configure it. This comes with the standard ocfs2-tools package.
218 O2CB is limited to maintaining a cluster for OCFS2 file systems.
219 It cannot manage any other cluster applications.
220
221 It is always safe to say Y here, as the clustering method is
222 run-time selectable.
223
224config OCFS2_FS_USERSPACE_CLUSTER
225 tristate "OCFS2 Userspace Clustering"
226 depends on OCFS2_FS && DLM
227 default y
228 help
229 This option will allow OCFS2 to use userspace clustering services
230 in conjunction with the DLM in fs/dlm. If you are using a
231 userspace cluster manager, say Y here.
232
233 It is safe to say Y, as the clustering method is run-time
234 selectable.
235
236config OCFS2_FS_STATS
237 bool "OCFS2 statistics"
238 depends on OCFS2_FS
239 default y
240 help
241 This option allows some fs statistics to be captured. Enabling
242 this option may increase the memory consumption.
243
244config OCFS2_DEBUG_MASKLOG
245 bool "OCFS2 logging support"
246 depends on OCFS2_FS
247 default y
248 help
249 The ocfs2 filesystem has an extensive logging system. The system
250 allows selection of events to log via files in /sys/o2cb/logmask/.
251 This option will enlarge your kernel, but it allows debugging of
252 ocfs2 filesystem issues.
253
254config OCFS2_DEBUG_FS
255 bool "OCFS2 expensive checks"
256 depends on OCFS2_FS
257 default n
258 help
259 This option will enable expensive consistency checks. Enable
260 this option for debugging only as it is likely to decrease
261 performance of the filesystem.
262
263config OCFS2_FS_POSIX_ACL
264 bool "OCFS2 POSIX Access Control Lists"
265 depends on OCFS2_FS
266 select FS_POSIX_ACL
267 default n
268 help
269 Posix Access Control Lists (ACLs) support permissions for users and
270 groups beyond the owner/group/world scheme.
271
272config BTRFS_FS
273 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
274 depends on EXPERIMENTAL
275 select LIBCRC32C
276 select ZLIB_INFLATE
277 select ZLIB_DEFLATE
278 help
279 Btrfs is a new filesystem with extents, writable snapshotting,
280 support for multiple devices and many more features.
281
282 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
283 FINALIZED. You should say N here unless you are interested in
284 testing Btrfs with non-critical data.
285
286 To compile this file system support as a module, choose M here. The
287 module will be called btrfs.
288
289 If unsure, say N.
290 54
291endif # BLOCK 55endif # BLOCK
292 56
@@ -348,64 +112,9 @@ config QUOTACTL
348 depends on XFS_QUOTA || QUOTA 112 depends on XFS_QUOTA || QUOTA
349 default y 113 default y
350 114
351config AUTOFS_FS 115source "fs/autofs/Kconfig"
352 tristate "Kernel automounter support" 116source "fs/autofs4/Kconfig"
353 help 117source "fs/fuse/Kconfig"
354 The automounter is a tool to automatically mount remote file systems
355 on demand. This implementation is partially kernel-based to reduce
356 overhead in the already-mounted case; this is unlike the BSD
357 automounter (amd), which is a pure user space daemon.
358
359 To use the automounter you need the user-space tools from the autofs
360 package; you can find the location in <file:Documentation/Changes>.
361 You also want to answer Y to "NFS file system support", below.
362
363 If you want to use the newer version of the automounter with more
364 features, say N here and say Y to "Kernel automounter v4 support",
365 below.
366
367 To compile this support as a module, choose M here: the module will be
368 called autofs.
369
370 If you are not a part of a fairly large, distributed network, you
371 probably do not need an automounter, and can say N here.
372
373config AUTOFS4_FS
374 tristate "Kernel automounter version 4 support (also supports v3)"
375 help
376 The automounter is a tool to automatically mount remote file systems
377 on demand. This implementation is partially kernel-based to reduce
378 overhead in the already-mounted case; this is unlike the BSD
379 automounter (amd), which is a pure user space daemon.
380
381 To use the automounter you need the user-space tools from
382 <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
383 want to answer Y to "NFS file system support", below.
384
385 To compile this support as a module, choose M here: the module will be
386 called autofs4. You will need to add "alias autofs autofs4" to your
387 modules configuration file.
388
389 If you are not a part of a fairly large, distributed network or
390 don't have a laptop which needs to dynamically reconfigure to the
391 local network, you probably do not need an automounter, and can say
392 N here.
393
394config FUSE_FS
395 tristate "FUSE (Filesystem in Userspace) support"
396 help
397 With FUSE it is possible to implement a fully functional filesystem
398 in a userspace program.
399
400 There's also companion library: libfuse. This library along with
401 utilities is available from the FUSE homepage:
402 <http://fuse.sourceforge.net/>
403
404 See <file:Documentation/filesystems/fuse.txt> for more information.
405 See <file:Documentation/Changes> for needed library/utility version.
406
407 If you want to develop a userspace FS, or if you want to use
408 a filesystem based on FUSE, answer Y or M.
409 118
410config GENERIC_ACL 119config GENERIC_ACL
411 bool 120 bool
@@ -414,64 +123,8 @@ config GENERIC_ACL
414if BLOCK 123if BLOCK
415menu "CD-ROM/DVD Filesystems" 124menu "CD-ROM/DVD Filesystems"
416 125
417config ISO9660_FS 126source "fs/isofs/Kconfig"
418 tristate "ISO 9660 CDROM file system support" 127source "fs/udf/Kconfig"
419 help
420 This is the standard file system used on CD-ROMs. It was previously
421 known as "High Sierra File System" and is called "hsfs" on other
422 Unix systems. The so-called Rock-Ridge extensions which allow for
423 long Unix filenames and symbolic links are also supported by this
424 driver. If you have a CD-ROM drive and want to do more with it than
425 just listen to audio CDs and watch its LEDs, say Y (and read
426 <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
427 available from <http://www.tldp.org/docs.html#howto>), thereby
428 enlarging your kernel by about 27 KB; otherwise say N.
429
430 To compile this file system support as a module, choose M here: the
431 module will be called isofs.
432
433config JOLIET
434 bool "Microsoft Joliet CDROM extensions"
435 depends on ISO9660_FS
436 select NLS
437 help
438 Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
439 which allows for long filenames in unicode format (unicode is the
440 new 16 bit character code, successor to ASCII, which encodes the
441 characters of almost all languages of the world; see
442 <http://www.unicode.org/> for more information). Say Y here if you
443 want to be able to read Joliet CD-ROMs under Linux.
444
445config ZISOFS
446 bool "Transparent decompression extension"
447 depends on ISO9660_FS
448 select ZLIB_INFLATE
449 help
450 This is a Linux-specific extension to RockRidge which lets you store
451 data in compressed form on a CD-ROM and have it transparently
452 decompressed when the CD-ROM is accessed. See
453 <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
454 necessary to create such a filesystem. Say Y here if you want to be
455 able to read such compressed CD-ROMs.
456
457config UDF_FS
458 tristate "UDF file system support"
459 select CRC_ITU_T
460 help
461 This is the new file system used on some CD-ROMs and DVDs. Say Y if
462 you intend to mount DVD discs or CDRW's written in packet mode, or
463 if written to by other UDF utilities, such as DirectCD.
464 Please read <file:Documentation/filesystems/udf.txt>.
465
466 To compile this file system support as a module, choose M here: the
467 module will be called udf.
468
469 If unsure, say N.
470
471config UDF_NLS
472 bool
473 default y
474 depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
475 128
476endmenu 129endmenu
477endif # BLOCK 130endif # BLOCK
@@ -479,182 +132,8 @@ endif # BLOCK
479if BLOCK 132if BLOCK
480menu "DOS/FAT/NT Filesystems" 133menu "DOS/FAT/NT Filesystems"
481 134
482config FAT_FS 135source "fs/fat/Kconfig"
483 tristate 136source "fs/ntfs/Kconfig"
484 select NLS
485 help
486 If you want to use one of the FAT-based file systems (the MS-DOS and
487 VFAT (Windows 95) file systems), then you must say Y or M here
488 to include FAT support. You will then be able to mount partitions or
489 diskettes with FAT-based file systems and transparently access the
490 files on them, i.e. MSDOS files will look and behave just like all
491 other Unix files.
492
493 This FAT support is not a file system in itself, it only provides
494 the foundation for the other file systems. You will have to say Y or
495 M to at least one of "MSDOS fs support" or "VFAT fs support" in
496 order to make use of it.
497
498 Another way to read and write MSDOS floppies and hard drive
499 partitions from within Linux (but not transparently) is with the
500 mtools ("man mtools") program suite. You don't need to say Y here in
501 order to do that.
502
503 If you need to move large files on floppies between a DOS and a
504 Linux box, say Y here, mount the floppy under Linux with an MSDOS
505 file system and use GNU tar's M option. GNU tar is a program
506 available for Unix and DOS ("man tar" or "info tar").
507
508 The FAT support will enlarge your kernel by about 37 KB. If unsure,
509 say Y.
510
511 To compile this as a module, choose M here: the module will be called
512 fat. Note that if you compile the FAT support as a module, you
513 cannot compile any of the FAT-based file systems into the kernel
514 -- they will have to be modules as well.
515
516config MSDOS_FS
517 tristate "MSDOS fs support"
518 select FAT_FS
519 help
520 This allows you to mount MSDOS partitions of your hard drive (unless
521 they are compressed; to access compressed MSDOS partitions under
522 Linux, you can either use the DOS emulator DOSEMU, described in the
523 DOSEMU-HOWTO, available from
524 <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
525 <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
526 intend to use dosemu with a non-compressed MSDOS partition, say Y
527 here) and MSDOS floppies. This means that file access becomes
528 transparent, i.e. the MSDOS files look and behave just like all
529 other Unix files.
530
531 If you have Windows 95 or Windows NT installed on your MSDOS
532 partitions, you should use the VFAT file system (say Y to "VFAT fs
533 support" below), or you will not be able to see the long filenames
534 generated by Windows 95 / Windows NT.
535
536 This option will enlarge your kernel by about 7 KB. If unsure,
537 answer Y. This will only work if you said Y to "DOS FAT fs support"
538 as well. To compile this as a module, choose M here: the module will
539 be called msdos.
540
541config VFAT_FS
542 tristate "VFAT (Windows-95) fs support"
543 select FAT_FS
544 help
545 This option provides support for normal Windows file systems with
546 long filenames. That includes non-compressed FAT-based file systems
547 used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
548 programs from the mtools package.
549
550 The VFAT support enlarges your kernel by about 10 KB and it only
551 works if you said Y to the "DOS FAT fs support" above. Please read
552 the file <file:Documentation/filesystems/vfat.txt> for details. If
553 unsure, say Y.
554
555 To compile this as a module, choose M here: the module will be called
556 vfat.
557
558config FAT_DEFAULT_CODEPAGE
559 int "Default codepage for FAT"
560 depends on MSDOS_FS || VFAT_FS
561 default 437
562 help
563 This option should be set to the codepage of your FAT filesystems.
564 It can be overridden with the "codepage" mount option.
565 See <file:Documentation/filesystems/vfat.txt> for more information.
566
567config FAT_DEFAULT_IOCHARSET
568 string "Default iocharset for FAT"
569 depends on VFAT_FS
570 default "iso8859-1"
571 help
572 Set this to the default input/output character set you'd
573 like FAT to use. It should probably match the character set
574 that most of your FAT filesystems use, and can be overridden
575 with the "iocharset" mount option for FAT filesystems.
576 Note that "utf8" is not recommended for FAT filesystems.
577 If unsure, you shouldn't set "utf8" here.
578 See <file:Documentation/filesystems/vfat.txt> for more information.
579
580config NTFS_FS
581 tristate "NTFS file system support"
582 select NLS
583 help
584 NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
585
586 Saying Y or M here enables read support. There is partial, but
587 safe, write support available. For write support you must also
588 say Y to "NTFS write support" below.
589
590 There are also a number of user-space tools available, called
591 ntfsprogs. These include ntfsundelete and ntfsresize, that work
592 without NTFS support enabled in the kernel.
593
594 This is a rewrite from scratch of Linux NTFS support and replaced
595 the old NTFS code starting with Linux 2.5.11. A backport to
596 the Linux 2.4 kernel series is separately available as a patch
597 from the project web site.
598
599 For more information see <file:Documentation/filesystems/ntfs.txt>
600 and <http://www.linux-ntfs.org/>.
601
602 To compile this file system support as a module, choose M here: the
603 module will be called ntfs.
604
605 If you are not using Windows NT, 2000, XP or 2003 in addition to
606 Linux on your computer it is safe to say N.
607
608config NTFS_DEBUG
609 bool "NTFS debugging support"
610 depends on NTFS_FS
611 help
612 If you are experiencing any problems with the NTFS file system, say
613 Y here. This will result in additional consistency checks to be
614 performed by the driver as well as additional debugging messages to
615 be written to the system log. Note that debugging messages are
616 disabled by default. To enable them, supply the option debug_msgs=1
617 at the kernel command line when booting the kernel or as an option
618 to insmod when loading the ntfs module. Once the driver is active,
619 you can enable debugging messages by doing (as root):
620 echo 1 > /proc/sys/fs/ntfs-debug
621 Replacing the "1" with "0" would disable debug messages.
622
623 If you leave debugging messages disabled, this results in little
624 overhead, but enabling debug messages results in very significant
625 slowdown of the system.
626
627 When reporting bugs, please try to have available a full dump of
628 debugging messages while the misbehaviour was occurring.
629
630config NTFS_RW
631 bool "NTFS write support"
632 depends on NTFS_FS
633 help
634 This enables the partial, but safe, write support in the NTFS driver.
635
636 The only supported operation is overwriting existing files, without
637 changing the file length. No file or directory creation, deletion or
638 renaming is possible. Note only non-resident files can be written to
639 so you may find that some very small files (<500 bytes or so) cannot
640 be written to.
641
642 While we cannot guarantee that it will not damage any data, we have
643 so far not received a single report where the driver would have
644 damaged someones data so we assume it is perfectly safe to use.
645
646 Note: While write support is safe in this version (a rewrite from
647 scratch of the NTFS support), it should be noted that the old NTFS
648 write support, included in Linux 2.5.10 and before (since 1997),
649 is not safe.
650
651 This is currently useful with TopologiLinux. TopologiLinux is run
652 on top of any DOS/Microsoft Windows system without partitioning your
653 hard disk. Unlike other Linux distributions TopologiLinux does not
654 need its own partition. For more information see
655 <http://topologi-linux.sourceforge.net/>
656
657 It is perfectly safe to say N here.
658 137
659endmenu 138endmenu
660endif # BLOCK 139endif # BLOCK
@@ -662,30 +141,7 @@ endif # BLOCK
662menu "Pseudo filesystems" 141menu "Pseudo filesystems"
663 142
664source "fs/proc/Kconfig" 143source "fs/proc/Kconfig"
665 144source "fs/sysfs/Kconfig"
666config SYSFS
667 bool "sysfs file system support" if EMBEDDED
668 default y
669 help
670 The sysfs filesystem is a virtual filesystem that the kernel uses to
671 export internal kernel objects, their attributes, and their
672 relationships to one another.
673
674 Users can use sysfs to ascertain useful information about the running
675 kernel, such as the devices the kernel has discovered on each bus and
676 which driver each is bound to. sysfs can also be used to tune devices
677 and other kernel subsystems.
678
679 Some system agents rely on the information in sysfs to operate.
680 /sbin/hotplug uses device and object attributes in sysfs to assist in
681 delegating policy decisions, like persistently naming devices.
682
683 sysfs is currently used by the block subsystem to mount the root
684 partition. If sysfs is disabled you must specify the boot device on
685 the kernel boot command line via its major and minor numbers. For
686 example, "root=03:01" for /dev/hda1.
687
688 Designers of embedded systems may wish to say N here to conserve space.
689 145
690config TMPFS 146config TMPFS
691 bool "Virtual memory file system support (former shm fs)" 147 bool "Virtual memory file system support (former shm fs)"
@@ -726,17 +182,7 @@ config HUGETLBFS
726config HUGETLB_PAGE 182config HUGETLB_PAGE
727 def_bool HUGETLBFS 183 def_bool HUGETLBFS
728 184
729config CONFIGFS_FS 185source "fs/configfs/Kconfig"
730 tristate "Userspace-driven configuration filesystem"
731 depends on SYSFS
732 help
733 configfs is a ram-based filesystem that provides the converse
734 of sysfs's functionality. Where sysfs is a filesystem-based
735 view of kernel objects, configfs is a filesystem-based manager
736 of kernel objects, or config_items.
737
738 Both sysfs and configfs can and should exist together on the
739 same system. One is not a replacement for the other.
740 186
741endmenu 187endmenu
742 188
@@ -755,425 +201,27 @@ menuconfig MISC_FILESYSTEMS
755 201
756if MISC_FILESYSTEMS 202if MISC_FILESYSTEMS
757 203
758config ADFS_FS 204source "fs/adfs/Kconfig"
759 tristate "ADFS file system support (EXPERIMENTAL)" 205source "fs/affs/Kconfig"
760 depends on BLOCK && EXPERIMENTAL 206source "fs/ecryptfs/Kconfig"
761 help 207source "fs/hfs/Kconfig"
762 The Acorn Disc Filing System is the standard file system of the 208source "fs/hfsplus/Kconfig"
763 RiscOS operating system which runs on Acorn's ARM-based Risc PC 209source "fs/befs/Kconfig"
764 systems and the Acorn Archimedes range of machines. If you say Y 210source "fs/bfs/Kconfig"
765 here, Linux will be able to read from ADFS partitions on hard drives 211source "fs/efs/Kconfig"
766 and from ADFS-formatted floppy discs. If you also want to be able to
767 write to those devices, say Y to "ADFS write support" below.
768
769 The ADFS partition should be the first partition (i.e.,
770 /dev/[hs]d?1) on each of your drives. Please read the file
771 <file:Documentation/filesystems/adfs.txt> for further details.
772
773 To compile this code as a module, choose M here: the module will be
774 called adfs.
775
776 If unsure, say N.
777
778config ADFS_FS_RW
779 bool "ADFS write support (DANGEROUS)"
780 depends on ADFS_FS
781 help
782 If you say Y here, you will be able to write to ADFS partitions on
783 hard drives and ADFS-formatted floppy disks. This is experimental
784 codes, so if you're unsure, say N.
785
786config AFFS_FS
787 tristate "Amiga FFS file system support (EXPERIMENTAL)"
788 depends on BLOCK && EXPERIMENTAL
789 help
790 The Fast File System (FFS) is the common file system used on hard
791 disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
792 if you want to be able to read and write files from and to an Amiga
793 FFS partition on your hard drive. Amiga floppies however cannot be
794 read with this driver due to an incompatibility of the floppy
795 controller used in an Amiga and the standard floppy controller in
796 PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
797 and <file:fs/affs/Changes>.
798
799 With this driver you can also mount disk files used by Bernd
800 Schmidt's Un*X Amiga Emulator
801 (<http://www.freiburg.linux.de/~uae/>).
802 If you want to do this, you will also need to say Y or M to "Loop
803 device support", above.
804
805 To compile this file system support as a module, choose M here: the
806 module will be called affs. If unsure, say N.
807
808config ECRYPT_FS
809 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
810 depends on EXPERIMENTAL && KEYS && CRYPTO && NET
811 help
812 Encrypted filesystem that operates on the VFS layer. See
813 <file:Documentation/filesystems/ecryptfs.txt> to learn more about
814 eCryptfs. Userspace components are required and can be
815 obtained from <http://ecryptfs.sf.net>.
816
817 To compile this file system support as a module, choose M here: the
818 module will be called ecryptfs.
819
820config HFS_FS
821 tristate "Apple Macintosh file system support (EXPERIMENTAL)"
822 depends on BLOCK && EXPERIMENTAL
823 select NLS
824 help
825 If you say Y here, you will be able to mount Macintosh-formatted
826 floppy disks and hard drive partitions with full read-write access.
827 Please read <file:Documentation/filesystems/hfs.txt> to learn about
828 the available mount options.
829
830 To compile this file system support as a module, choose M here: the
831 module will be called hfs.
832
833config HFSPLUS_FS
834 tristate "Apple Extended HFS file system support"
835 depends on BLOCK
836 select NLS
837 select NLS_UTF8
838 help
839 If you say Y here, you will be able to mount extended format
840 Macintosh-formatted hard drive partitions with full read-write access.
841
842 This file system is often called HFS+ and was introduced with
843 MacOS 8. It includes all Mac specific filesystem data such as
844 data forks and creator codes, but it also has several UNIX
845 style features such as file ownership and permissions.
846
847config BEFS_FS
848 tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
849 depends on BLOCK && EXPERIMENTAL
850 select NLS
851 help
852 The BeOS File System (BeFS) is the native file system of Be, Inc's
853 BeOS. Notable features include support for arbitrary attributes
854 on files and directories, and database-like indices on selected
855 attributes. (Also note that this driver doesn't make those features
856 available at this time). It is a 64 bit filesystem, so it supports
857 extremely large volumes and files.
858
859 If you use this filesystem, you should also say Y to at least one
860 of the NLS (native language support) options below.
861
862 If you don't know what this is about, say N.
863
864 To compile this as a module, choose M here: the module will be
865 called befs.
866
867config BEFS_DEBUG
868 bool "Debug BeFS"
869 depends on BEFS_FS
870 help
871 If you say Y here, you can use the 'debug' mount option to enable
872 debugging output from the driver.
873
874config BFS_FS
875 tristate "BFS file system support (EXPERIMENTAL)"
876 depends on BLOCK && EXPERIMENTAL
877 help
878 Boot File System (BFS) is a file system used under SCO UnixWare to
879 allow the bootloader access to the kernel image and other important
880 files during the boot process. It is usually mounted under /stand
881 and corresponds to the slice marked as "STAND" in the UnixWare
882 partition. You should say Y if you want to read or write the files
883 on your /stand slice from within Linux. You then also need to say Y
884 to "UnixWare slices support", below. More information about the BFS
885 file system is contained in the file
886 <file:Documentation/filesystems/bfs.txt>.
887
888 If you don't know what this is about, say N.
889
890 To compile this as a module, choose M here: the module will be called
891 bfs. Note that the file system of your root partition (the one
892 containing the directory /) cannot be compiled as a module.
893
894
895
896config EFS_FS
897 tristate "EFS file system support (read only) (EXPERIMENTAL)"
898 depends on BLOCK && EXPERIMENTAL
899 help
900 EFS is an older file system used for non-ISO9660 CD-ROMs and hard
901 disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
902 uses the XFS file system for hard disk partitions however).
903
904 This implementation only offers read-only access. If you don't know
905 what all this is about, it's safe to say N. For more information
906 about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
907
908 To compile the EFS file system support as a module, choose M here: the
909 module will be called efs.
910
911source "fs/jffs2/Kconfig" 212source "fs/jffs2/Kconfig"
912# UBIFS File system configuration 213# UBIFS File system configuration
913source "fs/ubifs/Kconfig" 214source "fs/ubifs/Kconfig"
914 215source "fs/cramfs/Kconfig"
915config CRAMFS 216source "fs/squashfs/Kconfig"
916 tristate "Compressed ROM file system support (cramfs)" 217source "fs/freevxfs/Kconfig"
917 depends on BLOCK 218source "fs/minix/Kconfig"
918 select ZLIB_INFLATE 219source "fs/omfs/Kconfig"
919 help 220source "fs/hpfs/Kconfig"
920 Saying Y here includes support for CramFs (Compressed ROM File 221source "fs/qnx4/Kconfig"
921 System). CramFs is designed to be a simple, small, and compressed 222source "fs/romfs/Kconfig"
922 file system for ROM based embedded systems. CramFs is read-only, 223source "fs/sysv/Kconfig"
923 limited to 256MB file systems (with 16MB files), and doesn't support 224source "fs/ufs/Kconfig"
924 16/32 bits uid/gid, hard links and timestamps.
925
926 See <file:Documentation/filesystems/cramfs.txt> and
927 <file:fs/cramfs/README> for further information.
928
929 To compile this as a module, choose M here: the module will be called
930 cramfs. Note that the root file system (the one containing the
931 directory /) cannot be compiled as a module.
932
933 If unsure, say N.
934
935config SQUASHFS
936 tristate "SquashFS 4.0 - Squashed file system support"
937 depends on BLOCK
938 select ZLIB_INFLATE
939 help
940 Saying Y here includes support for SquashFS 4.0 (a Compressed
941 Read-Only File System). Squashfs is a highly compressed read-only
942 filesystem for Linux. It uses zlib compression to compress both
943 files, inodes and directories. Inodes in the system are very small
944 and all blocks are packed to minimise data overhead. Block sizes
945 greater than 4K are supported up to a maximum of 1 Mbytes (default
946 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
947 (larger than 4GB), full uid/gid information, hard links and
948 timestamps.
949
950 Squashfs is intended for general read-only filesystem use, for
951 archival use (i.e. in cases where a .tar.gz file may be used), and in
952 embedded systems where low overhead is needed. Further information
953 and tools are available from http://squashfs.sourceforge.net.
954
955 If you want to compile this as a module ( = code which can be
956 inserted in and removed from the running kernel whenever you want),
957 say M here and read <file:Documentation/modules.txt>. The module
958 will be called squashfs. Note that the root file system (the one
959 containing the directory /) cannot be compiled as a module.
960
961 If unsure, say N.
962
963config SQUASHFS_EMBEDDED
964
965 bool "Additional option for memory-constrained systems"
966 depends on SQUASHFS
967 default n
968 help
969 Saying Y here allows you to specify cache size.
970
971 If unsure, say N.
972
973config SQUASHFS_FRAGMENT_CACHE_SIZE
974 int "Number of fragments cached" if SQUASHFS_EMBEDDED
975 depends on SQUASHFS
976 default "3"
977 help
978 By default SquashFS caches the last 3 fragments read from
979 the filesystem. Increasing this amount may mean SquashFS
980 has to re-read fragments less often from disk, at the expense
981 of extra system memory. Decreasing this amount will mean
982 SquashFS uses less memory at the expense of extra reads from disk.
983
984 Note there must be at least one cached fragment. Anything
985 much more than three will probably not make much difference.
986
987config VXFS_FS
988 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
989 depends on BLOCK
990 help
991 FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
992 file system format. VERITAS VxFS(TM) is the standard file system
993 of SCO UnixWare (and possibly others) and optionally available
994 for Sunsoft Solaris, HP-UX and many other operating systems.
995 Currently only readonly access is supported.
996
997 NOTE: the file system type as used by mount(1), mount(2) and
998 fstab(5) is 'vxfs' as it describes the file system format, not
999 the actual driver.
1000
1001 To compile this as a module, choose M here: the module will be
1002 called freevxfs. If unsure, say N.
1003
1004config MINIX_FS
1005 tristate "Minix file system support"
1006 depends on BLOCK
1007 help
1008 Minix is a simple operating system used in many classes about OS's.
1009 The minix file system (method to organize files on a hard disk
1010 partition or a floppy disk) was the original file system for Linux,
1011 but has been superseded by the second extended file system ext2fs.
1012 You don't want to use the minix file system on your hard disk
1013 because of certain built-in restrictions, but it is sometimes found
1014 on older Linux floppy disks. This option will enlarge your kernel
1015 by about 28 KB. If unsure, say N.
1016
1017 To compile this file system support as a module, choose M here: the
1018 module will be called minix. Note that the file system of your root
1019 partition (the one containing the directory /) cannot be compiled as
1020 a module.
1021
1022config OMFS_FS
1023 tristate "SonicBlue Optimized MPEG File System support"
1024 depends on BLOCK
1025 select CRC_ITU_T
1026 help
1027 This is the proprietary file system used by the Rio Karma music
1028 player and ReplayTV DVR. Despite the name, this filesystem is not
1029 more efficient than a standard FS for MPEG files, in fact likely
1030 the opposite is true. Say Y if you have either of these devices
1031 and wish to mount its disk.
1032
1033 To compile this file system support as a module, choose M here: the
1034 module will be called omfs. If unsure, say N.
1035
1036config HPFS_FS
1037 tristate "OS/2 HPFS file system support"
1038 depends on BLOCK
1039 help
1040 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
1041 is the file system used for organizing files on OS/2 hard disk
1042 partitions. Say Y if you want to be able to read files from and
1043 write files to an OS/2 HPFS partition on your hard drive. OS/2
1044 floppies however are in regular MSDOS format, so you don't need this
1045 option in order to be able to read them. Read
1046 <file:Documentation/filesystems/hpfs.txt>.
1047
1048 To compile this file system support as a module, choose M here: the
1049 module will be called hpfs. If unsure, say N.
1050
1051
1052config QNX4FS_FS
1053 tristate "QNX4 file system support (read only)"
1054 depends on BLOCK
1055 help
1056 This is the file system used by the real-time operating systems
1057 QNX 4 and QNX 6 (the latter is also called QNX RTP).
1058 Further information is available at <http://www.qnx.com/>.
1059 Say Y if you intend to mount QNX hard disks or floppies.
1060 Unless you say Y to "QNX4FS read-write support" below, you will
1061 only be able to read these file systems.
1062
1063 To compile this file system support as a module, choose M here: the
1064 module will be called qnx4.
1065
1066 If you don't know whether you need it, then you don't need it:
1067 answer N.
1068
1069config QNX4FS_RW
1070 bool "QNX4FS write support (DANGEROUS)"
1071 depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
1072 help
1073 Say Y if you want to test write support for QNX4 file systems.
1074
1075 It's currently broken, so for now:
1076 answer N.
1077
1078config ROMFS_FS
1079 tristate "ROM file system support"
1080 depends on BLOCK
1081 ---help---
1082 This is a very small read-only file system mainly intended for
1083 initial ram disks of installation disks, but it could be used for
1084 other read-only media as well. Read
1085 <file:Documentation/filesystems/romfs.txt> for details.
1086
1087 To compile this file system support as a module, choose M here: the
1088 module will be called romfs. Note that the file system of your
1089 root partition (the one containing the directory /) cannot be a
1090 module.
1091
1092 If you don't know whether you need it, then you don't need it:
1093 answer N.
1094
1095
1096config SYSV_FS
1097 tristate "System V/Xenix/V7/Coherent file system support"
1098 depends on BLOCK
1099 help
1100 SCO, Xenix and Coherent are commercial Unix systems for Intel
1101 machines, and Version 7 was used on the DEC PDP-11. Saying Y
1102 here would allow you to read from their floppies and hard disk
1103 partitions.
1104
1105 If you have floppies or hard disk partitions like that, it is likely
1106 that they contain binaries from those other Unix systems; in order
1107 to run these binaries, you will want to install linux-abi which is
1108 a set of kernel modules that lets you run SCO, Xenix, Wyse,
1109 UnixWare, Dell Unix and System V programs under Linux. It is
1110 available via FTP (user: ftp) from
1111 <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
1112 NOTE: that will work only for binaries from Intel-based systems;
1113 PDP ones will have to wait until somebody ports Linux to -11 ;-)
1114
1115 If you only intend to mount files from some other Unix over the
1116 network using NFS, you don't need the System V file system support
1117 (but you need NFS file system support obviously).
1118
1119 Note that this option is generally not needed for floppies, since a
1120 good portable way to transport files and directories between unixes
1121 (and even other operating systems) is given by the tar program ("man
1122 tar" or preferably "info tar"). Note also that this option has
1123 nothing whatsoever to do with the option "System V IPC". Read about
1124 the System V file system in
1125 <file:Documentation/filesystems/sysv-fs.txt>.
1126 Saying Y here will enlarge your kernel by about 27 KB.
1127
1128 To compile this as a module, choose M here: the module will be called
1129 sysv.
1130
1131 If you haven't heard about all of this before, it's safe to say N.
1132
1133
1134config UFS_FS
1135 tristate "UFS file system support (read only)"
1136 depends on BLOCK
1137 help
1138 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
1139 OpenBSD and NeXTstep) use a file system called UFS. Some System V
1140 Unixes can create and mount hard disk partitions and diskettes using
1141 this file system as well. Saying Y here will allow you to read from
1142 these partitions; if you also want to write to them, say Y to the
1143 experimental "UFS file system write support", below. Please read the
1144 file <file:Documentation/filesystems/ufs.txt> for more information.
1145
1146 The recently released UFS2 variant (used in FreeBSD 5.x) is
1147 READ-ONLY supported.
1148
1149 Note that this option is generally not needed for floppies, since a
1150 good portable way to transport files and directories between unixes
1151 (and even other operating systems) is given by the tar program ("man
1152 tar" or preferably "info tar").
1153
1154 When accessing NeXTstep files, you may need to convert them from the
1155 NeXT character set to the Latin1 character set; use the program
1156 recode ("info recode") for this purpose.
1157
1158 To compile the UFS file system support as a module, choose M here: the
1159 module will be called ufs.
1160
1161 If you haven't heard about all of this before, it's safe to say N.
1162
1163config UFS_FS_WRITE
1164 bool "UFS file system write support (DANGEROUS)"
1165 depends on UFS_FS && EXPERIMENTAL
1166 help
1167 Say Y here if you want to try writing to UFS partitions. This is
1168 experimental, so you should back up your UFS partitions beforehand.
1169
1170config UFS_DEBUG
1171 bool "UFS debugging"
1172 depends on UFS_FS
1173 help
1174 If you are experiencing any problems with the UFS filesystem, say
1175 Y here. This will result in _many_ additional debugging messages to be
1176 written to the system log.
1177 225
1178endif # MISC_FILESYSTEMS 226endif # MISC_FILESYSTEMS
1179 227
@@ -1193,173 +241,8 @@ menuconfig NETWORK_FILESYSTEMS
1193 241
1194if NETWORK_FILESYSTEMS 242if NETWORK_FILESYSTEMS
1195 243
1196config NFS_FS 244source "fs/nfs/Kconfig"
1197 tristate "NFS client support" 245source "fs/nfsd/Kconfig"
1198 depends on INET
1199 select LOCKD
1200 select SUNRPC
1201 select NFS_ACL_SUPPORT if NFS_V3_ACL
1202 help
1203 Choose Y here if you want to access files residing on other
1204 computers using Sun's Network File System protocol. To compile
1205 this file system support as a module, choose M here: the module
1206 will be called nfs.
1207
1208 To mount file systems exported by NFS servers, you also need to
1209 install the user space mount.nfs command which can be found in
1210 the Linux nfs-utils package, available from http://linux-nfs.org/.
1211 Information about using the mount command is available in the
1212 mount(8) man page. More detail about the Linux NFS client
1213 implementation is available via the nfs(5) man page.
1214
1215 Below you can choose which versions of the NFS protocol are
1216 available in the kernel to mount NFS servers. Support for NFS
1217 version 2 (RFC 1094) is always available when NFS_FS is selected.
1218
1219 To configure a system which mounts its root file system via NFS
1220 at boot time, say Y here, select "Kernel level IP
1221 autoconfiguration" in the NETWORK menu, and select "Root file
1222 system on NFS" below. You cannot compile this file system as a
1223 module in this case.
1224
1225 If unsure, say N.
1226
1227config NFS_V3
1228 bool "NFS client support for NFS version 3"
1229 depends on NFS_FS
1230 help
1231 This option enables support for version 3 of the NFS protocol
1232 (RFC 1813) in the kernel's NFS client.
1233
1234 If unsure, say Y.
1235
1236config NFS_V3_ACL
1237 bool "NFS client support for the NFSv3 ACL protocol extension"
1238 depends on NFS_V3
1239 help
1240 Some NFS servers support an auxiliary NFSv3 ACL protocol that
1241 Sun added to Solaris but never became an official part of the
1242 NFS version 3 protocol. This protocol extension allows
1243 applications on NFS clients to manipulate POSIX Access Control
1244 Lists on files residing on NFS servers. NFS servers enforce
1245 ACLs on local files whether this protocol is available or not.
1246
1247 Choose Y here if your NFS server supports the Solaris NFSv3 ACL
1248 protocol extension and you want your NFS client to allow
1249 applications to access and modify ACLs on files on the server.
1250
1251 Most NFS servers don't support the Solaris NFSv3 ACL protocol
1252 extension. You can choose N here or specify the "noacl" mount
1253 option to prevent your NFS client from trying to use the NFSv3
1254 ACL protocol.
1255
1256 If unsure, say N.
1257
1258config NFS_V4
1259 bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
1260 depends on NFS_FS && EXPERIMENTAL
1261 select RPCSEC_GSS_KRB5
1262 help
1263 This option enables support for version 4 of the NFS protocol
1264 (RFC 3530) in the kernel's NFS client.
1265
1266 To mount NFS servers using NFSv4, you also need to install user
1267 space programs which can be found in the Linux nfs-utils package,
1268 available from http://linux-nfs.org/.
1269
1270 If unsure, say N.
1271
1272config ROOT_NFS
1273 bool "Root file system on NFS"
1274 depends on NFS_FS=y && IP_PNP
1275 help
1276 If you want your system to mount its root file system via NFS,
1277 choose Y here. This is common practice for managing systems
1278 without local permanent storage. For details, read
1279 <file:Documentation/filesystems/nfsroot.txt>.
1280
1281 Most people say N here.
1282
1283config NFSD
1284 tristate "NFS server support"
1285 depends on INET
1286 select LOCKD
1287 select SUNRPC
1288 select EXPORTFS
1289 select NFS_ACL_SUPPORT if NFSD_V2_ACL
1290 help
1291 Choose Y here if you want to allow other computers to access
1292 files residing on this system using Sun's Network File System
1293 protocol. To compile the NFS server support as a module,
1294 choose M here: the module will be called nfsd.
1295
1296 You may choose to use a user-space NFS server instead, in which
1297 case you can choose N here.
1298
1299 To export local file systems using NFS, you also need to install
1300 user space programs which can be found in the Linux nfs-utils
1301 package, available from http://linux-nfs.org/. More detail about
1302 the Linux NFS server implementation is available via the
1303 exports(5) man page.
1304
1305 Below you can choose which versions of the NFS protocol are
1306 available to clients mounting the NFS server on this system.
1307 Support for NFS version 2 (RFC 1094) is always available when
1308 CONFIG_NFSD is selected.
1309
1310 If unsure, say N.
1311
1312config NFSD_V2_ACL
1313 bool
1314 depends on NFSD
1315
1316config NFSD_V3
1317 bool "NFS server support for NFS version 3"
1318 depends on NFSD
1319 help
1320 This option enables support in your system's NFS server for
1321 version 3 of the NFS protocol (RFC 1813).
1322
1323 If unsure, say Y.
1324
1325config NFSD_V3_ACL
1326 bool "NFS server support for the NFSv3 ACL protocol extension"
1327 depends on NFSD_V3
1328 select NFSD_V2_ACL
1329 help
1330 Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
1331 never became an official part of the NFS version 3 protocol.
1332 This protocol extension allows applications on NFS clients to
1333 manipulate POSIX Access Control Lists on files residing on NFS
1334 servers. NFS servers enforce POSIX ACLs on local files whether
1335 this protocol is available or not.
1336
1337 This option enables support in your system's NFS server for the
1338 NFSv3 ACL protocol extension allowing NFS clients to manipulate
1339 POSIX ACLs on files exported by your system's NFS server. NFS
1340 clients which support the Solaris NFSv3 ACL protocol can then
1341 access and modify ACLs on your NFS server.
1342
1343 To store ACLs on your NFS server, you also need to enable ACL-
1344 related CONFIG options for your local file systems of choice.
1345
1346 If unsure, say N.
1347
1348config NFSD_V4
1349 bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
1350 depends on NFSD && PROC_FS && EXPERIMENTAL
1351 select NFSD_V3
1352 select FS_POSIX_ACL
1353 select RPCSEC_GSS_KRB5
1354 help
1355 This option enables support in your system's NFS server for
1356 version 4 of the NFS protocol (RFC 3530).
1357
1358 To export files using NFSv4, you need to install additional user
1359 space programs which can be found in the Linux nfs-utils package,
1360 available from http://linux-nfs.org/.
1361
1362 If unsure, say N.
1363 246
1364config LOCKD 247config LOCKD
1365 tristate 248 tristate
@@ -1381,221 +264,13 @@ config NFS_COMMON
1381 depends on NFSD || NFS_FS 264 depends on NFSD || NFS_FS
1382 default y 265 default y
1383 266
1384config SUNRPC 267source "net/sunrpc/Kconfig"
1385 tristate 268source "fs/smbfs/Kconfig"
1386
1387config SUNRPC_GSS
1388 tristate
1389
1390config SUNRPC_XPRT_RDMA
1391 tristate
1392 depends on SUNRPC && INFINIBAND && EXPERIMENTAL
1393 default SUNRPC && INFINIBAND
1394 help
1395 This option enables an RPC client transport capability that
1396 allows the NFS client to mount servers via an RDMA-enabled
1397 transport.
1398
1399 To compile RPC client RDMA transport support as a module,
1400 choose M here: the module will be called xprtrdma.
1401
1402 If unsure, say N.
1403
1404config SUNRPC_REGISTER_V4
1405 bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
1406 depends on SUNRPC && EXPERIMENTAL
1407 default n
1408 help
1409 Sun added support for registering RPC services at an IPv6
1410 address by creating two new versions of the rpcbind protocol
1411 (RFC 1833).
1412
1413 This option enables support in the kernel RPC server for
1414 registering kernel RPC services via version 4 of the rpcbind
1415 protocol. If you enable this option, you must run a portmapper
1416 daemon that supports rpcbind protocol version 4.
1417
1418 Serving NFS over IPv6 from knfsd (the kernel's NFS server)
1419 requires that you enable this option and use a portmapper that
1420 supports rpcbind version 4.
1421
1422 If unsure, say N to get traditional behavior (register kernel
1423 RPC services using only rpcbind version 2). Distributions
1424 using the legacy Linux portmapper daemon must say N here.
1425
1426config RPCSEC_GSS_KRB5
1427 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
1428 depends on SUNRPC && EXPERIMENTAL
1429 select SUNRPC_GSS
1430 select CRYPTO
1431 select CRYPTO_MD5
1432 select CRYPTO_DES
1433 select CRYPTO_CBC
1434 help
1435 Choose Y here to enable Secure RPC using the Kerberos version 5
1436 GSS-API mechanism (RFC 1964).
1437
1438 Secure RPC calls with Kerberos require an auxiliary user-space
1439 daemon which may be found in the Linux nfs-utils package
1440 available from http://linux-nfs.org/. In addition, user-space
1441 Kerberos support should be installed.
1442
1443 If unsure, say N.
1444
1445config RPCSEC_GSS_SPKM3
1446 tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
1447 depends on SUNRPC && EXPERIMENTAL
1448 select SUNRPC_GSS
1449 select CRYPTO
1450 select CRYPTO_MD5
1451 select CRYPTO_DES
1452 select CRYPTO_CAST5
1453 select CRYPTO_CBC
1454 help
1455 Choose Y here to enable Secure RPC using the SPKM3 public key
1456 GSS-API mechansim (RFC 2025).
1457
1458 Secure RPC calls with SPKM3 require an auxiliary userspace
1459 daemon which may be found in the Linux nfs-utils package
1460 available from http://linux-nfs.org/.
1461
1462 If unsure, say N.
1463
1464config SMB_FS
1465 tristate "SMB file system support (OBSOLETE, please use CIFS)"
1466 depends on INET
1467 select NLS
1468 help
1469 SMB (Server Message Block) is the protocol Windows for Workgroups
1470 (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
1471 files and printers over local networks. Saying Y here allows you to
1472 mount their file systems (often called "shares" in this context) and
1473 access them just like any other Unix directory. Currently, this
1474 works only if the Windows machines use TCP/IP as the underlying
1475 transport protocol, and not NetBEUI. For details, read
1476 <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
1477 available from <http://www.tldp.org/docs.html#howto>.
1478
1479 Note: if you just want your box to act as an SMB *server* and make
1480 files and printing services available to Windows clients (which need
1481 to have a TCP/IP stack), you don't need to say Y here; you can use
1482 the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
1483 for that.
1484
1485 General information about how to connect Linux, Windows machines and
1486 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
1487
1488 To compile the SMB support as a module, choose M here:
1489 the module will be called smbfs. Most people say N, however.
1490
1491config SMB_NLS_DEFAULT
1492 bool "Use a default NLS"
1493 depends on SMB_FS
1494 help
1495 Enabling this will make smbfs use nls translations by default. You
1496 need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
1497 settings and you need to give the default nls for the SMB server as
1498 CONFIG_SMB_NLS_REMOTE.
1499
1500 The nls settings can be changed at mount time, if your smbmount
1501 supports that, using the codepage and iocharset parameters.
1502
1503 smbmount from samba 2.2.0 or later supports this.
1504
1505config SMB_NLS_REMOTE
1506 string "Default Remote NLS Option"
1507 depends on SMB_NLS_DEFAULT
1508 default "cp437"
1509 help
1510 This setting allows you to specify a default value for which
1511 codepage the server uses. If this field is left blank no
1512 translations will be done by default. The local codepage/charset
1513 default to CONFIG_NLS_DEFAULT.
1514
1515 The nls settings can be changed at mount time, if your smbmount
1516 supports that, using the codepage and iocharset parameters.
1517
1518 smbmount from samba 2.2.0 or later supports this.
1519
1520source "fs/cifs/Kconfig" 269source "fs/cifs/Kconfig"
1521
1522config NCP_FS
1523 tristate "NCP file system support (to mount NetWare volumes)"
1524 depends on IPX!=n || INET
1525 help
1526 NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
1527 used by Novell NetWare clients to talk to file servers. It is to
1528 IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you
1529 to mount NetWare file server volumes and to access them just like
1530 any other Unix directory. For details, please read the file
1531 <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
1532 the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
1533
1534 You do not have to say Y here if you want your Linux box to act as a
1535 file *server* for Novell NetWare clients.
1536
1537 General information about how to connect Linux, Windows machines and
1538 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
1539
1540 To compile this as a module, choose M here: the module will be called
1541 ncpfs. Say N unless you are connected to a Novell network.
1542
1543source "fs/ncpfs/Kconfig" 270source "fs/ncpfs/Kconfig"
1544 271source "fs/coda/Kconfig"
1545config CODA_FS 272source "fs/afs/Kconfig"
1546 tristate "Coda file system support (advanced network fs)" 273source "fs/9p/Kconfig"
1547 depends on INET
1548 help
1549 Coda is an advanced network file system, similar to NFS in that it
1550 enables you to mount file systems of a remote server and access them
1551 with regular Unix commands as if they were sitting on your hard
1552 disk. Coda has several advantages over NFS: support for
1553 disconnected operation (e.g. for laptops), read/write server
1554 replication, security model for authentication and encryption,
1555 persistent client caches and write back caching.
1556
1557 If you say Y here, your Linux box will be able to act as a Coda
1558 *client*. You will need user level code as well, both for the
1559 client and server. Servers are currently user level, i.e. they need
1560 no kernel support. Please read
1561 <file:Documentation/filesystems/coda.txt> and check out the Coda
1562 home page <http://www.coda.cs.cmu.edu/>.
1563
1564 To compile the coda client support as a module, choose M here: the
1565 module will be called coda.
1566
1567config AFS_FS
1568 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
1569 depends on INET && EXPERIMENTAL
1570 select AF_RXRPC
1571 help
1572 If you say Y here, you will get an experimental Andrew File System
1573 driver. It currently only supports unsecured read-only AFS access.
1574
1575 See <file:Documentation/filesystems/afs.txt> for more information.
1576
1577 If unsure, say N.
1578
1579config AFS_DEBUG
1580 bool "AFS dynamic debugging"
1581 depends on AFS_FS
1582 help
1583 Say Y here to make runtime controllable debugging messages appear.
1584
1585 See <file:Documentation/filesystems/afs.txt> for more information.
1586
1587 If unsure, say N.
1588
1589config 9P_FS
1590 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
1591 depends on INET && NET_9P && EXPERIMENTAL
1592 help
1593 If you say Y here, you will get experimental support for
1594 Plan 9 resource sharing via the 9P2000 protocol.
1595
1596 See <http://v9fs.sf.net> for more information.
1597
1598 If unsure, say N.
1599 274
1600endif # NETWORK_FILESYSTEMS 275endif # NETWORK_FILESYSTEMS
1601 276
diff --git a/fs/Makefile b/fs/Makefile
index 38bc735c67ad..dc20db348679 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,10 +69,12 @@ obj-$(CONFIG_DLM) += dlm/
69# Do not add any filesystems before this line 69# Do not add any filesystems before this line
70obj-$(CONFIG_REISERFS_FS) += reiserfs/ 70obj-$(CONFIG_REISERFS_FS) += reiserfs/
71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
72obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 72obj-$(CONFIG_EXT2_FS) += ext2/
73# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
74# unless explicitly requested by rootfstype
75obj-$(CONFIG_EXT4_FS) += ext4/
73obj-$(CONFIG_JBD) += jbd/ 76obj-$(CONFIG_JBD) += jbd/
74obj-$(CONFIG_JBD2) += jbd2/ 77obj-$(CONFIG_JBD2) += jbd2/
75obj-$(CONFIG_EXT2_FS) += ext2/
76obj-$(CONFIG_CRAMFS) += cramfs/ 78obj-$(CONFIG_CRAMFS) += cramfs/
77obj-$(CONFIG_SQUASHFS) += squashfs/ 79obj-$(CONFIG_SQUASHFS) += squashfs/
78obj-y += ramfs/ 80obj-y += ramfs/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
new file mode 100644
index 000000000000..e55182a74605
--- /dev/null
+++ b/fs/adfs/Kconfig
@@ -0,0 +1,27 @@
1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
7 systems and the Acorn Archimedes range of machines. If you say Y
8 here, Linux will be able to read from ADFS partitions on hard drives
9 and from ADFS-formatted floppy discs. If you also want to be able to
10 write to those devices, say Y to "ADFS write support" below.
11
12 The ADFS partition should be the first partition (i.e.,
13 /dev/[hs]d?1) on each of your drives. Please read the file
14 <file:Documentation/filesystems/adfs.txt> for further details.
15
16 To compile this code as a module, choose M here: the module will be
17 called adfs.
18
19 If unsure, say N.
20
21config ADFS_FS_RW
22 bool "ADFS write support (DANGEROUS)"
23 depends on ADFS_FS
24 help
25 If you say Y here, you will be able to write to ADFS partitions on
26 hard drives and ADFS-formatted floppy disks. This is experimental
27 codes, so if you're unsure, say N.
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
new file mode 100644
index 000000000000..cfad9afb4762
--- /dev/null
+++ b/fs/affs/Kconfig
@@ -0,0 +1,21 @@
1config AFFS_FS
2 tristate "Amiga FFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 The Fast File System (FFS) is the common file system used on hard
6 disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
7 if you want to be able to read and write files from and to an Amiga
8 FFS partition on your hard drive. Amiga floppies however cannot be
9 read with this driver due to an incompatibility of the floppy
10 controller used in an Amiga and the standard floppy controller in
11 PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
12 and <file:fs/affs/Changes>.
13
14 With this driver you can also mount disk files used by Bernd
15 Schmidt's Un*X Amiga Emulator
16 (<http://www.freiburg.linux.de/~uae/>).
17 If you want to do this, you will also need to say Y or M to "Loop
18 device support", above.
19
20 To compile this file system support as a module, choose M here: the
21 module will be called affs. If unsure, say N.
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
new file mode 100644
index 000000000000..e7b522fe15e1
--- /dev/null
+++ b/fs/afs/Kconfig
@@ -0,0 +1,21 @@
1config AFS_FS
2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select AF_RXRPC
5 help
6 If you say Y here, you will get an experimental Andrew File System
7 driver. It currently only supports unsecured read-only AFS access.
8
9 See <file:Documentation/filesystems/afs.txt> for more information.
10
11 If unsure, say N.
12
13config AFS_DEBUG
14 bool "AFS dynamic debugging"
15 depends on AFS_FS
16 help
17 Say Y here to make runtime controllable debugging messages appear.
18
19 See <file:Documentation/filesystems/afs.txt> for more information.
20
21 If unsure, say N.
diff --git a/fs/aio.c b/fs/aio.c
index 8fa77e233944..76da12537956 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
443 req->private = NULL; 443 req->private = NULL;
444 req->ki_iovec = NULL; 444 req->ki_iovec = NULL;
445 INIT_LIST_HEAD(&req->ki_run_list); 445 INIT_LIST_HEAD(&req->ki_run_list);
446 req->ki_eventfd = ERR_PTR(-EINVAL); 446 req->ki_eventfd = NULL;
447 447
448 /* Check if the completion queue has enough free space to 448 /* Check if the completion queue has enough free space to
449 * accept an event from this io. 449 * accept an event from this io.
@@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
485{ 485{
486 assert_spin_locked(&ctx->ctx_lock); 486 assert_spin_locked(&ctx->ctx_lock);
487 487
488 if (!IS_ERR(req->ki_eventfd))
489 fput(req->ki_eventfd);
490 if (req->ki_dtor) 488 if (req->ki_dtor)
491 req->ki_dtor(req); 489 req->ki_dtor(req);
492 if (req->ki_iovec != &req->ki_inline_vec) 490 if (req->ki_iovec != &req->ki_inline_vec)
@@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data)
508 list_del(&req->ki_list); 506 list_del(&req->ki_list);
509 spin_unlock_irq(&fput_lock); 507 spin_unlock_irq(&fput_lock);
510 508
511 /* Complete the fput */ 509 /* Complete the fput(s) */
512 __fput(req->ki_filp); 510 if (req->ki_filp != NULL)
511 __fput(req->ki_filp);
512 if (req->ki_eventfd != NULL)
513 __fput(req->ki_eventfd);
513 514
514 /* Link the iocb into the context's free list */ 515 /* Link the iocb into the context's free list */
515 spin_lock_irq(&ctx->ctx_lock); 516 spin_lock_irq(&ctx->ctx_lock);
@@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data)
527 */ 528 */
528static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) 529static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
529{ 530{
531 int schedule_putreq = 0;
532
530 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", 533 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
531 req, atomic_long_read(&req->ki_filp->f_count)); 534 req, atomic_long_read(&req->ki_filp->f_count));
532 535
533 assert_spin_locked(&ctx->ctx_lock); 536 assert_spin_locked(&ctx->ctx_lock);
534 537
535 req->ki_users --; 538 req->ki_users--;
536 BUG_ON(req->ki_users < 0); 539 BUG_ON(req->ki_users < 0);
537 if (likely(req->ki_users)) 540 if (likely(req->ki_users))
538 return 0; 541 return 0;
@@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
540 req->ki_cancel = NULL; 543 req->ki_cancel = NULL;
541 req->ki_retry = NULL; 544 req->ki_retry = NULL;
542 545
543 /* Must be done under the lock to serialise against cancellation. 546 /*
544 * Call this aio_fput as it duplicates fput via the fput_work. 547 * Try to optimize the aio and eventfd file* puts, by avoiding to
548 * schedule work in case it is not __fput() time. In normal cases,
549 * we would not be holding the last reference to the file*, so
550 * this function will be executed w/out any aio kthread wakeup.
545 */ 551 */
546 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 552 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
553 schedule_putreq++;
554 else
555 req->ki_filp = NULL;
556 if (req->ki_eventfd != NULL) {
557 if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
558 schedule_putreq++;
559 else
560 req->ki_eventfd = NULL;
561 }
562 if (unlikely(schedule_putreq)) {
547 get_ioctx(ctx); 563 get_ioctx(ctx);
548 spin_lock(&fput_lock); 564 spin_lock(&fput_lock);
549 list_add(&req->ki_list, &fput_head); 565 list_add(&req->ki_list, &fput_head);
@@ -571,7 +587,7 @@ int aio_put_req(struct kiocb *req)
571static struct kioctx *lookup_ioctx(unsigned long ctx_id) 587static struct kioctx *lookup_ioctx(unsigned long ctx_id)
572{ 588{
573 struct mm_struct *mm = current->mm; 589 struct mm_struct *mm = current->mm;
574 struct kioctx *ctx = NULL; 590 struct kioctx *ctx, *ret = NULL;
575 struct hlist_node *n; 591 struct hlist_node *n;
576 592
577 rcu_read_lock(); 593 rcu_read_lock();
@@ -579,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
579 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 595 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
580 if (ctx->user_id == ctx_id && !ctx->dead) { 596 if (ctx->user_id == ctx_id && !ctx->dead) {
581 get_ioctx(ctx); 597 get_ioctx(ctx);
598 ret = ctx;
582 break; 599 break;
583 } 600 }
584 } 601 }
585 602
586 rcu_read_unlock(); 603 rcu_read_unlock();
587 return ctx; 604 return ret;
588} 605}
589 606
590/* 607/*
@@ -1009,7 +1026,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
1009 * eventfd. The eventfd_signal() function is safe to be called 1026 * eventfd. The eventfd_signal() function is safe to be called
1010 * from IRQ context. 1027 * from IRQ context.
1011 */ 1028 */
1012 if (!IS_ERR(iocb->ki_eventfd)) 1029 if (iocb->ki_eventfd != NULL)
1013 eventfd_signal(iocb->ki_eventfd, 1); 1030 eventfd_signal(iocb->ki_eventfd, 1);
1014 1031
1015put_rq: 1032put_rq:
@@ -1608,6 +1625,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1608 req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); 1625 req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
1609 if (IS_ERR(req->ki_eventfd)) { 1626 if (IS_ERR(req->ki_eventfd)) {
1610 ret = PTR_ERR(req->ki_eventfd); 1627 ret = PTR_ERR(req->ki_eventfd);
1628 req->ki_eventfd = NULL;
1611 goto out_put_req; 1629 goto out_put_req;
1612 } 1630 }
1613 } 1631 }
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
new file mode 100644
index 000000000000..5f3bea90911e
--- /dev/null
+++ b/fs/autofs/Kconfig
@@ -0,0 +1,21 @@
1config AUTOFS_FS
2 tristate "Kernel automounter support"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from the autofs
10 package; you can find the location in <file:Documentation/Changes>.
11 You also want to answer Y to "NFS file system support", below.
12
13 If you want to use the newer version of the automounter with more
14 features, say N here and say Y to "Kernel automounter v4 support",
15 below.
16
17 To compile this support as a module, choose M here: the module will be
18 called autofs.
19
20 If you are not a part of a fairly large, distributed network, you
21 probably do not need an automounter, and can say N here.
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
new file mode 100644
index 000000000000..1204d6384d39
--- /dev/null
+++ b/fs/autofs4/Kconfig
@@ -0,0 +1,20 @@
1config AUTOFS4_FS
2 tristate "Kernel automounter version 4 support (also supports v3)"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from
10 <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
11 want to answer Y to "NFS file system support", below.
12
13 To compile this support as a module, choose M here: the module will be
14 called autofs4. You will need to add "alias autofs autofs4" to your
15 modules configuration file.
16
17 If you are not a part of a fairly large, distributed network or
18 don't have a laptop which needs to dynamically reconfigure to the
19 local network, you probably do not need an automounter, and can say
20 N here.
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
new file mode 100644
index 000000000000..7835d30f211f
--- /dev/null
+++ b/fs/befs/Kconfig
@@ -0,0 +1,26 @@
1config BEFS_FS
2 tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select NLS
5 help
6 The BeOS File System (BeFS) is the native file system of Be, Inc's
7 BeOS. Notable features include support for arbitrary attributes
8 on files and directories, and database-like indices on selected
9 attributes. (Also note that this driver doesn't make those features
10 available at this time). It is a 64 bit filesystem, so it supports
11 extremely large volumes and files.
12
13 If you use this filesystem, you should also say Y to at least one
14 of the NLS (native language support) options below.
15
16 If you don't know what this is about, say N.
17
18 To compile this as a module, choose M here: the module will be
19 called befs.
20
21config BEFS_DEBUG
22 bool "Debug BeFS"
23 depends on BEFS_FS
24 help
25 If you say Y here, you can use the 'debug' mount option to enable
26 debugging output from the driver.
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
new file mode 100644
index 000000000000..c2336c62024f
--- /dev/null
+++ b/fs/bfs/Kconfig
@@ -0,0 +1,19 @@
1config BFS_FS
2 tristate "BFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 Boot File System (BFS) is a file system used under SCO UnixWare to
6 allow the bootloader access to the kernel image and other important
7 files during the boot process. It is usually mounted under /stand
8 and corresponds to the slice marked as "STAND" in the UnixWare
9 partition. You should say Y if you want to read or write the files
10 on your /stand slice from within Linux. You then also need to say Y
11 to "UnixWare slices support", below. More information about the BFS
12 file system is contained in the file
13 <file:Documentation/filesystems/bfs.txt>.
14
15 If you don't know what this is about, say N.
16
17 To compile this as a module, choose M here: the module will be called
18 bfs. Note that the file system of your root partition (the one
19 containing the directory /) cannot be compiled as a module.
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e3ff2b9e602f..33b7235f853b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1208 * check for an ELF header. If we find one, dump the first page to 1208 * check for an ELF header. If we find one, dump the first page to
1209 * aid in determining what was mapped here. 1209 * aid in determining what was mapped here.
1210 */ 1210 */
1211 if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) { 1211 if (FILTER(ELF_HEADERS) &&
1212 vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
1212 u32 __user *header = (u32 __user *) vma->vm_start; 1213 u32 __user *header = (u32 __user *) vma->vm_start;
1213 u32 word; 1214 u32 word;
1215 mm_segment_t fs = get_fs();
1214 /* 1216 /*
1215 * Doing it this way gets the constant folded by GCC. 1217 * Doing it this way gets the constant folded by GCC.
1216 */ 1218 */
@@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1223 magic.elfmag[EI_MAG1] = ELFMAG1; 1225 magic.elfmag[EI_MAG1] = ELFMAG1;
1224 magic.elfmag[EI_MAG2] = ELFMAG2; 1226 magic.elfmag[EI_MAG2] = ELFMAG2;
1225 magic.elfmag[EI_MAG3] = ELFMAG3; 1227 magic.elfmag[EI_MAG3] = ELFMAG3;
1226 if (get_user(word, header) == 0 && word == magic.cmp) 1228 /*
1229 * Switch to the user "segment" for get_user(),
1230 * then put back what elf_core_dump() had in place.
1231 */
1232 set_fs(USER_DS);
1233 if (unlikely(get_user(word, header)))
1234 word = 0;
1235 set_fs(fs);
1236 if (word == magic.cmp)
1227 return PAGE_SIZE; 1237 return PAGE_SIZE;
1228 } 1238 }
1229 1239
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d6..31c46a241bac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -26,23 +26,23 @@
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27 27
28static struct kmem_cache *bio_integrity_slab __read_mostly; 28static struct kmem_cache *bio_integrity_slab __read_mostly;
29static mempool_t *bio_integrity_pool;
30static struct bio_set *integrity_bio_set;
29static struct workqueue_struct *kintegrityd_wq; 31static struct workqueue_struct *kintegrityd_wq;
30 32
31/** 33/**
32 * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio 34 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
33 * @bio: bio to attach integrity metadata to 35 * @bio: bio to attach integrity metadata to
34 * @gfp_mask: Memory allocation mask 36 * @gfp_mask: Memory allocation mask
35 * @nr_vecs: Number of integrity metadata scatter-gather elements 37 * @nr_vecs: Number of integrity metadata scatter-gather elements
36 * @bs: bio_set to allocate from
37 * 38 *
38 * Description: This function prepares a bio for attaching integrity 39 * Description: This function prepares a bio for attaching integrity
39 * metadata. nr_vecs specifies the maximum number of pages containing 40 * metadata. nr_vecs specifies the maximum number of pages containing
40 * integrity metadata that can be attached. 41 * integrity metadata that can be attached.
41 */ 42 */
42struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, 43struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
43 gfp_t gfp_mask, 44 gfp_t gfp_mask,
44 unsigned int nr_vecs, 45 unsigned int nr_vecs)
45 struct bio_set *bs)
46{ 46{
47 struct bio_integrity_payload *bip; 47 struct bio_integrity_payload *bip;
48 struct bio_vec *iv; 48 struct bio_vec *iv;
@@ -50,7 +50,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
50 50
51 BUG_ON(bio == NULL); 51 BUG_ON(bio == NULL);
52 52
53 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); 53 bip = mempool_alloc(bio_integrity_pool, gfp_mask);
54 if (unlikely(bip == NULL)) { 54 if (unlikely(bip == NULL)) {
55 printk(KERN_ERR "%s: could not alloc bip\n", __func__); 55 printk(KERN_ERR "%s: could not alloc bip\n", __func__);
56 return NULL; 56 return NULL;
@@ -58,10 +58,10 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
58 58
59 memset(bip, 0, sizeof(*bip)); 59 memset(bip, 0, sizeof(*bip));
60 60
61 iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); 61 iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
62 if (unlikely(iv == NULL)) { 62 if (unlikely(iv == NULL)) {
63 printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); 63 printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
64 mempool_free(bip, bs->bio_integrity_pool); 64 mempool_free(bip, bio_integrity_pool);
65 return NULL; 65 return NULL;
66 } 66 }
67 67
@@ -72,35 +72,16 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
72 72
73 return bip; 73 return bip;
74} 74}
75EXPORT_SYMBOL(bio_integrity_alloc_bioset);
76
77/**
78 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
79 * @bio: bio to attach integrity metadata to
80 * @gfp_mask: Memory allocation mask
81 * @nr_vecs: Number of integrity metadata scatter-gather elements
82 *
83 * Description: This function prepares a bio for attaching integrity
84 * metadata. nr_vecs specifies the maximum number of pages containing
85 * integrity metadata that can be attached.
86 */
87struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
88 gfp_t gfp_mask,
89 unsigned int nr_vecs)
90{
91 return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
92}
93EXPORT_SYMBOL(bio_integrity_alloc); 75EXPORT_SYMBOL(bio_integrity_alloc);
94 76
95/** 77/**
96 * bio_integrity_free - Free bio integrity payload 78 * bio_integrity_free - Free bio integrity payload
97 * @bio: bio containing bip to be freed 79 * @bio: bio containing bip to be freed
98 * @bs: bio_set this bio was allocated from
99 * 80 *
100 * Description: Used to free the integrity portion of a bio. Usually 81 * Description: Used to free the integrity portion of a bio. Usually
101 * called from bio_free(). 82 * called from bio_free().
102 */ 83 */
103void bio_integrity_free(struct bio *bio, struct bio_set *bs) 84void bio_integrity_free(struct bio *bio)
104{ 85{
105 struct bio_integrity_payload *bip = bio->bi_integrity; 86 struct bio_integrity_payload *bip = bio->bi_integrity;
106 87
@@ -111,8 +92,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
111 && bip->bip_buf != NULL) 92 && bip->bip_buf != NULL)
112 kfree(bip->bip_buf); 93 kfree(bip->bip_buf);
113 94
114 bvec_free_bs(bs, bip->bip_vec, bip->bip_pool); 95 bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
115 mempool_free(bip, bs->bio_integrity_pool); 96 mempool_free(bip, bio_integrity_pool);
116 97
117 bio->bi_integrity = NULL; 98 bio->bi_integrity = NULL;
118} 99}
@@ -140,7 +121,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
140 121
141 iv = bip_vec_idx(bip, bip->bip_vcnt); 122 iv = bip_vec_idx(bip, bip->bip_vcnt);
142 BUG_ON(iv == NULL); 123 BUG_ON(iv == NULL);
143 BUG_ON(iv->bv_page != NULL);
144 124
145 iv->bv_page = page; 125 iv->bv_page = page;
146 iv->bv_len = len; 126 iv->bv_len = len;
@@ -465,7 +445,7 @@ static int bio_integrity_verify(struct bio *bio)
465 445
466 if (ret) { 446 if (ret) {
467 kunmap_atomic(kaddr, KM_USER0); 447 kunmap_atomic(kaddr, KM_USER0);
468 break; 448 return ret;
469 } 449 }
470 450
471 sectors = bv->bv_len / bi->sector_size; 451 sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +473,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
493 struct bio_integrity_payload *bip = 473 struct bio_integrity_payload *bip =
494 container_of(work, struct bio_integrity_payload, bip_work); 474 container_of(work, struct bio_integrity_payload, bip_work);
495 struct bio *bio = bip->bip_bio; 475 struct bio *bio = bip->bip_bio;
496 int error = bip->bip_error; 476 int error;
497 477
498 if (bio_integrity_verify(bio)) { 478 error = bio_integrity_verify(bio);
499 clear_bit(BIO_UPTODATE, &bio->bi_flags);
500 error = -EIO;
501 }
502 479
503 /* Restore original bio completion handler */ 480 /* Restore original bio completion handler */
504 bio->bi_end_io = bip->bip_end_io; 481 bio->bi_end_io = bip->bip_end_io;
505 482 bio_endio(bio, error);
506 if (bio->bi_end_io)
507 bio->bi_end_io(bio, error);
508} 483}
509 484
510/** 485/**
@@ -525,7 +500,17 @@ void bio_integrity_endio(struct bio *bio, int error)
525 500
526 BUG_ON(bip->bip_bio != bio); 501 BUG_ON(bip->bip_bio != bio);
527 502
528 bip->bip_error = error; 503 /* In case of an I/O error there is no point in verifying the
504 * integrity metadata. Restore original bio end_io handler
505 * and run it.
506 */
507 if (error) {
508 bio->bi_end_io = bip->bip_end_io;
509 bio_endio(bio, error);
510
511 return;
512 }
513
529 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); 514 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
530 queue_work(kintegrityd_wq, &bip->bip_work); 515 queue_work(kintegrityd_wq, &bip->bip_work);
531} 516}
@@ -681,19 +666,18 @@ EXPORT_SYMBOL(bio_integrity_split);
681 * bio_integrity_clone - Callback for cloning bios with integrity metadata 666 * bio_integrity_clone - Callback for cloning bios with integrity metadata
682 * @bio: New bio 667 * @bio: New bio
683 * @bio_src: Original bio 668 * @bio_src: Original bio
684 * @bs: bio_set to allocate bip from 669 * @gfp_mask: Memory allocation mask
685 * 670 *
686 * Description: Called to allocate a bip when cloning a bio 671 * Description: Called to allocate a bip when cloning a bio
687 */ 672 */
688int bio_integrity_clone(struct bio *bio, struct bio *bio_src, 673int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
689 struct bio_set *bs)
690{ 674{
691 struct bio_integrity_payload *bip_src = bio_src->bi_integrity; 675 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
692 struct bio_integrity_payload *bip; 676 struct bio_integrity_payload *bip;
693 677
694 BUG_ON(bip_src == NULL); 678 BUG_ON(bip_src == NULL);
695 679
696 bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); 680 bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
697 681
698 if (bip == NULL) 682 if (bip == NULL)
699 return -EIO; 683 return -EIO;
@@ -709,37 +693,25 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
709} 693}
710EXPORT_SYMBOL(bio_integrity_clone); 694EXPORT_SYMBOL(bio_integrity_clone);
711 695
712int bioset_integrity_create(struct bio_set *bs, int pool_size) 696static int __init bio_integrity_init(void)
713{ 697{
714 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, 698 kintegrityd_wq = create_workqueue("kintegrityd");
715 bio_integrity_slab);
716 if (!bs->bio_integrity_pool)
717 return -1;
718
719 return 0;
720}
721EXPORT_SYMBOL(bioset_integrity_create);
722 699
723void bioset_integrity_free(struct bio_set *bs) 700 if (!kintegrityd_wq)
724{ 701 panic("Failed to create kintegrityd\n");
725 if (bs->bio_integrity_pool)
726 mempool_destroy(bs->bio_integrity_pool);
727}
728EXPORT_SYMBOL(bioset_integrity_free);
729 702
730void __init bio_integrity_init_slab(void)
731{
732 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, 703 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
733 SLAB_HWCACHE_ALIGN|SLAB_PANIC); 704 SLAB_HWCACHE_ALIGN|SLAB_PANIC);
734}
735 705
736static int __init integrity_init(void) 706 bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
737{ 707 bio_integrity_slab);
738 kintegrityd_wq = create_workqueue("kintegrityd"); 708 if (!bio_integrity_pool)
709 panic("bio_integrity: can't allocate bip pool\n");
739 710
740 if (!kintegrityd_wq) 711 integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
741 panic("Failed to create kintegrityd\n"); 712 if (!integrity_bio_set)
713 panic("bio_integrity: can't allocate bio_set\n");
742 714
743 return 0; 715 return 0;
744} 716}
745subsys_initcall(integrity_init); 717subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 062299acbccd..a040cde7f6fd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,7 +248,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); 248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
249 249
250 if (bio_integrity(bio)) 250 if (bio_integrity(bio))
251 bio_integrity_free(bio, bs); 251 bio_integrity_free(bio);
252 252
253 /* 253 /*
254 * If we have front padding, adjust the bio pointer before freeing 254 * If we have front padding, adjust the bio pointer before freeing
@@ -301,47 +301,51 @@ void bio_init(struct bio *bio)
301 **/ 301 **/
302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
303{ 303{
304 struct bio_vec *bvl = NULL;
304 struct bio *bio = NULL; 305 struct bio *bio = NULL;
306 unsigned long idx = 0;
307 void *p = NULL;
305 308
306 if (bs) { 309 if (bs) {
307 void *p = mempool_alloc(bs->bio_pool, gfp_mask); 310 p = mempool_alloc(bs->bio_pool, gfp_mask);
308 311 if (!p)
309 if (p) 312 goto err;
310 bio = p + bs->front_pad; 313 bio = p + bs->front_pad;
311 } else 314 } else {
312 bio = kmalloc(sizeof(*bio), gfp_mask); 315 bio = kmalloc(sizeof(*bio), gfp_mask);
316 if (!bio)
317 goto err;
318 }
313 319
314 if (likely(bio)) { 320 bio_init(bio);
315 struct bio_vec *bvl = NULL; 321
316 322 if (unlikely(!nr_iovecs))
317 bio_init(bio); 323 goto out_set;
318 if (likely(nr_iovecs)) { 324
319 unsigned long uninitialized_var(idx); 325 if (nr_iovecs <= BIO_INLINE_VECS) {
320 326 bvl = bio->bi_inline_vecs;
321 if (nr_iovecs <= BIO_INLINE_VECS) { 327 nr_iovecs = BIO_INLINE_VECS;
322 idx = 0; 328 } else {
323 bvl = bio->bi_inline_vecs; 329 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
324 nr_iovecs = BIO_INLINE_VECS; 330 if (unlikely(!bvl))
325 } else { 331 goto err_free;
326 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, 332
327 bs); 333 nr_iovecs = bvec_nr_vecs(idx);
328 nr_iovecs = bvec_nr_vecs(idx);
329 }
330 if (unlikely(!bvl)) {
331 if (bs)
332 mempool_free(bio, bs->bio_pool);
333 else
334 kfree(bio);
335 bio = NULL;
336 goto out;
337 }
338 bio->bi_flags |= idx << BIO_POOL_OFFSET;
339 bio->bi_max_vecs = nr_iovecs;
340 }
341 bio->bi_io_vec = bvl;
342 } 334 }
343out: 335 bio->bi_flags |= idx << BIO_POOL_OFFSET;
336 bio->bi_max_vecs = nr_iovecs;
337out_set:
338 bio->bi_io_vec = bvl;
339
344 return bio; 340 return bio;
341
342err_free:
343 if (bs)
344 mempool_free(p, bs->bio_pool);
345 else
346 kfree(bio);
347err:
348 return NULL;
345} 349}
346 350
347struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) 351struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
@@ -462,10 +466,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
462 if (bio_integrity(bio)) { 466 if (bio_integrity(bio)) {
463 int ret; 467 int ret;
464 468
465 ret = bio_integrity_clone(b, bio, fs_bio_set); 469 ret = bio_integrity_clone(b, bio, gfp_mask);
466 470
467 if (ret < 0) 471 if (ret < 0) {
472 bio_put(b);
468 return NULL; 473 return NULL;
474 }
469 } 475 }
470 476
471 return b; 477 return b;
@@ -1523,7 +1529,6 @@ void bioset_free(struct bio_set *bs)
1523 if (bs->bio_pool) 1529 if (bs->bio_pool)
1524 mempool_destroy(bs->bio_pool); 1530 mempool_destroy(bs->bio_pool);
1525 1531
1526 bioset_integrity_free(bs);
1527 biovec_free_pools(bs); 1532 biovec_free_pools(bs);
1528 bio_put_slab(bs); 1533 bio_put_slab(bs);
1529 1534
@@ -1564,9 +1569,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1564 if (!bs->bio_pool) 1569 if (!bs->bio_pool)
1565 goto bad; 1570 goto bad;
1566 1571
1567 if (bioset_integrity_create(bs, pool_size))
1568 goto bad;
1569
1570 if (!biovec_create_pools(bs, pool_size)) 1572 if (!biovec_create_pools(bs, pool_size))
1571 return bs; 1573 return bs;
1572 1574
@@ -1583,6 +1585,13 @@ static void __init biovec_init_slabs(void)
1583 int size; 1585 int size;
1584 struct biovec_slab *bvs = bvec_slabs + i; 1586 struct biovec_slab *bvs = bvec_slabs + i;
1585 1587
1588#ifndef CONFIG_BLK_DEV_INTEGRITY
1589 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1590 bvs->slab = NULL;
1591 continue;
1592 }
1593#endif
1594
1586 size = bvs->nr_vecs * sizeof(struct bio_vec); 1595 size = bvs->nr_vecs * sizeof(struct bio_vec);
1587 bvs->slab = kmem_cache_create(bvs->name, size, 0, 1596 bvs->slab = kmem_cache_create(bvs->name, size, 0,
1588 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1597 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
@@ -1597,7 +1606,6 @@ static int __init init_bio(void)
1597 if (!bio_slabs) 1606 if (!bio_slabs)
1598 panic("bio: can't allocate bios\n"); 1607 panic("bio: can't allocate bios\n");
1599 1608
1600 bio_integrity_init_slab();
1601 biovec_init_slabs(); 1609 biovec_init_slabs();
1602 1610
1603 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); 1611 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 000000000000..7bb3c020e570
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,31 @@
1config BTRFS_FS
2 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
3 depends on EXPERIMENTAL
4 select LIBCRC32C
5 select ZLIB_INFLATE
6 select ZLIB_DEFLATE
7 help
8 Btrfs is a new filesystem with extents, writable snapshotting,
9 support for multiple devices and many more features.
10
11 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
12 FINALIZED. You should say N here unless you are interested in
13 testing Btrfs with non-critical data.
14
15 To compile this file system support as a module, choose M here. The
16 module will be called btrfs.
17
18 If unsure, say N.
19
20config BTRFS_FS_POSIX_ACL
21 bool "Btrfs POSIX Access Control Lists"
22 depends on BTRFS_FS
23 select FS_POSIX_ACL
24 help
25 POSIX Access Control Lists (ACLs) support permissions for users and
26 groups beyond the owner/group/world scheme.
27
28 To learn more about Access Control Lists, visit the POSIX ACLs for
29 Linux website <http://acl.bestbits.at/>.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe0..c84ca1f5259a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/kthread.h> 19#include <linux/kthread.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/spinlock.h> 21#include <linux/spinlock.h>
23# include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 24#include "async-thread.h"
25 25
26#define WORK_QUEUED_BIT 0 26#define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
143 struct btrfs_work *work; 143 struct btrfs_work *work;
144 do { 144 do {
145 spin_lock_irq(&worker->lock); 145 spin_lock_irq(&worker->lock);
146again_locked:
146 while (!list_empty(&worker->pending)) { 147 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next; 148 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list); 149 work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
165 check_idle_worker(worker); 166 check_idle_worker(worker);
166 167
167 } 168 }
168 worker->working = 0;
169 if (freezing(current)) { 169 if (freezing(current)) {
170 worker->working = 0;
171 spin_unlock_irq(&worker->lock);
170 refrigerator(); 172 refrigerator();
171 } else { 173 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock); 174 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop()) 175 if (!kthread_should_stop()) {
176 cpu_relax();
177 /*
178 * we've dropped the lock, did someone else
179 * jump_in?
180 */
181 smp_mb();
182 if (!list_empty(&worker->pending))
183 continue;
184
185 /*
186 * this short schedule allows more work to
187 * come in without the queue functions
188 * needing to go through wake_up_process()
189 *
190 * worker->working is still 1, so nobody
191 * is going to try and wake us up
192 */
193 schedule_timeout(1);
194 smp_mb();
195 if (!list_empty(&worker->pending))
196 continue;
197
198 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending))
202 goto again_locked;
203
204 /*
205 * this makes sure we get a wakeup when someone
206 * adds something new to the queue
207 */
208 worker->working = 0;
209 spin_unlock_irq(&worker->lock);
210
175 schedule(); 211 schedule();
212 }
176 __set_current_state(TASK_RUNNING); 213 __set_current_state(TASK_RUNNING);
177 } 214 }
178 } while (!kthread_should_stop()); 215 } while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
350{ 387{
351 struct btrfs_worker_thread *worker = work->worker; 388 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags; 389 unsigned long flags;
390 int wake = 0;
353 391
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 392 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out; 393 goto out;
356 394
357 spin_lock_irqsave(&worker->lock, flags); 395 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending); 396 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending);
360 398
361 /* by definition we're busy, take ourselves off the idle 399 /* by definition we're busy, take ourselves off the idle
362 * list 400 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
368 &worker->workers->worker_list); 406 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags); 407 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 } 408 }
409 if (!worker->working) {
410 wake = 1;
411 worker->working = 1;
412 }
371 413
372 spin_unlock_irqrestore(&worker->lock, flags); 414 spin_unlock_irqrestore(&worker->lock, flags);
373 415 if (wake)
416 wake_up_process(worker->task);
374out: 417out:
418
375 return 0; 419 return 0;
376} 420}
377 421
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
398 } 442 }
399 443
400 spin_lock_irqsave(&worker->lock, flags); 444 spin_lock_irqsave(&worker->lock, flags);
445
446 list_add_tail(&work->list, &worker->pending);
401 atomic_inc(&worker->num_pending); 447 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker); 448 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404 449
405 /* 450 /*
406 * avoid calling into wake_up_process if this thread has already 451 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75ac..72677ce2b74f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info;
71
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big 72 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this. 73 * enough field for this.
71 */ 74 */
@@ -94,6 +97,11 @@ struct btrfs_inode {
94 */ 97 */
95 u64 delalloc_bytes; 98 u64 delalloc_bytes;
96 99
100 /* total number of bytes that may be used for this inode for
101 * delalloc
102 */
103 u64 reserved_bytes;
104
97 /* 105 /*
98 * the size of the file stored in the metadata on disk. data=ordered 106 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk 107 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d9..ab07627084f1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h> 35#include <linux/pagevec.h>
37#include "compat.h" 36#include "compat.h"
38#include "ctree.h" 37#include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c0776816..37f31b5529aa 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot); 39 struct btrfs_path *path, int level, int slot);
40 40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void) 41struct btrfs_path *btrfs_alloc_path(void)
47{ 42{
48 struct btrfs_path *path; 43 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); 44 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) { 45 if (path)
51 btrfs_init_path(path);
52 path->reada = 1; 46 path->reada = 1;
53 }
54 return path; 47 return path;
55} 48}
56 49
50/*
51 * set all locked nodes in the path to blocking locks. This should
52 * be done before scheduling
53 */
54noinline void btrfs_set_path_blocking(struct btrfs_path *p)
55{
56 int i;
57 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
58 if (p->nodes[i] && p->locks[i])
59 btrfs_set_lock_blocking(p->nodes[i]);
60 }
61}
62
63/*
64 * reset all the locked nodes in the patch to spinning locks.
65 *
66 * held is used to keep lockdep happy, when lockdep is enabled
67 * we set held to a blocking lock before we go around and
68 * retake all the spinlocks in the path. You can safely use NULL
69 * for held
70 */
71noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72 struct extent_buffer *held)
73{
74 int i;
75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77 /* lockdep really cares that we take all of these spinlocks
78 * in the right order. If any of the locks in the path are not
79 * currently blocking, it is going to complain. So, make really
80 * really sure by forcing the path to blocking before we clear
81 * the path blocking.
82 */
83 if (held)
84 btrfs_set_lock_blocking(held);
85 btrfs_set_path_blocking(p);
86#endif
87
88 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 if (p->nodes[i] && p->locks[i])
90 btrfs_clear_lock_blocking(p->nodes[i]);
91 }
92
93#ifdef CONFIG_DEBUG_LOCK_ALLOC
94 if (held)
95 btrfs_clear_lock_blocking(held);
96#endif
97}
98
57/* this also releases the path */ 99/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 100void btrfs_free_path(struct btrfs_path *p)
59{ 101{
@@ -235,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
235 if (*cow_ret == buf) 277 if (*cow_ret == buf)
236 unlock_orig = 1; 278 unlock_orig = 1;
237 279
238 WARN_ON(!btrfs_tree_locked(buf)); 280 btrfs_assert_tree_locked(buf);
239 281
240 if (parent) 282 if (parent)
241 parent_start = parent->start; 283 parent_start = parent->start;
@@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
261 trans->transid, level, &ins); 303 trans->transid, level, &ins);
262 BUG_ON(ret); 304 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest, 305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len); 306 buf->len, level);
265 } else { 307 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len, 308 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start, 309 parent_start,
@@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 314 if (IS_ERR(cow))
273 return PTR_ERR(cow); 315 return PTR_ERR(cow);
274 316
317 /* cow is set to blocking by btrfs_init_new_buffer */
318
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 319 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 320 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 321 btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
388 WARN_ON(1); 432 WARN_ON(1);
389 } 433 }
390 434
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid && 435 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid && 436 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf; 438 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest); 439 WARN_ON(prealloc_dest);
398 return 0; 440 return 0;
399 } 441 }
400 spin_unlock(&root->fs_info->hash_lock); 442
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 443 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
444
445 if (parent)
446 btrfs_set_lock_blocking(parent);
447 btrfs_set_lock_blocking(buf);
448
402 ret = __btrfs_cow_block(trans, root, buf, parent, 449 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0, 450 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest); 451 prealloc_dest);
@@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
504 if (parent_nritems == 1) 551 if (parent_nritems == 1)
505 return 0; 552 return 0;
506 553
554 btrfs_set_lock_blocking(parent);
555
507 for (i = start_slot; i < end_slot; i++) { 556 for (i = start_slot; i < end_slot; i++) {
508 int close = 1; 557 int close = 1;
509 558
@@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
564 search_start = last_block; 613 search_start = last_block;
565 614
566 btrfs_tree_lock(cur); 615 btrfs_tree_lock(cur);
616 btrfs_set_lock_blocking(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i, 617 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start, 618 &cur, search_start,
569 min(16 * blocksize, 619 min(16 * blocksize,
@@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 return 0; 912 return 0;
863 913
864 mid = path->nodes[level]; 914 mid = path->nodes[level];
915
865 WARN_ON(!path->locks[level]); 916 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 917 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 918
@@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
883 934
884 /* promote the child to a root */ 935 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0); 936 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child); 937 BUG_ON(!child);
938 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret); 941 BUG_ON(ret);
890 942
@@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
900 952
901 add_root_to_dirty_list(root); 953 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child); 954 btrfs_tree_unlock(child);
955
903 path->locks[level] = 0; 956 path->locks[level] = 0;
904 path->nodes[level] = NULL; 957 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid); 958 clean_tree_block(trans, root, mid);
@@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
924 left = read_node_slot(root, parent, pslot - 1); 977 left = read_node_slot(root, parent, pslot - 1);
925 if (left) { 978 if (left) {
926 btrfs_tree_lock(left); 979 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left);
927 wret = btrfs_cow_block(trans, root, left, 981 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0); 982 parent, pslot - 1, &left, 0);
929 if (wret) { 983 if (wret) {
@@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
934 right = read_node_slot(root, parent, pslot + 1); 988 right = read_node_slot(root, parent, pslot + 1);
935 if (right) { 989 if (right) {
936 btrfs_tree_lock(right); 990 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right);
937 wret = btrfs_cow_block(trans, root, right, 992 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0); 993 parent, pslot + 1, &right, 0);
939 if (wret) { 994 if (wret) {
@@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 u32 left_nr; 1164 u32 left_nr;
1110 1165
1111 btrfs_tree_lock(left); 1166 btrfs_tree_lock(left);
1167 btrfs_set_lock_blocking(left);
1168
1112 left_nr = btrfs_header_nritems(left); 1169 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1170 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1; 1171 wret = 1;
@@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1155 */ 1212 */
1156 if (right) { 1213 if (right) {
1157 u32 right_nr; 1214 u32 right_nr;
1215
1158 btrfs_tree_lock(right); 1216 btrfs_tree_lock(right);
1217 btrfs_set_lock_blocking(right);
1218
1159 right_nr = btrfs_header_nritems(right); 1219 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1220 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1; 1221 wret = 1;
@@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1210 struct btrfs_disk_key disk_key; 1270 struct btrfs_disk_key disk_key;
1211 u32 nritems; 1271 u32 nritems;
1212 u64 search; 1272 u64 search;
1213 u64 lowest_read; 1273 u64 target;
1214 u64 highest_read;
1215 u64 nread = 0; 1274 u64 nread = 0;
1216 int direction = path->reada; 1275 int direction = path->reada;
1217 struct extent_buffer *eb; 1276 struct extent_buffer *eb;
@@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1235 return; 1294 return;
1236 } 1295 }
1237 1296
1238 highest_read = search; 1297 target = search;
1239 lowest_read = search;
1240 1298
1241 nritems = btrfs_header_nritems(node); 1299 nritems = btrfs_header_nritems(node);
1242 nr = slot; 1300 nr = slot;
@@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
1256 break; 1314 break;
1257 } 1315 }
1258 search = btrfs_node_blockptr(node, nr); 1316 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) || 1317 if ((search <= target && target - search <= 65536) ||
1260 (search < lowest_read && lowest_read - search <= 16384) || 1318 (search > target && search - target <= 65536)) {
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize, 1319 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr)); 1320 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize; 1321 nread += blocksize;
1265 } 1322 }
1266 nscan++; 1323 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) 1324 if ((nread > 65536 || nscan > 32))
1268 break; 1325 break;
1326 }
1327}
1269 1328
1270 if (nread > (256 * 1024) || nscan > 128) 1329/*
1271 break; 1330 * returns -EAGAIN if it had to drop the path, or zero if everything was in
1331 * cache
1332 */
1333static noinline int reada_for_balance(struct btrfs_root *root,
1334 struct btrfs_path *path, int level)
1335{
1336 int slot;
1337 int nritems;
1338 struct extent_buffer *parent;
1339 struct extent_buffer *eb;
1340 u64 gen;
1341 u64 block1 = 0;
1342 u64 block2 = 0;
1343 int ret = 0;
1344 int blocksize;
1345
1346 parent = path->nodes[level - 1];
1347 if (!parent)
1348 return 0;
1272 1349
1273 if (search < lowest_read) 1350 nritems = btrfs_header_nritems(parent);
1274 lowest_read = search; 1351 slot = path->slots[level];
1275 if (search > highest_read) 1352 blocksize = btrfs_level_size(root, level);
1276 highest_read = search; 1353
1354 if (slot > 0) {
1355 block1 = btrfs_node_blockptr(parent, slot - 1);
1356 gen = btrfs_node_ptr_generation(parent, slot - 1);
1357 eb = btrfs_find_tree_block(root, block1, blocksize);
1358 if (eb && btrfs_buffer_uptodate(eb, gen))
1359 block1 = 0;
1360 free_extent_buffer(eb);
1361 }
1362 if (slot < nritems) {
1363 block2 = btrfs_node_blockptr(parent, slot + 1);
1364 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365 eb = btrfs_find_tree_block(root, block2, blocksize);
1366 if (eb && btrfs_buffer_uptodate(eb, gen))
1367 block2 = 0;
1368 free_extent_buffer(eb);
1277 } 1369 }
1370 if (block1 || block2) {
1371 ret = -EAGAIN;
1372 btrfs_release_path(root, path);
1373 if (block1)
1374 readahead_tree_block(root, block1, blocksize, 0);
1375 if (block2)
1376 readahead_tree_block(root, block2, blocksize, 0);
1377
1378 if (block1) {
1379 eb = read_tree_block(root, block1, blocksize, 0);
1380 free_extent_buffer(eb);
1381 }
1382 if (block1) {
1383 eb = read_tree_block(root, block2, blocksize, 0);
1384 free_extent_buffer(eb);
1385 }
1386 }
1387 return ret;
1278} 1388}
1279 1389
1390
1280/* 1391/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers 1392 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because 1393 * in the tree. The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1328} 1439}
1329 1440
1330/* 1441/*
1442 * This releases any locks held in the path starting at level and
1443 * going all the way up to the root.
1444 *
1445 * btrfs_search_slot will keep the lock held on higher nodes in a few
1446 * corner cases, such as COW of the block at slot zero in the node. This
1447 * ignores those rules, and it should only be called when there are no
1448 * more updates to be done higher up in the tree.
1449 */
1450noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1451{
1452 int i;
1453
1454 if (path->keep_locks || path->lowest_level)
1455 return;
1456
1457 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1458 if (!path->nodes[i])
1459 continue;
1460 if (!path->locks[i])
1461 continue;
1462 btrfs_tree_unlock(path->nodes[i]);
1463 path->locks[i] = 0;
1464 }
1465}
1466
1467/*
1331 * look for key in the tree. path is filled in with nodes along the way 1468 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf 1469 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0) 1470 * level of the path (level 0)
@@ -1387,32 +1524,30 @@ again:
1387 int wret; 1524 int wret;
1388 1525
1389 /* is a cow on this block not required */ 1526 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid && 1527 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid && 1528 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done; 1530 goto cow_done;
1396 } 1531 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398 1532
1399 /* ok, we have to cow, is our old prealloc the right 1533 /* ok, we have to cow, is our old prealloc the right
1400 * size? 1534 * size?
1401 */ 1535 */
1402 if (prealloc_block.objectid && 1536 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) { 1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1404 btrfs_free_reserved_extent(root, 1539 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid, 1540 prealloc_block.objectid,
1406 prealloc_block.offset); 1541 prealloc_block.offset);
1407 prealloc_block.objectid = 0; 1542 prealloc_block.objectid = 0;
1543 goto again;
1408 } 1544 }
1409 1545
1410 /* 1546 /*
1411 * for higher level blocks, try not to allocate blocks 1547 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held. 1548 * with the block and the parent locks held.
1413 */ 1549 */
1414 if (level > 1 && !prealloc_block.objectid && 1550 if (level > 0 && !prealloc_block.objectid) {
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len; 1551 u32 size = b->len;
1417 u64 hint = b->start; 1552 u64 hint = b->start;
1418 1553
@@ -1425,6 +1560,8 @@ again:
1425 goto again; 1560 goto again;
1426 } 1561 }
1427 1562
1563 btrfs_set_path_blocking(p);
1564
1428 wret = btrfs_cow_block(trans, root, b, 1565 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1], 1566 p->nodes[level + 1],
1430 p->slots[level + 1], 1567 p->slots[level + 1],
@@ -1446,6 +1583,22 @@ cow_done:
1446 if (!p->skip_locking) 1583 if (!p->skip_locking)
1447 p->locks[level] = 1; 1584 p->locks[level] = 1;
1448 1585
1586 btrfs_clear_path_blocking(p, NULL);
1587
1588 /*
1589 * we have a lock on b and as long as we aren't changing
1590 * the tree, there is no way to for the items in b to change.
1591 * It is safe to drop the lock on our parent before we
1592 * go through the expensive btree search on b.
1593 *
1594 * If cow is true, then we might be changing slot zero,
1595 * which may require changing the parent. So, we can't
1596 * drop the lock until after we know which slot we're
1597 * operating on.
1598 */
1599 if (!cow)
1600 btrfs_unlock_up_safe(p, level + 1);
1601
1449 ret = check_block(root, p, level); 1602 ret = check_block(root, p, level);
1450 if (ret) { 1603 if (ret) {
1451 ret = -1; 1604 ret = -1;
@@ -1453,6 +1606,7 @@ cow_done:
1453 } 1606 }
1454 1607
1455 ret = bin_search(b, key, level, &slot); 1608 ret = bin_search(b, key, level, &slot);
1609
1456 if (level != 0) { 1610 if (level != 0) {
1457 if (ret && slot > 0) 1611 if (ret && slot > 0)
1458 slot -= 1; 1612 slot -= 1;
@@ -1460,7 +1614,16 @@ cow_done:
1460 if ((p->search_for_split || ins_len > 0) && 1614 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >= 1615 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level); 1617 int sret;
1618
1619 sret = reada_for_balance(root, p, level);
1620 if (sret)
1621 goto again;
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1464 BUG_ON(sret > 0); 1627 BUG_ON(sret > 0);
1465 if (sret) { 1628 if (sret) {
1466 ret = sret; 1629 ret = sret;
@@ -1468,9 +1631,19 @@ cow_done:
1468 } 1631 }
1469 b = p->nodes[level]; 1632 b = p->nodes[level];
1470 slot = p->slots[level]; 1633 slot = p->slots[level];
1471 } else if (ins_len < 0) { 1634 } else if (ins_len < 0 &&
1472 int sret = balance_level(trans, root, p, 1635 btrfs_header_nritems(b) <
1473 level); 1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646
1474 if (sret) { 1647 if (sret) {
1475 ret = sret; 1648 ret = sret;
1476 goto done; 1649 goto done;
@@ -1504,7 +1677,7 @@ cow_done:
1504 * of the btree by dropping locks before 1677 * of the btree by dropping locks before
1505 * we read. 1678 * we read.
1506 */ 1679 */
1507 if (level > 1) { 1680 if (level > 0) {
1508 btrfs_release_path(NULL, p); 1681 btrfs_release_path(NULL, p);
1509 if (tmp) 1682 if (tmp)
1510 free_extent_buffer(tmp); 1683 free_extent_buffer(tmp);
@@ -1519,6 +1692,7 @@ cow_done:
1519 free_extent_buffer(tmp); 1692 free_extent_buffer(tmp);
1520 goto again; 1693 goto again;
1521 } else { 1694 } else {
1695 btrfs_set_path_blocking(p);
1522 if (tmp) 1696 if (tmp)
1523 free_extent_buffer(tmp); 1697 free_extent_buffer(tmp);
1524 if (should_reada) 1698 if (should_reada)
@@ -1528,14 +1702,29 @@ cow_done:
1528 b = read_node_slot(root, b, slot); 1702 b = read_node_slot(root, b, slot);
1529 } 1703 }
1530 } 1704 }
1531 if (!p->skip_locking) 1705 if (!p->skip_locking) {
1532 btrfs_tree_lock(b); 1706 int lret;
1707
1708 btrfs_clear_path_blocking(p, NULL);
1709 lret = btrfs_try_spin_lock(b);
1710
1711 if (!lret) {
1712 btrfs_set_path_blocking(p);
1713 btrfs_tree_lock(b);
1714 btrfs_clear_path_blocking(p, b);
1715 }
1716 }
1533 } else { 1717 } else {
1534 p->slots[level] = slot; 1718 p->slots[level] = slot;
1535 if (ins_len > 0 && 1719 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) { 1720 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key, 1721 int sret;
1722
1723 btrfs_set_path_blocking(p);
1724 sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0); 1725 p, ins_len, ret == 0);
1726 btrfs_clear_path_blocking(p, NULL);
1727
1539 BUG_ON(sret > 0); 1728 BUG_ON(sret > 0);
1540 if (sret) { 1729 if (sret) {
1541 ret = sret; 1730 ret = sret;
@@ -1549,12 +1738,16 @@ cow_done:
1549 } 1738 }
1550 ret = 1; 1739 ret = 1;
1551done: 1740done:
1741 /*
1742 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking
1744 */
1745 btrfs_set_path_blocking(p);
1552 if (prealloc_block.objectid) { 1746 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root, 1747 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid, 1748 prealloc_block.objectid,
1555 prealloc_block.offset); 1749 prealloc_block.offset);
1556 } 1750 }
1557
1558 return ret; 1751 return ret;
1559} 1752}
1560 1753
@@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret); 1772 BUG_ON(ret);
1580 1773
1774 btrfs_set_lock_blocking(eb);
1775
1581 parent = eb; 1776 parent = eb;
1582 while (1) { 1777 while (1) {
1583 level = btrfs_header_level(parent); 1778 level = btrfs_header_level(parent);
@@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1602 eb = read_tree_block(root, bytenr, blocksize, 1797 eb = read_tree_block(root, bytenr, blocksize,
1603 generation); 1798 generation);
1604 btrfs_tree_lock(eb); 1799 btrfs_tree_lock(eb);
1800 btrfs_set_lock_blocking(eb);
1605 } 1801 }
1606 1802
1607 /* 1803 /*
@@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1626 eb = read_tree_block(root, bytenr, blocksize, 1822 eb = read_tree_block(root, bytenr, blocksize,
1627 generation); 1823 generation);
1628 btrfs_tree_lock(eb); 1824 btrfs_tree_lock(eb);
1825 btrfs_set_lock_blocking(eb);
1629 } 1826 }
1630 1827
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1828 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2168,10 +2365,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2168 if (slot >= btrfs_header_nritems(upper) - 1) 2365 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1; 2366 return 1;
2170 2367
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1])); 2368 btrfs_assert_tree_locked(path->nodes[1]);
2172 2369
2173 right = read_node_slot(root, upper, slot + 1); 2370 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right); 2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2175 free_space = btrfs_leaf_free_space(root, right); 2374 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size) 2375 if (free_space < data_size)
2177 goto out_unlock; 2376 goto out_unlock;
@@ -2363,10 +2562,12 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2363 if (right_nritems == 0) 2562 if (right_nritems == 0)
2364 return 1; 2563 return 1;
2365 2564
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1])); 2565 btrfs_assert_tree_locked(path->nodes[1]);
2367 2566
2368 left = read_node_slot(root, path->nodes[1], slot - 1); 2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left); 2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2370 free_space = btrfs_leaf_free_space(root, left); 2571 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) { 2572 if (free_space < data_size) {
2372 ret = 1; 2573 ret = 1;
@@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2825 path->keep_locks = 0; 3026 path->keep_locks = 0;
2826 BUG_ON(ret); 3027 BUG_ON(ret);
2827 3028
3029 /*
3030 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state
3032 */
3033 btrfs_set_path_blocking(path);
3034
2828 leaf = path->nodes[0]; 3035 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 3037
@@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3354 BUG(); 3561 BUG();
3355 } 3562 }
3356out: 3563out:
3564 btrfs_unlock_up_safe(path, 1);
3357 return ret; 3565 return ret;
3358} 3566}
3359 3567
@@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3441{ 3649{
3442 int ret; 3650 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3651 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3652 u64 parent_start = path->nodes[1]->start;
3653 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3444 3654
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3655 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret) 3656 if (ret)
3447 return ret; 3657 return ret;
3448 3658
3659 /*
3660 * btrfs_free_extent is expensive, we want to make sure we
3661 * aren't holding any locks when we call it
3662 */
3663 btrfs_unlock_up_safe(path, 0);
3664
3449 ret = btrfs_free_extent(trans, root, bytenr, 3665 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0), 3666 btrfs_level_size(root, 0),
3451 path->nodes[1]->start, 3667 parent_start, parent_owner,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1); 3668 root_gen, 0, 1);
3454 return ret; 3669 return ret;
3455} 3670}
@@ -3721,6 +3936,7 @@ find_next_key:
3721 */ 3936 */
3722 if (slot >= nritems) { 3937 if (slot >= nritems) {
3723 path->slots[level] = slot; 3938 path->slots[level] = slot;
3939 btrfs_set_path_blocking(path);
3724 sret = btrfs_find_next_key(root, path, min_key, level, 3940 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans); 3941 cache_only, min_trans);
3726 if (sret == 0) { 3942 if (sret == 0) {
@@ -3738,16 +3954,20 @@ find_next_key:
3738 unlock_up(path, level, 1); 3954 unlock_up(path, level, 1);
3739 goto out; 3955 goto out;
3740 } 3956 }
3957 btrfs_set_path_blocking(path);
3741 cur = read_node_slot(root, cur, slot); 3958 cur = read_node_slot(root, cur, slot);
3742 3959
3743 btrfs_tree_lock(cur); 3960 btrfs_tree_lock(cur);
3961
3744 path->locks[level - 1] = 1; 3962 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur; 3963 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1); 3964 unlock_up(path, level, 1);
3965 btrfs_clear_path_blocking(path, NULL);
3747 } 3966 }
3748out: 3967out:
3749 if (ret == 0) 3968 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key)); 3969 memcpy(min_key, &found_key, sizeof(found_key));
3970 btrfs_set_path_blocking(path);
3751 return ret; 3971 return ret;
3752} 3972}
3753 3973
@@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3843 if (ret < 0) 4063 if (ret < 0)
3844 return ret; 4064 return ret;
3845 4065
4066 btrfs_set_path_blocking(path);
3846 nritems = btrfs_header_nritems(path->nodes[0]); 4067 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /* 4068 /*
3848 * by releasing the path above we dropped all our locks. A balance 4069 * by releasing the path above we dropped all our locks. A balance
@@ -3873,14 +4094,16 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3873 free_extent_buffer(next); 4094 free_extent_buffer(next);
3874 } 4095 }
3875 4096
4097 /* the path was set to blocking above */
3876 if (level == 1 && (path->locks[1] || path->skip_locking) && 4098 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada) 4099 path->reada)
3878 reada_for_search(root, path, level, slot, 0); 4100 reada_for_search(root, path, level, slot, 0);
3879 4101
3880 next = read_node_slot(root, c, slot); 4102 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) { 4103 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c)); 4104 btrfs_assert_tree_locked(c);
3883 btrfs_tree_lock(next); 4105 btrfs_tree_lock(next);
4106 btrfs_set_lock_blocking(next);
3884 } 4107 }
3885 break; 4108 break;
3886 } 4109 }
@@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3897 path->locks[level] = 1; 4120 path->locks[level] = 1;
3898 if (!level) 4121 if (!level)
3899 break; 4122 break;
4123
4124 btrfs_set_path_blocking(path);
3900 if (level == 1 && path->locks[1] && path->reada) 4125 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0); 4126 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0); 4127 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) { 4128 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4129 btrfs_assert_tree_locked(path->nodes[level]);
3905 btrfs_tree_lock(next); 4130 btrfs_tree_lock(next);
4131 btrfs_set_lock_blocking(next);
3906 } 4132 }
3907 } 4133 }
3908done: 4134done:
@@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3927 4153
3928 while (1) { 4154 while (1) {
3929 if (path->slots[0] == 0) { 4155 if (path->slots[0] == 0) {
4156 btrfs_set_path_blocking(path);
3930 ret = btrfs_prev_leaf(root, path); 4157 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0) 4158 if (ret != 0)
3932 return ret; 4159 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f88113..5e1d4e30e9d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1) 44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45 45
46#ifdef CONFIG_LOCKDEP 46#define BTRFS_MAX_LEVEL 8
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51 47
52/* holds pointers to all of the tree roots */ 48/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL 49#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -454,17 +450,11 @@ struct btrfs_timespec {
454 __le32 nsec; 450 __le32 nsec;
455} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
456 452
457typedef enum { 453enum btrfs_compression_type {
458 BTRFS_COMPRESS_NONE = 0, 454 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1, 455 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2, 456 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type; 457};
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468 458
469struct btrfs_inode_item { 459struct btrfs_inode_item {
470 /* nfs style generation number */ 460 /* nfs style generation number */
@@ -606,13 +596,27 @@ struct btrfs_block_group_item {
606 596
607struct btrfs_space_info { 597struct btrfs_space_info {
608 u64 flags; 598 u64 flags;
609 u64 total_bytes; 599
610 u64 bytes_used; 600 u64 total_bytes; /* total bytes in the space */
611 u64 bytes_pinned; 601 u64 bytes_used; /* total bytes used on disk */
612 u64 bytes_reserved; 602 u64 bytes_pinned; /* total bytes pinned, will be freed when the
613 u64 bytes_readonly; 603 transaction finishes */
614 int full; 604 u64 bytes_reserved; /* total bytes the allocator has reserved for
615 int force_alloc; 605 current allocations */
606 u64 bytes_readonly; /* total bytes that are read only */
607
608 /* delalloc accounting */
609 u64 bytes_delalloc; /* number of bytes reserved for allocation,
610 this space is not necessarily reserved yet
611 by the allocator */
612 u64 bytes_may_use; /* number of bytes that may be used for
613 delalloc */
614
615 int full; /* indicates that we cannot allocate any more
616 chunks for this space */
617 int force_alloc; /* set if we need to force a chunk alloc for
618 this space */
619
616 struct list_head list; 620 struct list_head list;
617 621
618 /* for block groups in our same type */ 622 /* for block groups in our same type */
@@ -701,9 +705,7 @@ struct btrfs_fs_info {
701 struct btrfs_transaction *running_transaction; 705 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle; 706 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait; 707 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait; 708 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707 709
708 struct btrfs_super_block super_copy; 710 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit; 711 struct btrfs_super_block super_for_commit;
@@ -711,7 +713,6 @@ struct btrfs_fs_info {
711 struct super_block *sb; 713 struct super_block *sb;
712 struct inode *btree_inode; 714 struct inode *btree_inode;
713 struct backing_dev_info bdi; 715 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex; 716 struct mutex trans_mutex;
716 struct mutex tree_log_mutex; 717 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex; 718 struct mutex transaction_kthread_mutex;
@@ -730,10 +731,6 @@ struct btrfs_fs_info {
730 atomic_t async_submit_draining; 731 atomic_t async_submit_draining;
731 atomic_t nr_async_bios; 732 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages; 733 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737 734
738 /* 735 /*
739 * this is used by the balancing code to wait for all the pending 736 * this is used by the balancing code to wait for all the pending
@@ -787,7 +784,14 @@ struct btrfs_fs_info {
787 struct list_head dirty_cowonly_roots; 784 struct list_head dirty_cowonly_roots;
788 785
789 struct btrfs_fs_devices *fs_devices; 786 struct btrfs_fs_devices *fs_devices;
787
788 /*
789 * the space_info list is almost entirely read only. It only changes
790 * when we add a new raid type to the FS, and that happens
791 * very rarely. RCU is used to protect it.
792 */
790 struct list_head space_info; 793 struct list_head space_info;
794
791 spinlock_t delalloc_lock; 795 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock; 796 spinlock_t new_trans_lock;
793 u64 delalloc_bytes; 797 u64 delalloc_bytes;
@@ -833,7 +837,14 @@ struct btrfs_root {
833 struct kobject root_kobj; 837 struct kobject root_kobj;
834 struct completion kobj_unregister; 838 struct completion kobj_unregister;
835 struct mutex objectid_mutex; 839 struct mutex objectid_mutex;
840
836 struct mutex log_mutex; 841 struct mutex log_mutex;
842 wait_queue_head_t log_writer_wait;
843 wait_queue_head_t log_commit_wait[2];
844 atomic_t log_writers;
845 atomic_t log_commit[2];
846 unsigned long log_transid;
847 unsigned long log_batch;
837 848
838 u64 objectid; 849 u64 objectid;
839 u64 last_trans; 850 u64 last_trans;
@@ -1721,7 +1732,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1721 u64 empty_size); 1732 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1733struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root, 1734 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize); 1735 u64 bytenr, u32 blocksize,
1736 int level);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1737int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root, 1738 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes, 1739 u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1791,6 +1803,18 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root); 1803int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 1804int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1805u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1806void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1807void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
1808
1809int btrfs_check_metadata_free_space(struct btrfs_root *root);
1810int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1811 u64 bytes);
1812void btrfs_free_reserved_data_space(struct btrfs_root *root,
1813 struct inode *inode, u64 bytes);
1814void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1815 u64 bytes);
1816void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1817 u64 bytes);
1794/* ctree.c */ 1818/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root, 1819int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid, 1820 struct btrfs_path *path, u64 min_objectid,
@@ -1840,7 +1864,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 1864void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void); 1865struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p); 1866void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p); 1867void btrfs_set_path_blocking(struct btrfs_path *p);
1868void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1869
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1870int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr); 1871 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1872int btrfs_del_leaf(struct btrfs_trans_handle *trans,
@@ -2034,8 +2060,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2034unsigned long btrfs_force_ra(struct address_space *mapping, 2060unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file, 2061 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index); 2062 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2063int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page); 2064int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode); 2065void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae5..6ec80c0fc869 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/scatterlist.h> 21#include <linux/scatterlist.h>
@@ -76,6 +75,40 @@ struct async_submit_bio {
76 struct btrfs_work work; 75 struct btrfs_work work;
77}; 76};
78 77
78/* These are used to set the lockdep class on the extent buffer locks.
79 * The class is set by the readpage_end_io_hook after the buffer has
80 * passed csum validation but before the pages are unlocked.
81 *
82 * The lockdep class is also set by btrfs_init_new_buffer on freshly
83 * allocated blocks.
84 *
85 * The class is based on the level in the tree block, which allows lockdep
86 * to know that lower nodes nest inside the locks of higher nodes.
87 *
88 * We also add a check to make sure the highest level of the tree is
89 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
90 * code needs update as well.
91 */
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93# if BTRFS_MAX_LEVEL != 8
94# error
95# endif
96static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
97static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
98 /* leaf */
99 "btrfs-extent-00",
100 "btrfs-extent-01",
101 "btrfs-extent-02",
102 "btrfs-extent-03",
103 "btrfs-extent-04",
104 "btrfs-extent-05",
105 "btrfs-extent-06",
106 "btrfs-extent-07",
107 /* highest possible level */
108 "btrfs-extent-08",
109};
110#endif
111
79/* 112/*
80 * extents on the btree inode are pretty simple, there's one extent 113 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device 114 * that covers the entire device
@@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
348 return ret; 381 return ret;
349} 382}
350 383
384#ifdef CONFIG_DEBUG_LOCK_ALLOC
385void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
386{
387 lockdep_set_class_and_name(&eb->lock,
388 &btrfs_eb_class[level],
389 btrfs_eb_name[level]);
390}
391#endif
392
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 393static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state) 394 struct extent_state *state)
353{ 395{
@@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
393 } 435 }
394 found_level = btrfs_header_level(eb); 436 found_level = btrfs_header_level(eb);
395 437
438 btrfs_set_buffer_lockdep_class(eb, found_level);
439
396 ret = csum_tree_block(root, eb, 1); 440 ret = csum_tree_block(root, eb, 1);
397 if (ret) 441 if (ret)
398 ret = -EIO; 442 ret = -EIO;
@@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 844 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801 845
802 if (ret == 0) 846 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE; 847 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
804 else 848 else
805 WARN_ON(1); 849 WARN_ON(1);
806 return buf; 850 return buf;
@@ -813,7 +857,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
813 struct inode *btree_inode = root->fs_info->btree_inode; 857 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) == 858 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) { 859 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf)); 860 btrfs_assert_tree_locked(buf);
861
862 /* ugh, clear_extent_buffer_dirty can be expensive */
863 btrfs_set_lock_blocking(buf);
864
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf); 866 buf);
819 } 867 }
@@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
850 spin_lock_init(&root->list_lock); 898 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex); 899 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex); 900 mutex_init(&root->log_mutex);
901 init_waitqueue_head(&root->log_writer_wait);
902 init_waitqueue_head(&root->log_commit_wait[0]);
903 init_waitqueue_head(&root->log_commit_wait[1]);
904 atomic_set(&root->log_commit[0], 0);
905 atomic_set(&root->log_commit[1], 0);
906 atomic_set(&root->log_writers, 0);
907 root->log_batch = 0;
908 root->log_transid = 0;
853 extent_io_tree_init(&root->dirty_log_pages, 909 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS); 910 fs_info->btree_inode->i_mapping, GFP_NOFS);
855 911
@@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
934 return 0; 990 return 0;
935} 991}
936 992
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 993static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info) 994 struct btrfs_fs_info *fs_info)
939{ 995{
940 struct btrfs_root *root; 996 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root; 997 struct btrfs_root *tree_root = fs_info->tree_root;
998 struct extent_buffer *leaf;
942 999
943 root = kzalloc(sizeof(*root), GFP_NOFS); 1000 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root) 1001 if (!root)
945 return -ENOMEM; 1002 return ERR_PTR(-ENOMEM);
946 1003
947 __setup_root(tree_root->nodesize, tree_root->leafsize, 1004 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize, 1005 tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1008 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1009 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1010 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1011 /*
1012 * log trees do not get reference counted because they go away
1013 * before a real commit is actually done. They do store pointers
1014 * to file data extents, and those reference counts still get
1015 * updated (along with back refs to the log tree).
1016 */
954 root->ref_cows = 0; 1017 root->ref_cows = 0;
955 1018
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize, 1019 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID, 1020 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0); 1021 trans->transid, 0, 0, 0);
1022 if (IS_ERR(leaf)) {
1023 kfree(root);
1024 return ERR_CAST(leaf);
1025 }
959 1026
1027 root->node = leaf;
960 btrfs_set_header_nritems(root->node, 0); 1028 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0); 1029 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start); 1030 btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
968 BTRFS_FSID_SIZE); 1036 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node); 1037 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node); 1038 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root; 1039 return root;
1040}
1041
1042int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1043 struct btrfs_fs_info *fs_info)
1044{
1045 struct btrfs_root *log_root;
1046
1047 log_root = alloc_log_tree(trans, fs_info);
1048 if (IS_ERR(log_root))
1049 return PTR_ERR(log_root);
1050 WARN_ON(fs_info->log_root_tree);
1051 fs_info->log_root_tree = log_root;
1052 return 0;
1053}
1054
1055int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1056 struct btrfs_root *root)
1057{
1058 struct btrfs_root *log_root;
1059 struct btrfs_inode_item *inode_item;
1060
1061 log_root = alloc_log_tree(trans, root->fs_info);
1062 if (IS_ERR(log_root))
1063 return PTR_ERR(log_root);
1064
1065 log_root->last_trans = trans->transid;
1066 log_root->root_key.offset = root->root_key.objectid;
1067
1068 inode_item = &log_root->root_item.inode;
1069 inode_item->generation = cpu_to_le64(1);
1070 inode_item->size = cpu_to_le64(3);
1071 inode_item->nlink = cpu_to_le32(1);
1072 inode_item->nbytes = cpu_to_le64(root->leafsize);
1073 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1074
1075 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
1076 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1077
1078 WARN_ON(root->log_root);
1079 root->log_root = log_root;
1080 root->log_transid = 0;
972 return 0; 1081 return 0;
973} 1082}
974 1083
@@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{ 1245{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1246 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0; 1247 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device; 1248 struct btrfs_device *device;
1141 struct backing_dev_info *bdi; 1249 struct backing_dev_info *bdi;
1142#if 0 1250#if 0
@@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1144 btrfs_congested_async(info, 0)) 1252 btrfs_congested_async(info, 0))
1145 return 1; 1253 return 1;
1146#endif 1254#endif
1147 list_for_each(cur, &info->fs_devices->devices) { 1255 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev) 1256 if (!device->bdev)
1150 continue; 1257 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev); 1258 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1163 */ 1270 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1271static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{ 1272{
1166 struct list_head *cur;
1167 struct btrfs_device *device; 1273 struct btrfs_device *device;
1168 struct btrfs_fs_info *info; 1274 struct btrfs_fs_info *info;
1169 1275
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1276 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) { 1277 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev) 1278 if (!device->bdev)
1174 continue; 1279 continue;
1175 1280
@@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1447 INIT_LIST_HEAD(&fs_info->dead_roots); 1552 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers); 1553 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock); 1555 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock); 1556 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock); 1557 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1535 init_waitqueue_head(&fs_info->transaction_throttle); 1639 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait); 1640 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait); 1641 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542 1642
1543 __setup_root(4096, 4096, 4096, 4096, tree_root, 1643 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1644 fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 * low idle thresh 1727 * low idle thresh
1628 */ 1728 */
1629 fs_info->endio_workers.idle_thresh = 4; 1729 fs_info->endio_workers.idle_thresh = 4;
1730 fs_info->endio_meta_workers.idle_thresh = 4;
1731
1630 fs_info->endio_write_workers.idle_thresh = 64; 1732 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64; 1733 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632 1734
@@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1720 ret = find_and_setup_root(tree_root, fs_info, 1822 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root); 1823 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1; 1824 dev_root->track_dirty = 1;
1723
1724 if (ret) 1825 if (ret)
1725 goto fail_extent_root; 1826 goto fail_extent_root;
1726 1827
@@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1841 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1842 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner"); 1843 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread) 1844 if (IS_ERR(fs_info->cleaner_kthread))
1744 goto fail_csum_root; 1845 goto fail_csum_root;
1745 1846
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1847 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root, 1848 tree_root,
1748 "btrfs-transaction"); 1849 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread) 1850 if (IS_ERR(fs_info->transaction_kthread))
1750 goto fail_cleaner; 1851 goto fail_cleaner;
1751 1852
1752 if (btrfs_super_log_root(disk_super) != 0) { 1853 if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1929,14 @@ fail_sb_buffer:
1828fail_iput: 1929fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1930 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode); 1931 iput(fs_info->btree_inode);
1831fail: 1932
1832 btrfs_close_devices(fs_info->fs_devices); 1933 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1934 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1935 bdi_destroy(&fs_info->bdi);
1834 1936
1937fail:
1835 kfree(extent_root); 1938 kfree(extent_root);
1836 kfree(tree_root); 1939 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info); 1940 kfree(fs_info);
1839 kfree(chunk_root); 1941 kfree(chunk_root);
1840 kfree(dev_root); 1942 kfree(dev_root);
@@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device,
1995 2097
1996int write_all_supers(struct btrfs_root *root, int max_mirrors) 2098int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{ 2099{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices; 2100 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev; 2101 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb; 2102 struct btrfs_super_block *sb;
@@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2011 2112
2012 sb = &root->fs_info->super_for_commit; 2113 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item; 2114 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) { 2115 list_for_each_entry(dev, head, dev_list) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) { 2116 if (!dev->bdev) {
2017 total_errors++; 2117 total_errors++;
2018 continue; 2118 continue;
@@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2045 } 2145 }
2046 2146
2047 total_errors = 0; 2147 total_errors = 0;
2048 list_for_each(cur, head) { 2148 list_for_each_entry(dev, head, dev_list) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev) 2149 if (!dev->bdev)
2051 continue; 2150 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable) 2151 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,7 +2359,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2260 u64 transid = btrfs_header_generation(buf); 2359 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode; 2360 struct inode *btree_inode = root->fs_info->btree_inode;
2262 2361
2263 WARN_ON(!btrfs_tree_locked(buf)); 2362 btrfs_set_lock_blocking(buf);
2363
2364 btrfs_assert_tree_locked(buf);
2264 if (transid != root->fs_info->generation) { 2365 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2366 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n", 2367 "found %llu running %llu\n",
@@ -2284,7 +2385,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2284 unsigned long thresh = 32 * 1024 * 1024; 2385 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 2386 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286 2387
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC) 2388 if (current->flags & PF_MEMALLOC)
2288 return; 2389 return;
2289 2390
2290 num_dirty = count_range_bits(tree, &start, (u64)-1, 2391 num_dirty = count_range_bits(tree, &start, (u64)-1,
@@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2302 int ret; 2403 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2404 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0) 2405 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE; 2406 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2306 return ret; 2407 return ret;
2307} 2408}
2308 2409
2309int btree_lock_page_hook(struct page *page) 2410int btree_lock_page_hook(struct page *page)
2310{ 2411{
2311 struct inode *inode = page->mapping->host; 2412 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb; 2414 struct extent_buffer *eb;
2315 unsigned long len; 2415 unsigned long len;
@@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page)
2324 goto out; 2424 goto out;
2325 2425
2326 btrfs_tree_lock(eb); 2426 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb); 2428 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb); 2429 free_extent_buffer(eb);
2332out: 2430out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b7..95029db227be 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info); 98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root);
101int btree_lock_page_hook(struct page *page); 103int btree_lock_page_hook(struct page *page);
104
105
106#ifdef CONFIG_DEBUG_LOCK_ALLOC
107void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
108#else
109static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
110 int level)
111{
112}
113#endif
102#endif 114#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..fefe83ad2059 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,8 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h>
23#include "compat.h" 24#include "compat.h"
24#include "hash.h" 25#include "hash.h"
25#include "crc32c.h" 26#include "crc32c.h"
@@ -30,7 +31,6 @@
30#include "volumes.h" 31#include "volumes.h"
31#include "locking.h" 32#include "locking.h"
32#include "ref-cache.h" 33#include "ref-cache.h"
33#include "compat.h"
34 34
35#define PENDING_EXTENT_INSERT 0 35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 36#define PENDING_EXTENT_DELETE 1
@@ -61,6 +61,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
61 u64 bytenr, u64 num_bytes, int alloc, 61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 62 int mark_free);
63 63
64static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes,
66 u64 flags, int force);
67
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 68static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 69{
66 return (cache->flags & bits) == bits; 70 return (cache->flags & bits) == bits;
@@ -326,16 +330,34 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 330 u64 flags)
327{ 331{
328 struct list_head *head = &info->space_info; 332 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 333 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 334
332 found = list_entry(cur, struct btrfs_space_info, list); 335 rcu_read_lock();
333 if (found->flags == flags) 336 list_for_each_entry_rcu(found, head, list) {
337 if (found->flags == flags) {
338 rcu_read_unlock();
334 return found; 339 return found;
340 }
335 } 341 }
342 rcu_read_unlock();
336 return NULL; 343 return NULL;
337} 344}
338 345
346/*
347 * after adding space to the filesystem, we need to clear the full flags
348 * on all the space infos.
349 */
350void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
351{
352 struct list_head *head = &info->space_info;
353 struct btrfs_space_info *found;
354
355 rcu_read_lock();
356 list_for_each_entry_rcu(found, head, list)
357 found->full = 0;
358 rcu_read_unlock();
359}
360
339static u64 div_factor(u64 num, int factor) 361static u64 div_factor(u64 num, int factor)
340{ 362{
341 if (factor == 10) 363 if (factor == 10)
@@ -1326,8 +1348,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root) 1349 struct btrfs_root *root)
1328{ 1350{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1); 1351 u64 start;
1330 del_pending_extents(trans, root->fs_info->extent_root, 1); 1352 u64 end;
1353 int ret;
1354
1355 while(1) {
1356 finish_current_insert(trans, root->fs_info->extent_root, 1);
1357 del_pending_extents(trans, root->fs_info->extent_root, 1);
1358
1359 /* is there more work to do? */
1360 ret = find_first_extent_bit(&root->fs_info->pending_del,
1361 0, &start, &end, EXTENT_WRITEBACK);
1362 if (!ret)
1363 continue;
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins,
1365 0, &start, &end, EXTENT_WRITEBACK);
1366 if (!ret)
1367 continue;
1368 break;
1369 }
1331 return 0; 1370 return 0;
1332} 1371}
1333 1372
@@ -1525,15 +1564,55 @@ out:
1525 return ret; 1564 return ret;
1526} 1565}
1527 1566
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1567/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1568 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1569 * can be in just about any order, and it is likely to have clusters of
1570 * things that are close together and clusters of things that are not.
1571 *
1572 * To help reduce the seeks that come with updating all of these reference
1573 * counts, sort them by byte number before actual updates are done.
1574 *
1575 * struct refsort is used to match byte number to slot in the btree block.
1576 * we sort based on the byte number and then use the slot to actually
1577 * find the item.
1578 *
1579 * struct refsort is smaller than strcut btrfs_item and smaller than
1580 * struct btrfs_key_ptr. Since we're currently limited to the page size
1581 * for a btree block, there's no way for a kmalloc of refsorts for a
1582 * single node to be bigger than a page.
1583 */
1584struct refsort {
1585 u64 bytenr;
1586 u32 slot;
1587};
1588
1589/*
1590 * for passing into sort()
1591 */
1592static int refsort_cmp(const void *a_void, const void *b_void)
1593{
1594 const struct refsort *a = a_void;
1595 const struct refsort *b = b_void;
1596
1597 if (a->bytenr < b->bytenr)
1598 return -1;
1599 if (a->bytenr > b->bytenr)
1600 return 1;
1601 return 0;
1602}
1603
1604
1605noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1606 struct btrfs_root *root,
1607 struct extent_buffer *orig_buf,
1608 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1609{
1532 u64 bytenr; 1610 u64 bytenr;
1533 u64 ref_root; 1611 u64 ref_root;
1534 u64 orig_root; 1612 u64 orig_root;
1535 u64 ref_generation; 1613 u64 ref_generation;
1536 u64 orig_generation; 1614 u64 orig_generation;
1615 struct refsort *sorted;
1537 u32 nritems; 1616 u32 nritems;
1538 u32 nr_file_extents = 0; 1617 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1618 struct btrfs_key key;
@@ -1542,6 +1621,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1621 int level;
1543 int ret = 0; 1622 int ret = 0;
1544 int faili = 0; 1623 int faili = 0;
1624 int refi = 0;
1625 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1627 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1628
@@ -1553,6 +1634,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1634 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1635 level = btrfs_header_level(buf);
1555 1636
1637 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1638 BUG_ON(!sorted);
1639
1556 if (root->ref_cows) { 1640 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1641 process_func = __btrfs_inc_extent_ref;
1558 } else { 1642 } else {
@@ -1565,6 +1649,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1649 process_func = __btrfs_update_extent_ref;
1566 } 1650 }
1567 1651
1652 /*
1653 * we make two passes through the items. In the first pass we
1654 * only record the byte number and slot. Then we sort based on
1655 * byte number and do the actual work based on the sorted results
1656 */
1568 for (i = 0; i < nritems; i++) { 1657 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1658 cond_resched();
1570 if (level == 0) { 1659 if (level == 0) {
@@ -1581,6 +1670,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1670 continue;
1582 1671
1583 nr_file_extents++; 1672 nr_file_extents++;
1673 sorted[refi].bytenr = bytenr;
1674 sorted[refi].slot = i;
1675 refi++;
1676 } else {
1677 bytenr = btrfs_node_blockptr(buf, i);
1678 sorted[refi].bytenr = bytenr;
1679 sorted[refi].slot = i;
1680 refi++;
1681 }
1682 }
1683 /*
1684 * if refi == 0, we didn't actually put anything into the sorted
1685 * array and we're done
1686 */
1687 if (refi == 0)
1688 goto out;
1689
1690 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1691
1692 for (i = 0; i < refi; i++) {
1693 cond_resched();
1694 slot = sorted[i].slot;
1695 bytenr = sorted[i].bytenr;
1696
1697 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1699
1585 ret = process_func(trans, root, bytenr, 1700 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1701 orig_buf->start, buf->start,
@@ -1589,25 +1704,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1704 key.objectid);
1590 1705
1591 if (ret) { 1706 if (ret) {
1592 faili = i; 1707 faili = slot;
1593 WARN_ON(1); 1708 WARN_ON(1);
1594 goto fail; 1709 goto fail;
1595 } 1710 }
1596 } else { 1711 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1712 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1713 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1714 orig_root, ref_root,
1601 orig_generation, ref_generation, 1715 orig_generation, ref_generation,
1602 level - 1); 1716 level - 1);
1603 if (ret) { 1717 if (ret) {
1604 faili = i; 1718 faili = slot;
1605 WARN_ON(1); 1719 WARN_ON(1);
1606 goto fail; 1720 goto fail;
1607 } 1721 }
1608 } 1722 }
1609 } 1723 }
1610out: 1724out:
1725 kfree(sorted);
1611 if (nr_extents) { 1726 if (nr_extents) {
1612 if (level == 0) 1727 if (level == 0)
1613 *nr_extents = nr_file_extents; 1728 *nr_extents = nr_file_extents;
@@ -1616,6 +1731,7 @@ out:
1616 } 1731 }
1617 return 0; 1732 return 0;
1618fail: 1733fail:
1734 kfree(sorted);
1619 WARN_ON(1); 1735 WARN_ON(1);
1620 return ret; 1736 return ret;
1621} 1737}
@@ -1808,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1808 if (!found) 1924 if (!found)
1809 return -ENOMEM; 1925 return -ENOMEM;
1810 1926
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups); 1927 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem); 1928 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock); 1929 spin_lock_init(&found->lock);
@@ -1818,9 +1933,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1818 found->bytes_pinned = 0; 1933 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0; 1934 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0; 1935 found->bytes_readonly = 0;
1936 found->bytes_delalloc = 0;
1821 found->full = 0; 1937 found->full = 0;
1822 found->force_alloc = 0; 1938 found->force_alloc = 0;
1823 *space_info = found; 1939 *space_info = found;
1940 list_add_rcu(&found->list, &info->space_info);
1824 return 0; 1941 return 0;
1825} 1942}
1826 1943
@@ -1881,6 +1998,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1881 return flags; 1998 return flags;
1882} 1999}
1883 2000
2001static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2002{
2003 struct btrfs_fs_info *info = root->fs_info;
2004 u64 alloc_profile;
2005
2006 if (data) {
2007 alloc_profile = info->avail_data_alloc_bits &
2008 info->data_alloc_profile;
2009 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2010 } else if (root == root->fs_info->chunk_root) {
2011 alloc_profile = info->avail_system_alloc_bits &
2012 info->system_alloc_profile;
2013 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2014 } else {
2015 alloc_profile = info->avail_metadata_alloc_bits &
2016 info->metadata_alloc_profile;
2017 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2018 }
2019
2020 return btrfs_reduce_alloc_profile(root, data);
2021}
2022
2023void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2024{
2025 u64 alloc_target;
2026
2027 alloc_target = btrfs_get_alloc_profile(root, 1);
2028 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2029 alloc_target);
2030}
2031
2032/*
2033 * for now this just makes sure we have at least 5% of our metadata space free
2034 * for use.
2035 */
2036int btrfs_check_metadata_free_space(struct btrfs_root *root)
2037{
2038 struct btrfs_fs_info *info = root->fs_info;
2039 struct btrfs_space_info *meta_sinfo;
2040 u64 alloc_target, thresh;
2041 int committed = 0, ret;
2042
2043 /* get the space info for where the metadata will live */
2044 alloc_target = btrfs_get_alloc_profile(root, 0);
2045 meta_sinfo = __find_space_info(info, alloc_target);
2046
2047again:
2048 spin_lock(&meta_sinfo->lock);
2049 if (!meta_sinfo->full)
2050 thresh = meta_sinfo->total_bytes * 80;
2051 else
2052 thresh = meta_sinfo->total_bytes * 95;
2053
2054 do_div(thresh, 100);
2055
2056 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2057 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2058 struct btrfs_trans_handle *trans;
2059 if (!meta_sinfo->full) {
2060 meta_sinfo->force_alloc = 1;
2061 spin_unlock(&meta_sinfo->lock);
2062
2063 trans = btrfs_start_transaction(root, 1);
2064 if (!trans)
2065 return -ENOMEM;
2066
2067 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2068 2 * 1024 * 1024, alloc_target, 0);
2069 btrfs_end_transaction(trans, root);
2070 goto again;
2071 }
2072 spin_unlock(&meta_sinfo->lock);
2073
2074 if (!committed) {
2075 committed = 1;
2076 trans = btrfs_join_transaction(root, 1);
2077 if (!trans)
2078 return -ENOMEM;
2079 ret = btrfs_commit_transaction(trans, root);
2080 if (ret)
2081 return ret;
2082 goto again;
2083 }
2084 return -ENOSPC;
2085 }
2086 spin_unlock(&meta_sinfo->lock);
2087
2088 return 0;
2089}
2090
2091/*
2092 * This will check the space that the inode allocates from to make sure we have
2093 * enough space for bytes.
2094 */
2095int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2096 u64 bytes)
2097{
2098 struct btrfs_space_info *data_sinfo;
2099 int ret = 0, committed = 0;
2100
2101 /* make sure bytes are sectorsize aligned */
2102 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2103
2104 data_sinfo = BTRFS_I(inode)->space_info;
2105again:
2106 /* make sure we have enough space to handle the data first */
2107 spin_lock(&data_sinfo->lock);
2108 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2109 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2110 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2111 data_sinfo->bytes_may_use < bytes) {
2112 struct btrfs_trans_handle *trans;
2113
2114 /*
2115 * if we don't have enough free bytes in this space then we need
2116 * to alloc a new chunk.
2117 */
2118 if (!data_sinfo->full) {
2119 u64 alloc_target;
2120
2121 data_sinfo->force_alloc = 1;
2122 spin_unlock(&data_sinfo->lock);
2123
2124 alloc_target = btrfs_get_alloc_profile(root, 1);
2125 trans = btrfs_start_transaction(root, 1);
2126 if (!trans)
2127 return -ENOMEM;
2128
2129 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2130 bytes + 2 * 1024 * 1024,
2131 alloc_target, 0);
2132 btrfs_end_transaction(trans, root);
2133 if (ret)
2134 return ret;
2135 goto again;
2136 }
2137 spin_unlock(&data_sinfo->lock);
2138
2139 /* commit the current transaction and try again */
2140 if (!committed) {
2141 committed = 1;
2142 trans = btrfs_join_transaction(root, 1);
2143 if (!trans)
2144 return -ENOMEM;
2145 ret = btrfs_commit_transaction(trans, root);
2146 if (ret)
2147 return ret;
2148 goto again;
2149 }
2150
2151 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2152 ", %llu bytes_used, %llu bytes_reserved, "
2153 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2154 "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2155 data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2156 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2157 data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2158 return -ENOSPC;
2159 }
2160 data_sinfo->bytes_may_use += bytes;
2161 BTRFS_I(inode)->reserved_bytes += bytes;
2162 spin_unlock(&data_sinfo->lock);
2163
2164 return btrfs_check_metadata_free_space(root);
2165}
2166
2167/*
2168 * if there was an error for whatever reason after calling
2169 * btrfs_check_data_free_space, call this so we can cleanup the counters.
2170 */
2171void btrfs_free_reserved_data_space(struct btrfs_root *root,
2172 struct inode *inode, u64 bytes)
2173{
2174 struct btrfs_space_info *data_sinfo;
2175
2176 /* make sure bytes are sectorsize aligned */
2177 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2178
2179 data_sinfo = BTRFS_I(inode)->space_info;
2180 spin_lock(&data_sinfo->lock);
2181 data_sinfo->bytes_may_use -= bytes;
2182 BTRFS_I(inode)->reserved_bytes -= bytes;
2183 spin_unlock(&data_sinfo->lock);
2184}
2185
2186/* called when we are adding a delalloc extent to the inode's io_tree */
2187void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2188 u64 bytes)
2189{
2190 struct btrfs_space_info *data_sinfo;
2191
2192 /* get the space info for where this inode will be storing its data */
2193 data_sinfo = BTRFS_I(inode)->space_info;
2194
2195 /* make sure we have enough space to handle the data first */
2196 spin_lock(&data_sinfo->lock);
2197 data_sinfo->bytes_delalloc += bytes;
2198
2199 /*
2200 * we are adding a delalloc extent without calling
2201 * btrfs_check_data_free_space first. This happens on a weird
2202 * writepage condition, but shouldn't hurt our accounting
2203 */
2204 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2205 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2206 BTRFS_I(inode)->reserved_bytes = 0;
2207 } else {
2208 data_sinfo->bytes_may_use -= bytes;
2209 BTRFS_I(inode)->reserved_bytes -= bytes;
2210 }
2211
2212 spin_unlock(&data_sinfo->lock);
2213}
2214
2215/* called when we are clearing an delalloc extent from the inode's io_tree */
2216void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2217 u64 bytes)
2218{
2219 struct btrfs_space_info *info;
2220
2221 info = BTRFS_I(inode)->space_info;
2222
2223 spin_lock(&info->lock);
2224 info->bytes_delalloc -= bytes;
2225 spin_unlock(&info->lock);
2226}
2227
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans, 2228static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes, 2229 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force) 2230 u64 flags, int force)
@@ -2137,13 +2481,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
2137 u64 end; 2481 u64 end;
2138 u64 priv; 2482 u64 priv;
2139 u64 search = 0; 2483 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info; 2484 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path; 2485 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp; 2486 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list; 2487 struct list_head insert_list, update_list;
2145 int ret; 2488 int ret;
2146 int num_inserts = 0, max_inserts; 2489 int num_inserts = 0, max_inserts, restart = 0;
2147 2490
2148 path = btrfs_alloc_path(); 2491 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list); 2492 INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2502,19 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2503 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2504 if (ret) {
2162 if (skipped && all && !num_inserts) { 2505 if (restart && !num_inserts &&
2163 skipped = 0; 2506 list_empty(&update_list)) {
2507 restart = 0;
2164 search = 0; 2508 search = 0;
2165 continue; 2509 continue;
2166 } 2510 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break; 2511 break;
2169 } 2512 }
2170 2513
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); 2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) { 2515 if (!ret) {
2173 skipped = 1; 2516 if (all)
2517 restart = 1;
2174 search = end + 1; 2518 search = end + 1;
2175 if (need_resched()) { 2519 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex); 2520 mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2533,7 @@ again:
2189 list_add_tail(&extent_op->list, &insert_list); 2533 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1; 2534 search = end + 1;
2191 if (num_inserts == max_inserts) { 2535 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex); 2536 restart = 1;
2193 break; 2537 break;
2194 } 2538 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) { 2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2549,6 @@ again:
2205 * somebody marked this thing for deletion then just unlock it and be 2549 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it 2550 * done, the free_extents will handle it
2207 */ 2551 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) { 2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr, 2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1, 2554 extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2570,10 @@ again:
2227 if (!list_empty(&update_list)) { 2570 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list); 2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret); 2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so lets start over */
2575 if (all)
2576 restart = 1;
2230 } 2577 }
2231 2578
2232 /* 2579 /*
@@ -2234,9 +2581,9 @@ again:
2234 * need to make sure everything is cleaned then reset everything and 2581 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning 2582 * go back to the beginning
2236 */ 2583 */
2237 if (!num_inserts && all && skipped) { 2584 if (!num_inserts && restart) {
2238 search = 0; 2585 search = 0;
2239 skipped = 0; 2586 restart = 0;
2240 INIT_LIST_HEAD(&update_list); 2587 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list); 2588 INIT_LIST_HEAD(&insert_list);
2242 goto again; 2589 goto again;
@@ -2293,27 +2640,19 @@ again:
2293 BUG_ON(ret); 2640 BUG_ON(ret);
2294 2641
2295 /* 2642 /*
2296 * if we broke out of the loop in order to insert stuff because we hit 2643 * if restart is set for whatever reason we need to go back and start
2297 * the maximum number of inserts at a time we can handle, then loop 2644 * searching through the pending list again.
2298 * back and pick up where we left off 2645 *
2299 */ 2646 * We just inserted some extents, which could have resulted in new
2300 if (num_inserts == max_inserts) { 2647 * blocks being allocated, which would result in new blocks needing
2301 INIT_LIST_HEAD(&insert_list); 2648 * updates, so if all is set we _must_ restart to get the updated
2302 INIT_LIST_HEAD(&update_list); 2649 * blocks.
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */ 2650 */
2312 if (all && skipped) { 2651 if (restart || all) {
2313 INIT_LIST_HEAD(&insert_list); 2652 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list); 2653 INIT_LIST_HEAD(&update_list);
2315 search = 0; 2654 search = 0;
2316 skipped = 0; 2655 restart = 0;
2317 num_inserts = 0; 2656 num_inserts = 0;
2318 goto again; 2657 goto again;
2319 } 2658 }
@@ -2547,6 +2886,7 @@ again:
2547 if (ret) { 2886 if (ret) {
2548 if (all && skipped && !nr) { 2887 if (all && skipped && !nr) {
2549 search = 0; 2888 search = 0;
2889 skipped = 0;
2550 continue; 2890 continue;
2551 } 2891 }
2552 mutex_unlock(&info->extent_ins_mutex); 2892 mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2973,8 @@ again:
2633 goto again; 2973 goto again;
2634 } 2974 }
2635 2975
2976 if (!err)
2977 finish_current_insert(trans, extent_root, 0);
2636 return err; 2978 return err;
2637} 2979}
2638 2980
@@ -2700,13 +3042,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 3042 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 3045 mutex_lock(&root->fs_info->pinned_mutex);
2704 3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 3047 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 3048 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 3049 return 0;
2712 } 3050 }
@@ -2787,7 +3125,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2787 3125
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) { 3126 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc; 3127 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024; 3128 if (!btrfs_test_opt(root, SSD))
3129 empty_cluster = 64 * 1024;
2791 } 3130 }
2792 3131
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 3132 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +3353,18 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3353static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3354{
3016 struct btrfs_block_group_cache *cache; 3355 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3356
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3357 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3358 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved), 3359 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not "); 3360 (info->full) ? "" : "not ");
3361 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3362 " may_use=%llu, used=%llu\n", info->total_bytes,
3363 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3364 info->bytes_used);
3023 3365
3024 down_read(&info->groups_sem); 3366 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3367 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3368 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3369 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3370 "%llu pinned %llu reserved\n",
@@ -3047,24 +3388,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3047{ 3388{
3048 int ret; 3389 int ret;
3049 u64 search_start = 0; 3390 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info; 3391 struct btrfs_fs_info *info = root->fs_info;
3052 3392
3053 if (data) { 3393 data = btrfs_get_alloc_profile(root, data);
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again: 3394again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /* 3395 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which 3396 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations 3397 * is not called recursively on allocations
@@ -3332,7 +3659,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3332 3659
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3660struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, 3661 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize) 3662 u64 bytenr, u32 blocksize,
3663 int level)
3336{ 3664{
3337 struct extent_buffer *buf; 3665 struct extent_buffer *buf;
3338 3666
@@ -3340,9 +3668,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3340 if (!buf) 3668 if (!buf)
3341 return ERR_PTR(-ENOMEM); 3669 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid); 3670 btrfs_set_header_generation(buf, trans->transid);
3671 btrfs_set_buffer_lockdep_class(buf, level);
3343 btrfs_tree_lock(buf); 3672 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3673 clean_tree_block(trans, root, buf);
3674
3675 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3676 btrfs_set_buffer_uptodate(buf);
3677
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3678 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3679 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3680 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3683,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3683 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3684 }
3353 trans->blocks_used++; 3685 trans->blocks_used++;
3686 /* this returns a buffer locked for blocking */
3354 return buf; 3687 return buf;
3355} 3688}
3356 3689
@@ -3379,7 +3712,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3379 return ERR_PTR(ret); 3712 return ERR_PTR(ret);
3380 } 3713 }
3381 3714
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3715 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3716 blocksize, level);
3383 return buf; 3717 return buf;
3384} 3718}
3385 3719
@@ -3388,36 +3722,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3722{
3389 u64 leaf_owner; 3723 u64 leaf_owner;
3390 u64 leaf_generation; 3724 u64 leaf_generation;
3725 struct refsort *sorted;
3391 struct btrfs_key key; 3726 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3727 struct btrfs_file_extent_item *fi;
3393 int i; 3728 int i;
3394 int nritems; 3729 int nritems;
3395 int ret; 3730 int ret;
3731 int refi = 0;
3732 int slot;
3396 3733
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3734 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3735 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3736 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3737 leaf_generation = btrfs_header_generation(leaf);
3401 3738
3739 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3740 /* we do this loop twice. The first time we build a list
3741 * of the extents we have a reference on, then we sort the list
3742 * by bytenr. The second time around we actually do the
3743 * extent freeing.
3744 */
3402 for (i = 0; i < nritems; i++) { 3745 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3746 u64 disk_bytenr;
3404 cond_resched(); 3747 cond_resched();
3405 3748
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3749 btrfs_item_key_to_cpu(leaf, &key, i);
3750
3751 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3752 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3753 continue;
3754
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3755 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3756
3757 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3758 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3759 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3760 continue;
3413 /* 3761
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3762 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3763
3764 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3765 if (disk_bytenr == 0)
3419 continue; 3766 continue;
3420 3767
3768 sorted[refi].bytenr = disk_bytenr;
3769 sorted[refi].slot = i;
3770 refi++;
3771 }
3772
3773 if (refi == 0)
3774 goto out;
3775
3776 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3777
3778 for (i = 0; i < refi; i++) {
3779 u64 disk_bytenr;
3780
3781 disk_bytenr = sorted[i].bytenr;
3782 slot = sorted[i].slot;
3783
3784 cond_resched();
3785
3786 btrfs_item_key_to_cpu(leaf, &key, slot);
3787 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3788 continue;
3789
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3792 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3793 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3794 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3799,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3799 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3800 cond_resched();
3430 } 3801 }
3802out:
3803 kfree(sorted);
3431 return 0; 3804 return 0;
3432} 3805}
3433 3806
@@ -3437,9 +3810,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3810{
3438 int i; 3811 int i;
3439 int ret; 3812 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3813 struct btrfs_extent_info *info;
3814 struct refsort *sorted;
3815
3816 if (ref->nritems == 0)
3817 return 0;
3441 3818
3819 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3820 for (i = 0; i < ref->nritems; i++) {
3821 sorted[i].bytenr = ref->extents[i].bytenr;
3822 sorted[i].slot = i;
3823 }
3824 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3825
3826 /*
3827 * the items in the ref were sorted when the ref was inserted
3828 * into the ref cache, so this is already in order
3829 */
3830 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3832 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3833 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3834 ref->owner, ref->generation,
@@ -3453,6 +3842,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3842 info++;
3454 } 3843 }
3455 3844
3845 kfree(sorted);
3456 return 0; 3846 return 0;
3457} 3847}
3458 3848
@@ -3497,6 +3887,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3887}
3498 3888
3499/* 3889/*
3890 * this is used while deleting old snapshots, and it drops the refs
3891 * on a whole subtree starting from a level 1 node.
3892 *
3893 * The idea is to sort all the leaf pointers, and then drop the
3894 * ref on all the leaves in order. Most of the time the leaves
3895 * will have ref cache entries, so no leaf IOs will be required to
3896 * find the extents they have references on.
3897 *
3898 * For each leaf, any references it has are also dropped in order
3899 *
3900 * This ends up dropping the references in something close to optimal
3901 * order for reading and modifying the extent allocation tree.
3902 */
3903static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3904 struct btrfs_root *root,
3905 struct btrfs_path *path)
3906{
3907 u64 bytenr;
3908 u64 root_owner;
3909 u64 root_gen;
3910 struct extent_buffer *eb = path->nodes[1];
3911 struct extent_buffer *leaf;
3912 struct btrfs_leaf_ref *ref;
3913 struct refsort *sorted = NULL;
3914 int nritems = btrfs_header_nritems(eb);
3915 int ret;
3916 int i;
3917 int refi = 0;
3918 int slot = path->slots[1];
3919 u32 blocksize = btrfs_level_size(root, 0);
3920 u32 refs;
3921
3922 if (nritems == 0)
3923 goto out;
3924
3925 root_owner = btrfs_header_owner(eb);
3926 root_gen = btrfs_header_generation(eb);
3927 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3928
3929 /*
3930 * step one, sort all the leaf pointers so we don't scribble
3931 * randomly into the extent allocation tree
3932 */
3933 for (i = slot; i < nritems; i++) {
3934 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3935 sorted[refi].slot = i;
3936 refi++;
3937 }
3938
3939 /*
3940 * nritems won't be zero, but if we're picking up drop_snapshot
3941 * after a crash, slot might be > 0, so double check things
3942 * just in case.
3943 */
3944 if (refi == 0)
3945 goto out;
3946
3947 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3948
3949 /*
3950 * the first loop frees everything the leaves point to
3951 */
3952 for (i = 0; i < refi; i++) {
3953 u64 ptr_gen;
3954
3955 bytenr = sorted[i].bytenr;
3956
3957 /*
3958 * check the reference count on this leaf. If it is > 1
3959 * we just decrement it below and don't update any
3960 * of the refs the leaf points to.
3961 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3963 BUG_ON(ret);
3964 if (refs != 1)
3965 continue;
3966
3967 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3968
3969 /*
3970 * the leaf only had one reference, which means the
3971 * only thing pointing to this leaf is the snapshot
3972 * we're deleting. It isn't possible for the reference
3973 * count to increase again later
3974 *
3975 * The reference cache is checked for the leaf,
3976 * and if found we'll be able to drop any refs held by
3977 * the leaf without needing to read it in.
3978 */
3979 ref = btrfs_lookup_leaf_ref(root, bytenr);
3980 if (ref && ref->generation != ptr_gen) {
3981 btrfs_free_leaf_ref(root, ref);
3982 ref = NULL;
3983 }
3984 if (ref) {
3985 ret = cache_drop_leaf_ref(trans, root, ref);
3986 BUG_ON(ret);
3987 btrfs_remove_leaf_ref(root, ref);
3988 btrfs_free_leaf_ref(root, ref);
3989 } else {
3990 /*
3991 * the leaf wasn't in the reference cache, so
3992 * we have to read it.
3993 */
3994 leaf = read_tree_block(root, bytenr, blocksize,
3995 ptr_gen);
3996 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3997 BUG_ON(ret);
3998 free_extent_buffer(leaf);
3999 }
4000 atomic_inc(&root->fs_info->throttle_gen);
4001 wake_up(&root->fs_info->transaction_throttle);
4002 cond_resched();
4003 }
4004
4005 /*
4006 * run through the loop again to free the refs on the leaves.
4007 * This is faster than doing it in the loop above because
4008 * the leaves are likely to be clustered together. We end up
4009 * working in nice chunks on the extent allocation tree.
4010 */
4011 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret);
4017
4018 atomic_inc(&root->fs_info->throttle_gen);
4019 wake_up(&root->fs_info->transaction_throttle);
4020 cond_resched();
4021 }
4022out:
4023 kfree(sorted);
4024
4025 /*
4026 * update the path to show we've processed the entire level 1
4027 * node. This will get saved into the root's drop_snapshot_progress
4028 * field so these drops are not repeated again if this transaction
4029 * commits.
4030 */
4031 path->slots[1] = nritems;
4032 return 0;
4033}
4034
4035/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 4036 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 4037 * counts as it goes.
3502 */ 4038 */
@@ -3511,7 +4047,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 4047 struct extent_buffer *next;
3512 struct extent_buffer *cur; 4048 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 4049 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 4050 u32 blocksize;
3516 int ret; 4051 int ret;
3517 u32 refs; 4052 u32 refs;
@@ -3538,17 +4073,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 4073 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 4074 btrfs_header_nritems(cur))
3540 break; 4075 break;
4076
4077 /* the new code goes down to level 1 and does all the
4078 * leaves pointed to that node in bulk. So, this check
4079 * for level 0 will always be false.
4080 *
4081 * But, the disk format allows the drop_snapshot_progress
4082 * field in the root to leave things in a state where
4083 * a leaf will need cleaning up here. If someone crashes
4084 * with the old code and then boots with the new code,
4085 * we might find a leaf here.
4086 */
3541 if (*level == 0) { 4087 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 4088 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 4089 BUG_ON(ret);
3544 break; 4090 break;
3545 } 4091 }
4092
4093 /*
4094 * once we get to level one, process the whole node
4095 * at once, including everything below it.
4096 */
4097 if (*level == 1) {
4098 ret = drop_level_one_refs(trans, root, path);
4099 BUG_ON(ret);
4100 break;
4101 }
4102
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 4103 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 4105 blocksize = btrfs_level_size(root, *level - 1);
3549 4106
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 4108 BUG_ON(ret);
4109
4110 /*
4111 * if there is more than one reference, we don't need
4112 * to read that node to drop any references it has. We
4113 * just drop the ref we hold on that node and move on to the
4114 * next slot in this level.
4115 */
3552 if (refs != 1) { 4116 if (refs != 1) {
3553 parent = path->nodes[*level]; 4117 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 4118 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +4131,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 4131
3568 continue; 4132 continue;
3569 } 4133 }
4134
3570 /* 4135 /*
3571 * at this point, we have a single ref, and since the 4136 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 4137 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 4138 */
3576 if (*level == 1) { 4139 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 4140 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 4141 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 4142 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +4161,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 4161 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 4162 root_gen = btrfs_header_generation(parent);
3633 4163
4164 /*
4165 * cleanup and free the reference on the last node
4166 * we processed
4167 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 4169 parent->start, root_owner, root_gen,
3636 *level, 1); 4170 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 4171 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 4172 path->nodes[*level] = NULL;
4173
3639 *level += 1; 4174 *level += 1;
3640 BUG_ON(ret); 4175 BUG_ON(ret);
3641 4176
@@ -3687,6 +4222,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 4222
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 4223 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 4224 btrfs_tree_lock(next);
4225 btrfs_set_lock_blocking(next);
3690 4226
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 4227 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 4228 &refs);
@@ -3754,6 +4290,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4290 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4291 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4292 struct btrfs_disk_key disk_key;
4293
4294 /*
4295 * there is more work to do in this level.
4296 * Update the drop_progress marker to reflect
4297 * the work we've done so far, and then bump
4298 * the slot number
4299 */
3757 node = path->nodes[i]; 4300 node = path->nodes[i];
3758 path->slots[i]++; 4301 path->slots[i]++;
3759 *level = i; 4302 *level = i;
@@ -3765,6 +4308,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4308 return 0;
3766 } else { 4309 } else {
3767 struct extent_buffer *parent; 4310 struct extent_buffer *parent;
4311
4312 /*
4313 * this whole node is done, free our reference
4314 * on it and go up one level
4315 */
3768 if (path->nodes[*level] == root->node) 4316 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4317 parent = path->nodes[*level];
3770 else 4318 else
@@ -3891,13 +4439,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3891 path = btrfs_alloc_path(); 4439 path = btrfs_alloc_path();
3892 BUG_ON(!path); 4440 BUG_ON(!path);
3893 4441
3894 BUG_ON(!btrfs_tree_locked(parent)); 4442 btrfs_assert_tree_locked(parent);
3895 parent_level = btrfs_header_level(parent); 4443 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent); 4444 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent; 4445 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent); 4446 path->slots[parent_level] = btrfs_header_nritems(parent);
3899 4447
3900 BUG_ON(!btrfs_tree_locked(node)); 4448 btrfs_assert_tree_locked(node);
3901 level = btrfs_header_level(node); 4449 level = btrfs_header_level(node);
3902 extent_buffer_get(node); 4450 extent_buffer_get(node);
3903 path->nodes[level] = node; 4451 path->nodes[level] = node;
@@ -4444,7 +4992,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4992 u64 lock_end = 0;
4445 u64 num_bytes; 4993 u64 num_bytes;
4446 u64 ext_offset; 4994 u64 ext_offset;
4447 u64 first_pos; 4995 u64 search_end = (u64)-1;
4448 u32 nritems; 4996 u32 nritems;
4449 int nr_scaned = 0; 4997 int nr_scaned = 0;
4450 int extent_locked = 0; 4998 int extent_locked = 0;
@@ -4452,7 +5000,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 5000 int ret;
4453 5001
4454 memcpy(&key, leaf_key, sizeof(key)); 5002 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 5003 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 5004 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 5005 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +5048,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 5048 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 5049 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 5050 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 5051 key.offset >= search_end)
4505 break; 5052 break;
4506 } 5053 }
4507 5054
@@ -4534,8 +5081,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 5081 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 5082 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 5083
4537 if (first_pos > key.offset - ext_offset) 5084 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 5085 search_end = key.offset - ext_offset +
5086 btrfs_file_extent_ram_bytes(leaf, fi);
5087 }
4539 5088
4540 if (!extent_locked) { 5089 if (!extent_locked) {
4541 lock_start = key.offset; 5090 lock_start = key.offset;
@@ -4724,7 +5273,7 @@ next:
4724 } 5273 }
4725skip: 5274skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5275 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5276 key.offset >= search_end)
4728 break; 5277 break;
4729 5278
4730 cond_resched(); 5279 cond_resched();
@@ -4778,6 +5327,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5327 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5328 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5329 ref->generation = btrfs_header_generation(buf);
5330
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5331 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5332 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5333 btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5901,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5351 prev_block = block_start; 5901 prev_block = block_start;
5352 } 5902 }
5353 5903
5904 mutex_lock(&extent_root->fs_info->trans_mutex);
5354 btrfs_record_root_in_trans(found_root); 5905 btrfs_record_root_in_trans(found_root);
5906 mutex_unlock(&extent_root->fs_info->trans_mutex);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5907 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /* 5908 /*
5357 * try to update data extent references while 5909 * try to update data extent references while
@@ -5789,6 +6341,7 @@ out:
5789int btrfs_free_block_groups(struct btrfs_fs_info *info) 6341int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{ 6342{
5791 struct btrfs_block_group_cache *block_group; 6343 struct btrfs_block_group_cache *block_group;
6344 struct btrfs_space_info *space_info;
5792 struct rb_node *n; 6345 struct rb_node *n;
5793 6346
5794 spin_lock(&info->block_group_cache_lock); 6347 spin_lock(&info->block_group_cache_lock);
@@ -5810,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
5810 spin_lock(&info->block_group_cache_lock); 6363 spin_lock(&info->block_group_cache_lock);
5811 } 6364 }
5812 spin_unlock(&info->block_group_cache_lock); 6365 spin_unlock(&info->block_group_cache_lock);
6366
6367 /* now that all the block groups are freed, go through and
6368 * free all the space_info structs. This is only called during
6369 * the final stages of unmount, and so we know nobody is
6370 * using them. We call synchronize_rcu() once before we start,
6371 * just to be on the safe side.
6372 */
6373 synchronize_rcu();
6374
6375 while(!list_empty(&info->space_info)) {
6376 space_info = list_entry(info->space_info.next,
6377 struct btrfs_space_info,
6378 list);
6379
6380 list_del(&space_info->list);
6381 kfree(space_info);
6382 }
5813 return 0; 6383 return 0;
5814} 6384}
5815 6385
@@ -5957,9 +6527,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6527 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6528 BUG_ON(!path);
5959 6529
5960 btrfs_remove_free_space_cache(block_group); 6530 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6531 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6532 &root->fs_info->block_group_cache_tree);
6533 spin_unlock(&root->fs_info->block_group_cache_lock);
6534 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6535 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6536 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6537 up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1fa..ebe6b29e6069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/pagevec.h> 13#include <linux/pagevec.h>
15#include "extent_io.h" 14#include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
31static LIST_HEAD(states); 30static LIST_HEAD(states);
32 31
33#define LEAK_DEBUG 0 32#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG 33#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock); 34static DEFINE_SPINLOCK(leak_lock);
36#endif 35#endif
37 36
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
120static struct extent_state *alloc_extent_state(gfp_t mask) 119static struct extent_state *alloc_extent_state(gfp_t mask)
121{ 120{
122 struct extent_state *state; 121 struct extent_state *state;
123#ifdef LEAK_DEBUG 122#if LEAK_DEBUG
124 unsigned long flags; 123 unsigned long flags;
125#endif 124#endif
126 125
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
130 state->state = 0; 129 state->state = 0;
131 state->private = 0; 130 state->private = 0;
132 state->tree = NULL; 131 state->tree = NULL;
133#ifdef LEAK_DEBUG 132#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags); 133 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states); 134 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags); 135 spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
145 if (!state) 144 if (!state)
146 return; 145 return;
147 if (atomic_dec_and_test(&state->refs)) { 146 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG 147#if LEAK_DEBUG
149 unsigned long flags; 148 unsigned long flags;
150#endif 149#endif
151 WARN_ON(state->tree); 150 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG 151#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags); 152 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list); 153 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags); 154 spin_unlock_irqrestore(&leak_lock, flags);
@@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
416 415
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 416 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) { 417 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc); 418 free_extent_state(prealloc);
422 return -EEXIST; 419 return -EEXIST;
423 } 420 }
@@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2378 int scanned = 0; 2375 int scanned = 0;
2379 int range_whole = 0; 2376 int range_whole = 0;
2380 2377
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0); 2378 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) { 2379 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */ 2380 index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2847,98 @@ out:
2855 return sector; 2847 return sector;
2856} 2848}
2857 2849
2850int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2851 __u64 start, __u64 len, get_extent_t *get_extent)
2852{
2853 int ret;
2854 u64 off = start;
2855 u64 max = start + len;
2856 u32 flags = 0;
2857 u64 disko = 0;
2858 struct extent_map *em = NULL;
2859 int end = 0;
2860 u64 em_start = 0, em_len = 0;
2861 unsigned long emflags;
2862 ret = 0;
2863
2864 if (len == 0)
2865 return -EINVAL;
2866
2867 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2868 GFP_NOFS);
2869 em = get_extent(inode, NULL, 0, off, max - off, 0);
2870 if (!em)
2871 goto out;
2872 if (IS_ERR(em)) {
2873 ret = PTR_ERR(em);
2874 goto out;
2875 }
2876 while (!end) {
2877 off = em->start + em->len;
2878 if (off >= max)
2879 end = 1;
2880
2881 em_start = em->start;
2882 em_len = em->len;
2883
2884 disko = 0;
2885 flags = 0;
2886
2887 switch (em->block_start) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST;
2891 break;
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break;
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break;
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN);
2902 break;
2903 default:
2904 disko = em->block_start;
2905 break;
2906 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED;
2909
2910 emflags = em->flags;
2911 free_extent_map(em);
2912 em = NULL;
2913
2914 if (!end) {
2915 em = get_extent(inode, NULL, 0, off, max - off, 0);
2916 if (!em)
2917 goto out;
2918 if (IS_ERR(em)) {
2919 ret = PTR_ERR(em);
2920 goto out;
2921 }
2922 emflags = em->flags;
2923 }
2924 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2925 flags |= FIEMAP_EXTENT_LAST;
2926 end = 1;
2927 }
2928
2929 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2930 em_len, flags);
2931 if (ret)
2932 goto out_free;
2933 }
2934out_free:
2935 free_extent_map(em);
2936out:
2937 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2938 GFP_NOFS);
2939 return ret;
2940}
2941
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2942static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i) 2943 unsigned long i)
2860{ 2944{
@@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2892 gfp_t mask) 2976 gfp_t mask)
2893{ 2977{
2894 struct extent_buffer *eb = NULL; 2978 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG 2979#if LEAK_DEBUG
2896 unsigned long flags; 2980 unsigned long flags;
2897#endif 2981#endif
2898 2982
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2983 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start; 2984 eb->start = start;
2901 eb->len = len; 2985 eb->len = len;
2902 mutex_init(&eb->mutex); 2986 spin_lock_init(&eb->lock);
2903#ifdef LEAK_DEBUG 2987 init_waitqueue_head(&eb->lock_wq);
2988
2989#if LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags); 2990 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers); 2991 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags); 2992 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2912 2998
2913static void __free_extent_buffer(struct extent_buffer *eb) 2999static void __free_extent_buffer(struct extent_buffer *eb)
2914{ 3000{
2915#ifdef LEAK_DEBUG 3001#if LEAK_DEBUG
2916 unsigned long flags; 3002 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags); 3003 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list); 3004 list_del(&eb->leak_list);
@@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2980 unlock_page(p); 3066 unlock_page(p);
2981 } 3067 }
2982 if (uptodate) 3068 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE; 3069 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985 3070
2986 spin_lock(&tree->buffer_lock); 3071 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3072 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3135 unsigned long num_pages; 3220 unsigned long num_pages;
3136 3221
3137 num_pages = num_extent_pages(eb->start, eb->len); 3222 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE; 3223 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3139 3224
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3225 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS); 3226 GFP_NOFS);
@@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3206 struct page *page; 3291 struct page *page;
3207 int pg_uptodate = 1; 3292 int pg_uptodate = 1;
3208 3293
3209 if (eb->flags & EXTENT_UPTODATE) 3294 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3210 return 1; 3295 return 1;
3211 3296
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3297 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 struct bio *bio = NULL; 3327 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0; 3328 unsigned long bio_flags = 0;
3244 3329
3245 if (eb->flags & EXTENT_UPTODATE) 3330 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3246 return 0; 3331 return 0;
3247 3332
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3333 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3273 } 3358 }
3274 if (all_uptodate) { 3359 if (all_uptodate) {
3275 if (start_i == 0) 3360 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE; 3361 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 goto unlock_exit; 3362 goto unlock_exit;
3278 } 3363 }
3279 3364
@@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3309 } 3394 }
3310 3395
3311 if (!ret) 3396 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE; 3397 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3313 return ret; 3398 return ret;
3314 3399
3315unlock_exit: 3400unlock_exit:
@@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3406 unmap_extent_buffer(eb, eb->map_token, km); 3491 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL; 3492 eb->map_token = NULL;
3408 save = 1; 3493 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 } 3494 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map, 3495 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km); 3496 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a79137..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
193 unsigned from, unsigned to); 205 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 206sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent); 207 get_extent_t *get_extent);
208int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
209 __u64 start, __u64 len, get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 210int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 211int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 212int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada32..50da69da20ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h> 6#include <linux/hardirq.h>
8#include "extent_map.h" 7#include "extent_map.h"
9 8
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145e..dc78954861b3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
29#include <linux/writeback.h> 29#include <linux/writeback.h>
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h" 32#include "ctree.h"
34#include "disk-io.h" 33#include "disk-io.h"
35#include "transaction.h" 34#include "transaction.h"
@@ -1092,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1092 WARN_ON(num_pages > nrptrs); 1091 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs); 1092 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094 1093
1095 ret = btrfs_check_free_space(root, write_bytes, 0); 1094 ret = btrfs_check_data_free_space(root, inode, write_bytes);
1096 if (ret) 1095 if (ret)
1097 goto out; 1096 goto out;
1098 1097
1099 ret = prepare_pages(root, file, pages, num_pages, 1098 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index, 1099 pos, first_index, last_index,
1101 write_bytes); 1100 write_bytes);
1102 if (ret) 1101 if (ret) {
1102 btrfs_free_reserved_data_space(root, inode,
1103 write_bytes);
1103 goto out; 1104 goto out;
1105 }
1104 1106
1105 ret = btrfs_copy_from_user(pos, num_pages, 1107 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf); 1108 write_bytes, pages, buf);
1107 if (ret) { 1109 if (ret) {
1110 btrfs_free_reserved_data_space(root, inode,
1111 write_bytes);
1108 btrfs_drop_pages(pages, num_pages); 1112 btrfs_drop_pages(pages, num_pages);
1109 goto out; 1113 goto out;
1110 } 1114 }
@@ -1112,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1112 ret = dirty_and_release_pages(NULL, root, file, pages, 1116 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes); 1117 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages); 1118 btrfs_drop_pages(pages, num_pages);
1115 if (ret) 1119 if (ret) {
1120 btrfs_free_reserved_data_space(root, inode,
1121 write_bytes);
1116 goto out; 1122 goto out;
1123 }
1117 1124
1118 if (will_write) { 1125 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos, 1126 btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1137,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1137 } 1144 }
1138out: 1145out:
1139 mutex_unlock(&inode->i_mutex); 1146 mutex_unlock(&inode->i_mutex);
1147 if (ret)
1148 err = ret;
1140 1149
1141out_nolock: 1150out_nolock:
1142 kfree(pages); 1151 kfree(pages);
@@ -1215,15 +1224,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1215 } 1224 }
1216 mutex_unlock(&root->fs_info->trans_mutex); 1225 mutex_unlock(&root->fs_info->trans_mutex);
1217 1226
1218 root->fs_info->tree_log_batch++; 1227 root->log_batch++;
1219 filemap_fdatawrite(inode->i_mapping); 1228 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1229 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++; 1230 root->log_batch++;
1222 1231
1223 /* 1232 /*
1224 * ok we haven't committed the transaction yet, lets do a commit 1233 * ok we haven't committed the transaction yet, lets do a commit
1225 */ 1234 */
1226 if (file->private_data) 1235 if (file && file->private_data)
1227 btrfs_ioctl_trans_end(file); 1236 btrfs_ioctl_trans_end(file);
1228 1237
1229 trans = btrfs_start_transaction(root, 1); 1238 trans = btrfs_start_transaction(root, 1);
@@ -1232,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1232 goto out; 1241 goto out;
1233 } 1242 }
1234 1243
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); 1244 ret = btrfs_log_dentry_safe(trans, root, dentry);
1236 if (ret < 0) 1245 if (ret < 0)
1237 goto out; 1246 goto out;
1238 1247
@@ -1246,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1246 * file again, but that will end up using the synchronization 1255 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe. 1256 * inside btrfs_sync_log to keep things safe.
1248 */ 1257 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex); 1258 mutex_unlock(&dentry->d_inode->i_mutex);
1250 1259
1251 if (ret > 0) { 1260 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root); 1261 ret = btrfs_commit_transaction(trans, root);
@@ -1254,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1254 btrfs_sync_log(trans, root); 1263 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root); 1264 ret = btrfs_end_transaction(trans, root);
1256 } 1265 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex); 1266 mutex_lock(&dentry->d_inode->i_mutex);
1258out: 1267out:
1259 return ret > 0 ? EIO : ret; 1268 return ret > 0 ? EIO : ret;
1260} 1269}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
86 86
87 btrfs_init_path(path);
88 start_found = 0; 87 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); 88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0) 89 if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab41..7d4f948bc22a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/statfs.h> 34#include <linux/statfs.h>
35#include <linux/compat.h> 35#include <linux/compat.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h> 37#include <linux/xattr.h>
39#include <linux/posix_acl.h> 38#include <linux/posix_acl.h>
40#include <linux/falloc.h> 39#include <linux/falloc.h>
@@ -51,6 +50,7 @@
51#include "tree-log.h" 50#include "tree-log.h"
52#include "ref-cache.h" 51#include "ref-cache.h"
53#include "compression.h" 52#include "compression.h"
53#include "locking.h"
54 54
55struct btrfs_iget_args { 55struct btrfs_iget_args {
56 u64 ino; 56 u64 ino;
@@ -91,32 +91,14 @@ static noinline int cow_file_range(struct inode *inode,
91 u64 start, u64 end, int *page_started, 91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock); 92 unsigned long *nr_written, int unlock);
93 93
94/* 94static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{ 95{
101 u64 total; 96 int err;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115 97
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh) 98 err = btrfs_init_acl(inode, dir);
117 ret = -ENOSPC; 99 if (!err)
118 spin_unlock(&root->fs_info->delalloc_lock); 100 err = btrfs_xattr_security_init(inode, dir);
119 return ret; 101 return err;
120} 102}
121 103
122/* 104/*
@@ -350,6 +332,19 @@ again:
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 332 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 333 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352 334
335 /*
336 * we don't want to send crud past the end of i_size through
337 * compression, that's just a waste of CPU time. So, if the
338 * end of the file is before the start of our current
339 * requested range of bytes, we bail out to the uncompressed
340 * cleanup code that can deal with all of this.
341 *
342 * It isn't really the fastest way to fix things, but this is a
343 * very uncommon corner.
344 */
345 if (actual_end <= start)
346 goto cleanup_and_bail_uncompressed;
347
353 total_compressed = actual_end - start; 348 total_compressed = actual_end - start;
354 349
355 /* we want to make sure that amount of ram required to uncompress 350 /* we want to make sure that amount of ram required to uncompress
@@ -494,6 +489,7 @@ again:
494 goto again; 489 goto again;
495 } 490 }
496 } else { 491 } else {
492cleanup_and_bail_uncompressed:
497 /* 493 /*
498 * No compression, but we still need to write the pages in 494 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked 495 * the file we've been given so far. redirty the locked
@@ -1166,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1166 */ 1162 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1163 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root; 1164 struct btrfs_root *root = BTRFS_I(inode)->root;
1165 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1169 spin_lock(&root->fs_info->delalloc_lock); 1166 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1167 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1; 1168 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1199,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1199 (unsigned long long)end - start + 1, 1196 (unsigned long long)end - start + 1,
1200 (unsigned long long) 1197 (unsigned long long)
1201 root->fs_info->delalloc_bytes); 1198 root->fs_info->delalloc_bytes);
1199 btrfs_delalloc_free_space(root, inode, (u64)-1);
1202 root->fs_info->delalloc_bytes = 0; 1200 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0; 1201 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else { 1202 } else {
1203 btrfs_delalloc_free_space(root, inode,
1204 end - start + 1);
1205 root->fs_info->delalloc_bytes -= end - start + 1; 1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 } 1207 }
@@ -1324,12 +1324,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset, 1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list) 1325 struct list_head *list)
1326{ 1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum; 1327 struct btrfs_ordered_sum *sum;
1329 1328
1330 btrfs_set_trans_block_group(trans, inode); 1329 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) { 1330
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1331 list_for_each_entry(sum, list, list) {
1333 btrfs_csum_file_blocks(trans, 1332 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1333 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 } 1334 }
@@ -2013,6 +2012,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2012 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014 2013
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2014 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2015
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2017 alloc_group_block, 0);
2018 btrfs_free_path(path); 2018 btrfs_free_path(path);
@@ -2039,6 +2039,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break; 2040 break;
2041 default: 2041 default:
2042 inode->i_op = &btrfs_special_inode_operations;
2042 init_special_inode(inode, inode->i_mode, rdev); 2043 init_special_inode(inode, inode->i_mode, rdev);
2043 break; 2044 break;
2044 } 2045 }
@@ -2108,6 +2109,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2108 goto failed; 2109 goto failed;
2109 } 2110 }
2110 2111
2112 btrfs_unlock_up_safe(path, 1);
2111 leaf = path->nodes[0]; 2113 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2114 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item); 2115 struct btrfs_inode_item);
@@ -2219,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2219 2221
2220 root = BTRFS_I(dir)->root; 2222 root = BTRFS_I(dir)->root;
2221 2223
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1); 2224 trans = btrfs_start_transaction(root, 1);
2227 2225
2228 btrfs_set_trans_block_group(trans, dir); 2226 btrfs_set_trans_block_group(trans, dir);
@@ -2235,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2235 nr = trans->blocks_used; 2233 nr = trans->blocks_used;
2236 2234
2237 btrfs_end_transaction_throttle(trans, root); 2235 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr); 2236 btrfs_btree_balance_dirty(root, nr);
2240 return ret; 2237 return ret;
2241} 2238}
@@ -2258,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2258 return -ENOTEMPTY; 2255 return -ENOTEMPTY;
2259 } 2256 }
2260 2257
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1); 2258 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir); 2259 btrfs_set_trans_block_group(trans, dir);
2267 2260
@@ -2278,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2278fail_trans: 2271fail_trans:
2279 nr = trans->blocks_used; 2272 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root); 2273 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr); 2274 btrfs_btree_balance_dirty(root, nr);
2283 2275
2284 if (ret && !err) 2276 if (ret && !err)
@@ -2429,6 +2421,8 @@ next_node:
2429 ref->generation = leaf_gen; 2421 ref->generation = leaf_gen;
2430 ref->nritems = 0; 2422 ref->nritems = 0;
2431 2423
2424 btrfs_sort_leaf_ref(ref);
2425
2432 ret = btrfs_add_leaf_ref(root, ref, 0); 2426 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret); 2427 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref); 2428 btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2470,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_path *path; 2470 struct btrfs_path *path;
2477 struct btrfs_key key; 2471 struct btrfs_key key;
2478 struct btrfs_key found_key; 2472 struct btrfs_key found_key;
2479 u32 found_type; 2473 u32 found_type = (u8)-1;
2480 struct extent_buffer *leaf; 2474 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi; 2475 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0; 2476 u64 extent_start = 0;
@@ -2503,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2503 key.offset = (u64)-1; 2497 key.offset = (u64)-1;
2504 key.type = (u8)-1; 2498 key.type = (u8)-1;
2505 2499
2506 btrfs_init_path(path);
2507
2508search_again: 2500search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0) 2502 if (ret < 0)
@@ -2663,6 +2655,8 @@ next:
2663 if (pending_del_nr) 2655 if (pending_del_nr)
2664 goto del_pending; 2656 goto del_pending;
2665 btrfs_release_path(root, path); 2657 btrfs_release_path(root, path);
2658 if (found_type == BTRFS_INODE_ITEM_KEY)
2659 break;
2666 goto search_again; 2660 goto search_again;
2667 } 2661 }
2668 2662
@@ -2679,6 +2673,8 @@ del_pending:
2679 BUG_ON(ret); 2673 BUG_ON(ret);
2680 pending_del_nr = 0; 2674 pending_del_nr = 0;
2681 btrfs_release_path(root, path); 2675 btrfs_release_path(root, path);
2676 if (found_type == BTRFS_INODE_ITEM_KEY)
2677 break;
2682 goto search_again; 2678 goto search_again;
2683 } 2679 }
2684 } 2680 }
@@ -2788,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2788 if (size <= hole_start) 2784 if (size <= hole_start)
2789 return 0; 2785 return 0;
2790 2786
2791 err = btrfs_check_free_space(root, 1, 0); 2787 err = btrfs_check_metadata_free_space(root);
2792 if (err) 2788 if (err)
2793 return err; 2789 return err;
2794 2790
@@ -2984,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->last_trans = 0; 2980 bi->last_trans = 0;
2985 bi->logged_trans = 0; 2981 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0; 2982 bi->delalloc_bytes = 0;
2983 bi->reserved_bytes = 0;
2987 bi->disk_i_size = 0; 2984 bi->disk_i_size = 0;
2988 bi->flags = 0; 2985 bi->flags = 0;
2989 bi->index_cnt = (u64)-1; 2986 bi->index_cnt = (u64)-1;
@@ -3005,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
3005 inode->i_ino = args->ino; 3002 inode->i_ino = args->ino;
3006 init_btrfs_i(inode); 3003 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root; 3004 BTRFS_I(inode)->root = args->root;
3005 btrfs_set_inode_space_info(args->root, inode);
3008 return 0; 3006 return 0;
3009} 3007}
3010 3008
@@ -3265,7 +3263,7 @@ skip:
3265 3263
3266 /* Reached end of directory/root. Bump pos past the last item. */ 3264 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3265 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3266 filp->f_pos = INT_LIMIT(off_t);
3269 else 3267 else
3270 filp->f_pos++; 3268 filp->f_pos++;
3271nopos: 3269nopos:
@@ -3425,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3425 BTRFS_I(inode)->index_cnt = 2; 3423 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root; 3424 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid; 3425 BTRFS_I(inode)->generation = trans->transid;
3426 btrfs_set_inode_space_info(root, inode);
3428 3427
3429 if (mode & S_IFDIR) 3428 if (mode & S_IFDIR)
3430 owner = 0; 3429 owner = 0;
@@ -3458,7 +3457,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3458 root->highest_inode = objectid; 3457 root->highest_inode = objectid;
3459 3458
3460 inode->i_uid = current_fsuid(); 3459 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid(); 3460
3461 if (dir && (dir->i_mode & S_ISGID)) {
3462 inode->i_gid = dir->i_gid;
3463 if (S_ISDIR(mode))
3464 mode |= S_ISGID;
3465 } else
3466 inode->i_gid = current_fsgid();
3467
3462 inode->i_mode = mode; 3468 inode->i_mode = mode;
3463 inode->i_ino = objectid; 3469 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0); 3470 inode_set_bytes(inode, 0);
@@ -3565,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3565 if (!new_valid_dev(rdev)) 3571 if (!new_valid_dev(rdev))
3566 return -EINVAL; 3572 return -EINVAL;
3567 3573
3568 err = btrfs_check_free_space(root, 1, 0); 3574 err = btrfs_check_metadata_free_space(root);
3569 if (err) 3575 if (err)
3570 goto fail; 3576 goto fail;
3571 3577
@@ -3586,7 +3592,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3586 if (IS_ERR(inode)) 3592 if (IS_ERR(inode))
3587 goto out_unlock; 3593 goto out_unlock;
3588 3594
3589 err = btrfs_init_acl(inode, dir); 3595 err = btrfs_init_inode_security(inode, dir);
3590 if (err) { 3596 if (err) {
3591 drop_inode = 1; 3597 drop_inode = 1;
3592 goto out_unlock; 3598 goto out_unlock;
@@ -3628,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3628 u64 objectid; 3634 u64 objectid;
3629 u64 index = 0; 3635 u64 index = 0;
3630 3636
3631 err = btrfs_check_free_space(root, 1, 0); 3637 err = btrfs_check_metadata_free_space(root);
3632 if (err) 3638 if (err)
3633 goto fail; 3639 goto fail;
3634 trans = btrfs_start_transaction(root, 1); 3640 trans = btrfs_start_transaction(root, 1);
@@ -3649,7 +3655,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3649 if (IS_ERR(inode)) 3655 if (IS_ERR(inode))
3650 goto out_unlock; 3656 goto out_unlock;
3651 3657
3652 err = btrfs_init_acl(inode, dir); 3658 err = btrfs_init_inode_security(inode, dir);
3653 if (err) { 3659 if (err) {
3654 drop_inode = 1; 3660 drop_inode = 1;
3655 goto out_unlock; 3661 goto out_unlock;
@@ -3696,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3696 return -ENOENT; 3702 return -ENOENT;
3697 3703
3698 btrfs_inc_nlink(inode); 3704 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0); 3705 err = btrfs_check_metadata_free_space(root);
3700 if (err) 3706 if (err)
3701 goto fail; 3707 goto fail;
3702 err = btrfs_set_inode_index(dir, &index); 3708 err = btrfs_set_inode_index(dir, &index);
@@ -3742,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3742 u64 index = 0; 3748 u64 index = 0;
3743 unsigned long nr = 1; 3749 unsigned long nr = 1;
3744 3750
3745 err = btrfs_check_free_space(root, 1, 0); 3751 err = btrfs_check_metadata_free_space(root);
3746 if (err) 3752 if (err)
3747 goto out_unlock; 3753 goto out_unlock;
3748 3754
@@ -3772,7 +3778,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3772 3778
3773 drop_on_err = 1; 3779 drop_on_err = 1;
3774 3780
3775 err = btrfs_init_acl(inode, dir); 3781 err = btrfs_init_inode_security(inode, dir);
3776 if (err) 3782 if (err)
3777 goto out_fail; 3783 goto out_fail;
3778 3784
@@ -4158,9 +4164,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4158 return -EINVAL; 4164 return -EINVAL;
4159} 4165}
4160 4166
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4167static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4168 __u64 start, __u64 len)
4162{ 4169{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent); 4170 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4164} 4171}
4165 4172
4166int btrfs_readpage(struct file *file, struct page *page) 4173int btrfs_readpage(struct file *file, struct page *page)
@@ -4223,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{ 4230{
4224 if (PageWriteback(page) || PageDirty(page)) 4231 if (PageWriteback(page) || PageDirty(page))
4225 return 0; 4232 return 0;
4226 return __btrfs_releasepage(page, gfp_flags); 4233 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
4227} 4234}
4228 4235
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4236static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4298,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4298 u64 page_start; 4305 u64 page_start;
4299 u64 page_end; 4306 u64 page_end;
4300 4307
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); 4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4302 if (ret) 4309 if (ret)
4303 goto out; 4310 goto out;
4304 4311
@@ -4311,6 +4318,7 @@ again:
4311 4318
4312 if ((page->mapping != inode->i_mapping) || 4319 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) { 4320 (page_start >= size)) {
4321 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4314 /* page got truncated out from underneath us */ 4322 /* page got truncated out from underneath us */
4315 goto out_unlock; 4323 goto out_unlock;
4316 } 4324 }
@@ -4593,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 4601 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV; 4602 return -EXDEV;
4595 4603
4596 ret = btrfs_check_free_space(root, 1, 0); 4604 ret = btrfs_check_metadata_free_space(root);
4597 if (ret) 4605 if (ret)
4598 goto out_unlock; 4606 goto out_unlock;
4599 4607
@@ -4711,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 4719 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG; 4720 return -ENAMETOOLONG;
4713 4721
4714 err = btrfs_check_free_space(root, 1, 0); 4722 err = btrfs_check_metadata_free_space(root);
4715 if (err) 4723 if (err)
4716 goto out_fail; 4724 goto out_fail;
4717 4725
@@ -4733,7 +4741,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4733 if (IS_ERR(inode)) 4741 if (IS_ERR(inode))
4734 goto out_unlock; 4742 goto out_unlock;
4735 4743
4736 err = btrfs_init_acl(inode, dir); 4744 err = btrfs_init_inode_security(inode, dir);
4737 if (err) { 4745 if (err) {
4738 drop_inode = 1; 4746 drop_inode = 1;
4739 goto out_unlock; 4747 goto out_unlock;
@@ -4987,13 +4995,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
4987 .clear_bit_hook = btrfs_clear_bit_hook, 4995 .clear_bit_hook = btrfs_clear_bit_hook,
4988}; 4996};
4989 4997
4998/*
4999 * btrfs doesn't support the bmap operation because swapfiles
5000 * use bmap to make a mapping of extents in the file. They assume
5001 * these extents won't change over the life of the file and they
5002 * use the bmap result to do IO directly to the drive.
5003 *
5004 * the btrfs bmap call would return logical addresses that aren't
5005 * suitable for IO and they also will change frequently as COW
5006 * operations happen. So, swapfile + btrfs == corruption.
5007 *
5008 * For now we're avoiding this by dropping bmap.
5009 */
4990static struct address_space_operations btrfs_aops = { 5010static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage, 5011 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage, 5012 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages, 5013 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages, 5014 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page, 5015 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO, 5016 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage, 5017 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage, 5018 .releasepage = btrfs_releasepage,
@@ -5017,6 +5036,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5017 .removexattr = btrfs_removexattr, 5036 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission, 5037 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate, 5038 .fallocate = btrfs_fallocate,
5039 .fiemap = btrfs_fiemap,
5020}; 5040};
5021static struct inode_operations btrfs_special_inode_operations = { 5041static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr, 5042 .getattr = btrfs_getattr,
@@ -5032,4 +5052,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5032 .follow_link = page_follow_link_light, 5052 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link, 5053 .put_link = page_put_link,
5034 .permission = btrfs_permission, 5054 .permission = btrfs_permission,
5055 .setxattr = btrfs_setxattr,
5056 .getxattr = btrfs_getxattr,
5057 .listxattr = btrfs_listxattr,
5058 .removexattr = btrfs_removexattr,
5035}; 5059};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb5..bca729fc80c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
38#include <linux/compat.h> 38#include <linux/compat.h>
39#include <linux/bit_spinlock.h> 39#include <linux/bit_spinlock.h>
40#include <linux/security.h> 40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h> 41#include <linux/xattr.h>
43#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
44#include "compat.h" 43#include "compat.h"
@@ -71,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
71 u64 index = 0; 70 u64 index = 0;
72 unsigned long nr = 1; 71 unsigned long nr = 1;
73 72
74 ret = btrfs_check_free_space(root, 1, 0); 73 ret = btrfs_check_metadata_free_space(root);
75 if (ret) 74 if (ret)
76 goto fail_commit; 75 goto fail_commit;
77 76
@@ -204,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
204 if (!root->ref_cows) 203 if (!root->ref_cows)
205 return -EINVAL; 204 return -EINVAL;
206 205
207 ret = btrfs_check_free_space(root, 1, 0); 206 ret = btrfs_check_metadata_free_space(root);
208 if (ret) 207 if (ret)
209 goto fail_unlock; 208 goto fail_unlock;
210 209
@@ -375,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
375 unsigned long i; 374 unsigned long i;
376 int ret; 375 int ret;
377 376
378 ret = btrfs_check_free_space(root, inode->i_size, 0); 377 ret = btrfs_check_data_free_space(root, inode, inode->i_size);
379 if (ret) 378 if (ret)
380 return -ENOSPC; 379 return -ENOSPC;
381 380
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 78049ea208db..b320b103fa13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,13 +22,20 @@
22 22
23#define BTRFS_IOCTL_MAGIC 0x94 23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255 24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 3072 25#define BTRFS_PATH_NAME_MAX 4087
26 26
27/* this should be 4k */
27struct btrfs_ioctl_vol_args { 28struct btrfs_ioctl_vol_args {
28 __s64 fd; 29 __s64 fd;
29 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
30}; 31};
31 32
33struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd;
35 __u64 src_offset, src_length;
36 __u64 dest_offset;
37};
38
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args) 40 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -52,11 +59,6 @@ struct btrfs_ioctl_vol_args {
52 struct btrfs_ioctl_vol_args) 59 struct btrfs_ioctl_vol_args)
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ 60#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args) 61 struct btrfs_ioctl_vol_args)
55struct btrfs_ioctl_clone_range_args {
56 __s64 src_fd;
57 __u64 src_offset, src_length;
58 __u64 dest_offset;
59};
60 62
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ 63#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args) 64 struct btrfs_ioctl_clone_range_args)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..47b0a88c12a2 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,64 +25,203 @@
25#include "extent_io.h" 25#include "extent_io.h"
26#include "locking.h" 26#include "locking.h"
27 27
28static inline void spin_nested(struct extent_buffer *eb)
29{
30 spin_lock(&eb->lock);
31}
32
28/* 33/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 34 * Setting a lock to blocking will drop the spinlock and set the
30 * and the spin is not tuned very extensively. The spinning does make a big 35 * flag that forces other procs who want the lock to wait. After
31 * difference in almost every workload, but spinning for the right amount of 36 * this you can safely schedule with the lock held.
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 37 */
38void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{
40 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
41 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
42 spin_unlock(&eb->lock);
43 }
44 /* exit with the spin lock released and the bit set */
45}
37 46
38int btrfs_tree_lock(struct extent_buffer *eb) 47/*
48 * clearing the blocking flag will take the spinlock again.
49 * After this you can't safely schedule
50 */
51void btrfs_clear_lock_blocking(struct extent_buffer *eb)
39{ 52{
40 int i; 53 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
54 spin_nested(eb);
55 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
56 smp_mb__after_clear_bit();
57 }
58 /* exit with the spin lock held */
59}
41 60
42 if (mutex_trylock(&eb->mutex)) 61/*
43 return 0; 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 *
67 * So, we're still stuck with this crummy spin on the blocking bit,
68 * at least until the most common causes of the short blocks
69 * can be dealt with.
70 */
71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{
73 int i;
44 for (i = 0; i < 512; i++) { 74 for (i = 0; i < 512; i++) {
45 cpu_relax(); 75 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 77 return 1;
78 if (need_resched())
79 break;
48 } 80 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 81 return 0;
52} 82}
53 83
54int btrfs_try_tree_lock(struct extent_buffer *eb) 84/*
85 * This is somewhat different from trylock. It will take the
86 * spinlock but if it finds the lock is set to blocking, it will
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */
94int btrfs_try_spin_lock(struct extent_buffer *eb)
55{ 95{
56 return mutex_trylock(&eb->mutex); 96 int i;
97
98 spin_nested(eb);
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
100 return 1;
101 spin_unlock(&eb->lock);
102
103 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) {
105 if (!btrfs_spin_on_block(eb))
106 break;
107
108 spin_nested(eb);
109 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
110 return 1;
111 spin_unlock(&eb->lock);
112 }
113 return 0;
57} 114}
58 115
59int btrfs_tree_unlock(struct extent_buffer *eb) 116/*
117 * the autoremove wake function will return 0 if it tried to wake up
118 * a process that was already awake, which means that process won't
119 * count as an exclusive wakeup. The waitq code will continue waking
120 * procs until it finds one that was actually sleeping.
121 *
122 * For btrfs, this isn't quite what we want. We want a single proc
123 * to be notified that the lock is ready for taking. If that proc
124 * already happen to be awake, great, it will loop around and try for
125 * the lock.
126 *
127 * So, btrfs_wake_function always returns 1, even when the proc that we
128 * tried to wake up was already awake.
129 */
130static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
131 int sync, void *key)
60{ 132{
61 mutex_unlock(&eb->mutex); 133 autoremove_wake_function(wait, mode, sync, key);
62 return 0; 134 return 1;
63} 135}
64 136
65int btrfs_tree_locked(struct extent_buffer *eb) 137/*
138 * returns with the extent buffer spinlocked.
139 *
140 * This will spin and/or wait as required to take the lock, and then
141 * return with the spinlock held.
142 *
143 * After this call, scheduling is not safe without first calling
144 * btrfs_set_lock_blocking()
145 */
146int btrfs_tree_lock(struct extent_buffer *eb)
66{ 147{
67 return mutex_is_locked(&eb->mutex); 148 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function;
150
151 while(1) {
152 spin_nested(eb);
153
154 /* nobody is blocking, exit with the spinlock held */
155 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
156 return 0;
157
158 /*
159 * we have the spinlock, but the real owner is blocking.
160 * wait for them
161 */
162 spin_unlock(&eb->lock);
163
164 /*
165 * spin for a bit, and if the blocking flag goes away,
166 * loop around
167 */
168 if (btrfs_spin_on_block(eb))
169 continue;
170
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE);
173
174 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
175 schedule();
176
177 finish_wait(&eb->lock_wq, &wait);
178 }
179 return 0;
68} 180}
69 181
70/* 182/*
71 * btrfs_search_slot uses this to decide if it should drop its locks 183 * Very quick trylock, this does not spin or schedule. It returns
72 * before doing something expensive like allocating free blocks for cow. 184 * 1 with the spinlock held if it was able to take the lock, or it
185 * returns zero if it was unable to take the lock.
186 *
187 * After this call, scheduling is not safe without first calling
188 * btrfs_set_lock_blocking()
73 */ 189 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 190int btrfs_try_tree_lock(struct extent_buffer *eb)
75{ 191{
76 int i; 192 if (spin_trylock(&eb->lock)) {
77 struct extent_buffer *eb; 193 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 194 /*
79 eb = path->nodes[i]; 195 * we've got the spinlock, but the real owner is
80 if (!eb) 196 * blocking. Drop the spinlock and return failure
81 break; 197 */
82 smp_mb(); 198 spin_unlock(&eb->lock);
83 if (!list_empty(&eb->mutex.wait_list)) 199 return 0;
84 return 1; 200 }
201 return 1;
85 } 202 }
203 /* someone else has the spinlock giveup */
86 return 0; 204 return 0;
87} 205}
88 206
207int btrfs_tree_unlock(struct extent_buffer *eb)
208{
209 /*
210 * if we were a blocking owner, we don't have the spinlock held
211 * just clear the bit and look for waiters
212 */
213 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
214 smp_mb__after_clear_bit();
215 else
216 spin_unlock(&eb->lock);
217
218 if (waitqueue_active(&eb->lock_wq))
219 wake_up(&eb->lock_wq);
220 return 0;
221}
222
223void btrfs_assert_tree_locked(struct extent_buffer *eb)
224{
225 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
226 assert_spin_locked(&eb->lock);
227}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..6c4ce457168c 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,7 +21,11 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24
25int btrfs_try_tree_lock(struct extent_buffer *eb); 25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 26int btrfs_try_spin_lock(struct extent_buffer *eb);
27
28void btrfs_set_lock_blocking(struct extent_buffer *eb);
29void btrfs_clear_lock_blocking(struct extent_buffer *eb);
30void btrfs_assert_tree_locked(struct extent_buffer *eb);
27#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a20940170274..77c2411a5f0f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
613 struct btrfs_sector_sum *sector_sums; 613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered; 614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors; 616 unsigned long num_sectors;
618 unsigned long i; 617 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
624 return 1; 623 return 1;
625 624
626 mutex_lock(&tree->mutex); 625 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) { 626 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) { 627 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize; 628 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums; 629 sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9eab..d0cc62bccb94 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "ref-cache.h" 22#include "ref-cache.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c59..bc283ad2db73 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared); 74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif 76#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7256cf242eb0..19a4daf03ccb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
37#include <linux/ctype.h> 37#include <linux/ctype.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h> 40#include <linux/magic.h>
42#include "compat.h" 41#include "compat.h"
43#include "ctree.h" 42#include "ctree.h"
@@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
380 btrfs_start_delalloc_inodes(root); 379 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0); 380 btrfs_wait_ordered_extents(root, 0);
382 381
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1); 382 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root); 383 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0; 384 sb->s_dirt = 0;
@@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
512 struct btrfs_root *root = btrfs_sb(sb); 510 struct btrfs_root *root = btrfs_sb(sb);
513 int ret; 511 int ret;
514 512
513 ret = btrfs_parse_options(root, data);
514 if (ret)
515 return -EINVAL;
516
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0; 518 return 0;
517 519
@@ -582,18 +584,20 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
582{ 584{
583 struct btrfs_ioctl_vol_args *vol; 585 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices; 586 struct btrfs_fs_devices *fs_devices;
585 int ret = 0; 587 int ret = -ENOTTY;
586 int len;
587 588
588 if (!capable(CAP_SYS_ADMIN)) 589 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM; 590 return -EPERM;
590 591
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 592 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
593 if (!vol)
594 return -ENOMEM;
595
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 596 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT; 597 ret = -EFAULT;
594 goto out; 598 goto out;
595 } 599 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); 600
597 switch (cmd) { 601 switch (cmd) {
598 case BTRFS_IOC_SCAN_DEV: 602 case BTRFS_IOC_SCAN_DEV:
599 ret = btrfs_scan_one_device(vol->name, FMODE_READ, 603 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f9443340..4112d53d4f4d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
688 num_bytes -= btrfs_root_used(&dirty->root->root_item); 688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item); 689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) { 690 if (num_bytes) {
691 mutex_lock(&root->fs_info->trans_mutex);
691 btrfs_record_root_in_trans(root); 692 btrfs_record_root_in_trans(root);
693 mutex_unlock(&root->fs_info->trans_mutex);
692 btrfs_set_root_used(&root->root_item, 694 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes); 695 bytes_used - num_bytes);
694 } 696 }
@@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
852{ 854{
853 struct btrfs_pending_snapshot *pending; 855 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots; 856 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret; 857 int ret;
857 858
858 list_for_each(cur, head) { 859 list_for_each_entry(pending, head, list) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending); 860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret); 861 BUG_ON(ret);
862 } 862 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..9c462fbd60fa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1704 1615
1705 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1707 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1709 1621
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1750 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1751 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1753 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1755 1668
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1807 1720
1808 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1810 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1812 1726
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1879 1793
1880 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1882 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1884 1799
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1817 }
1903 } 1818 }
1904 btrfs_free_path(path); 1819 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1820 return ret;
1908} 1821}
1909 1822
1910static int wait_log_commit(struct btrfs_root *log) 1823/*
1824 * helper function to update the item for a given subvolumes log root
1825 * in the tree of log roots
1826 */
1827static int update_log_root(struct btrfs_trans_handle *trans,
1828 struct btrfs_root *log)
1829{
1830 int ret;
1831
1832 if (log->log_transid == 1) {
1833 /* insert root item on the first sync */
1834 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1835 &log->root_key, &log->root_item);
1836 } else {
1837 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1838 &log->root_key, &log->root_item);
1839 }
1840 return ret;
1841}
1842
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1844{
1912 DEFINE_WAIT(wait); 1845 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1846 int index = transid % 2;
1914 1847
1848 /*
1849 * we only allow two pending log transactions at a time,
1850 * so we know that if ours is more than 2 older than the
1851 * current transaction, we're done
1852 */
1915 do { 1853 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1854 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1855 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1856 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1857 if (root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index]))
1920 schedule(); 1859 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1860 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1861 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1862 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1863 atomic_read(&root->log_commit[index]));
1864 return 0;
1865}
1866
1867static int wait_for_writer(struct btrfs_root *root)
1868{
1869 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers))
1875 schedule();
1876 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait);
1878 }
1925 return 0; 1879 return 0;
1926} 1880}
1927 1881
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1887int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1888 struct btrfs_root *root)
1935{ 1889{
1890 int index1;
1891 int index2;
1936 int ret; 1892 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1893 struct btrfs_root *log = root->log_root;
1894 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1895
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1896 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1897 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1898 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1899 wait_log_commit(root, root->log_transid);
1900 mutex_unlock(&root->log_mutex);
1901 return 0;
1944 } 1902 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1903 atomic_set(&root->log_commit[index1], 1);
1904
1905 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1);
1946 1908
1947 while (1) { 1909 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1910 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1911 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1912 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1913 mutex_lock(&root->log_mutex);
1952 1914 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1915 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1916 break;
1965 } 1917 }
1966 1918
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1920 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1921
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1922 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1923 btrfs_set_root_generation(&log->root_item, trans->transid);
1924 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1925
1926 root->log_batch = 0;
1927 root->log_transid++;
1928 log->log_transid = root->log_transid;
1929 smp_mb();
1930 /*
1931 * log tree has been flushed to disk, new modifications of
1932 * the log will be written to new positions. so it's safe to
1933 * allow log writers to go in.
1934 */
1935 mutex_unlock(&root->log_mutex);
1936
1937 mutex_lock(&log_root_tree->log_mutex);
1938 log_root_tree->log_batch++;
1939 atomic_inc(&log_root_tree->log_writers);
1940 mutex_unlock(&log_root_tree->log_mutex);
1941
1942 ret = update_log_root(trans, log);
1943 BUG_ON(ret);
1944
1945 mutex_lock(&log_root_tree->log_mutex);
1946 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1947 smp_mb();
1948 if (waitqueue_active(&log_root_tree->log_writer_wait))
1949 wake_up(&log_root_tree->log_writer_wait);
1950 }
1951
1952 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out;
1957 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1);
1959
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1962
1963 wait_for_writer(log_root_tree);
1964
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1967 BUG_ON(ret);
1972 1968
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1969 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1970 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1971 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1972 btrfs_header_level(log_root_tree->node));
1973
1974 log_root_tree->log_batch = 0;
1975 log_root_tree->log_transid++;
1976 smp_mb();
1977
1978 mutex_unlock(&log_root_tree->log_mutex);
1979
1980 /*
1981 * nobody else is going to jump in and write the the ctree
1982 * super here because the log_commit atomic below is protecting
1983 * us. We must be called with a transaction handle pinning
1984 * the running transaction open, so a full commit can't hop
1985 * in and cause problems either.
1986 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1988
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1989 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1990 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1992 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1993out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1994 atomic_set(&root->log_commit[index1], 0);
1995 smp_mb();
1996 if (waitqueue_active(&root->log_commit_wait[index1]))
1997 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1998 return 0;
1988} 1999}
1989 2000
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2030 start, end, GFP_NOFS);
2020 } 2031 }
2021 2032
2022 log = root->log_root; 2033 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2034 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2035 &log->root_key);
2025 BUG_ON(ret); 2036 BUG_ON(ret);
2037 }
2026 root->log_root = NULL; 2038 root->log_root = NULL;
2027 kfree(root->log_root); 2039 free_extent_buffer(log->node);
2040 kfree(log);
2028 return 0; 2041 return 0;
2029} 2042}
2030 2043
2031/* 2044/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2045 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2046 * mixed in, we have a few interesting corners:
2056 * 2047 *
@@ -2711,11 +2702,6 @@ next_slot:
2711 2702
2712 btrfs_free_path(path); 2703 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2704 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2705out:
2720 return 0; 2706 return 0;
2721} 2707}
@@ -2846,7 +2832,9 @@ again:
2846 BUG_ON(!wc.replay_dest); 2832 BUG_ON(!wc.replay_dest);
2847 2833
2848 wc.replay_dest->log_root = log; 2834 wc.replay_dest->log_root = log;
2835 mutex_lock(&fs_info->trans_mutex);
2849 btrfs_record_root_in_trans(wc.replay_dest); 2836 btrfs_record_root_in_trans(wc.replay_dest);
2837 mutex_unlock(&fs_info->trans_mutex);
2850 ret = walk_log_tree(trans, log, &wc); 2838 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret); 2839 BUG_ON(ret);
2852 2840
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b187b537888e..dd06e18e5aac 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h> 23#include <asm/div64.h>
25#include "compat.h" 24#include "compat.h"
26#include "ctree.h" 25#include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid) 103 u64 devid, u8 *uuid)
105{ 104{
106 struct btrfs_device *dev; 105 struct btrfs_device *dev;
107 struct list_head *cur;
108 106
109 list_for_each(cur, head) { 107 list_for_each_entry(dev, head, dev_list) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid && 108 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 109 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev; 110 return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
118 115
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 116static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{ 117{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices; 118 struct btrfs_fs_devices *fs_devices;
123 119
124 list_for_each(cur, &fs_uuids) { 120 list_for_each_entry(fs_devices, &fs_uuids, list) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 121 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices; 122 return fs_devices;
128 } 123 }
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
159loop: 154loop:
160 spin_lock(&device->io_lock); 155 spin_lock(&device->io_lock);
161 156
157loop_lock:
162 /* take all the bios off the list at once and process them 158 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the 159 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted 160 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
208 * is now congested. Back off and let other work structs 204 * is now congested. Back off and let other work structs
209 * run instead 205 * run instead
210 */ 206 */
211 if (pending && bdi_write_congested(bdi) && 207 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
212 fs_info->fs_devices->open_devices > 1) { 208 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head; 209 struct bio *old_head;
214 210
@@ -221,6 +217,8 @@ loop:
221 else 217 else
222 device->pending_bio_tail = tail; 218 device->pending_bio_tail = tail;
223 219
220 device->running_pending = 1;
221
224 spin_unlock(&device->io_lock); 222 spin_unlock(&device->io_lock);
225 btrfs_requeue_work(&device->work); 223 btrfs_requeue_work(&device->work);
226 goto done; 224 goto done;
@@ -228,6 +226,11 @@ loop:
228 } 226 }
229 if (again) 227 if (again)
230 goto loop; 228 goto loop;
229
230 spin_lock(&device->io_lock);
231 if (device->pending_bios)
232 goto loop_lock;
233 spin_unlock(&device->io_lock);
231done: 234done:
232 return 0; 235 return 0;
233} 236}
@@ -344,14 +347,11 @@ error:
344 347
345int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 348int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
346{ 349{
347 struct list_head *tmp; 350 struct btrfs_device *device, *next;
348 struct list_head *cur;
349 struct btrfs_device *device;
350 351
351 mutex_lock(&uuid_mutex); 352 mutex_lock(&uuid_mutex);
352again: 353again:
353 list_for_each_safe(cur, tmp, &fs_devices->devices) { 354 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
354 device = list_entry(cur, struct btrfs_device, dev_list);
355 if (device->in_fs_metadata) 355 if (device->in_fs_metadata)
356 continue; 356 continue;
357 357
@@ -382,14 +382,12 @@ again:
382 382
383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
384{ 384{
385 struct list_head *cur;
386 struct btrfs_device *device; 385 struct btrfs_device *device;
387 386
388 if (--fs_devices->opened > 0) 387 if (--fs_devices->opened > 0)
389 return 0; 388 return 0;
390 389
391 list_for_each(cur, &fs_devices->devices) { 390 list_for_each_entry(device, &fs_devices->devices, dev_list) {
392 device = list_entry(cur, struct btrfs_device, dev_list);
393 if (device->bdev) { 391 if (device->bdev) {
394 close_bdev_exclusive(device->bdev, device->mode); 392 close_bdev_exclusive(device->bdev, device->mode);
395 fs_devices->open_devices--; 393 fs_devices->open_devices--;
@@ -438,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
438{ 436{
439 struct block_device *bdev; 437 struct block_device *bdev;
440 struct list_head *head = &fs_devices->devices; 438 struct list_head *head = &fs_devices->devices;
441 struct list_head *cur;
442 struct btrfs_device *device; 439 struct btrfs_device *device;
443 struct block_device *latest_bdev = NULL; 440 struct block_device *latest_bdev = NULL;
444 struct buffer_head *bh; 441 struct buffer_head *bh;
@@ -449,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
449 int seeding = 1; 446 int seeding = 1;
450 int ret = 0; 447 int ret = 0;
451 448
452 list_for_each(cur, head) { 449 list_for_each_entry(device, head, dev_list) {
453 device = list_entry(cur, struct btrfs_device, dev_list);
454 if (device->bdev) 450 if (device->bdev)
455 continue; 451 continue;
456 if (!device->name) 452 if (!device->name)
@@ -577,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
577 *(unsigned long long *)disk_super->fsid, 573 *(unsigned long long *)disk_super->fsid,
578 *(unsigned long long *)(disk_super->fsid + 8)); 574 *(unsigned long long *)(disk_super->fsid + 8));
579 } 575 }
580 printk(KERN_INFO "devid %llu transid %llu %s\n", 576 printk(KERN_CONT "devid %llu transid %llu %s\n",
581 (unsigned long long)devid, (unsigned long long)transid, path); 577 (unsigned long long)devid, (unsigned long long)transid, path);
582 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 578 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
583 579
@@ -1016,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1016 } 1012 }
1017 1013
1018 if (strcmp(device_path, "missing") == 0) { 1014 if (strcmp(device_path, "missing") == 0) {
1019 struct list_head *cur;
1020 struct list_head *devices; 1015 struct list_head *devices;
1021 struct btrfs_device *tmp; 1016 struct btrfs_device *tmp;
1022 1017
1023 device = NULL; 1018 device = NULL;
1024 devices = &root->fs_info->fs_devices->devices; 1019 devices = &root->fs_info->fs_devices->devices;
1025 list_for_each(cur, devices) { 1020 list_for_each_entry(tmp, devices, dev_list) {
1026 tmp = list_entry(cur, struct btrfs_device, dev_list);
1027 if (tmp->in_fs_metadata && !tmp->bdev) { 1021 if (tmp->in_fs_metadata && !tmp->bdev) {
1028 device = tmp; 1022 device = tmp;
1029 break; 1023 break;
@@ -1279,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1279 struct btrfs_trans_handle *trans; 1273 struct btrfs_trans_handle *trans;
1280 struct btrfs_device *device; 1274 struct btrfs_device *device;
1281 struct block_device *bdev; 1275 struct block_device *bdev;
1282 struct list_head *cur;
1283 struct list_head *devices; 1276 struct list_head *devices;
1284 struct super_block *sb = root->fs_info->sb; 1277 struct super_block *sb = root->fs_info->sb;
1285 u64 total_bytes; 1278 u64 total_bytes;
@@ -1303,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1303 mutex_lock(&root->fs_info->volume_mutex); 1296 mutex_lock(&root->fs_info->volume_mutex);
1304 1297
1305 devices = &root->fs_info->fs_devices->devices; 1298 devices = &root->fs_info->fs_devices->devices;
1306 list_for_each(cur, devices) { 1299 list_for_each_entry(device, devices, dev_list) {
1307 device = list_entry(cur, struct btrfs_device, dev_list);
1308 if (device->bdev == bdev) { 1300 if (device->bdev == bdev) {
1309 ret = -EEXIST; 1301 ret = -EEXIST;
1310 goto error; 1302 goto error;
@@ -1382,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1382 ret = btrfs_add_device(trans, root, device); 1374 ret = btrfs_add_device(trans, root, device);
1383 } 1375 }
1384 1376
1377 /*
1378 * we've got more storage, clear any full flags on the space
1379 * infos
1380 */
1381 btrfs_clear_space_info_full(root->fs_info);
1382
1385 unlock_chunks(root); 1383 unlock_chunks(root);
1386 btrfs_commit_transaction(trans, root); 1384 btrfs_commit_transaction(trans, root);
1387 1385
@@ -1467,6 +1465,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1467 device->fs_devices->total_rw_bytes += diff; 1465 device->fs_devices->total_rw_bytes += diff;
1468 1466
1469 device->total_bytes = new_size; 1467 device->total_bytes = new_size;
1468 btrfs_clear_space_info_full(device->dev_root->fs_info);
1469
1470 return btrfs_update_device(trans, device); 1470 return btrfs_update_device(trans, device);
1471} 1471}
1472 1472
@@ -1703,7 +1703,6 @@ static u64 div_factor(u64 num, int factor)
1703int btrfs_balance(struct btrfs_root *dev_root) 1703int btrfs_balance(struct btrfs_root *dev_root)
1704{ 1704{
1705 int ret; 1705 int ret;
1706 struct list_head *cur;
1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1706 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1708 struct btrfs_device *device; 1707 struct btrfs_device *device;
1709 u64 old_size; 1708 u64 old_size;
@@ -1722,8 +1721,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1722 dev_root = dev_root->fs_info->dev_root; 1721 dev_root = dev_root->fs_info->dev_root;
1723 1722
1724 /* step one make some room on all the devices */ 1723 /* step one make some room on all the devices */
1725 list_for_each(cur, devices) { 1724 list_for_each_entry(device, devices, dev_list) {
1726 device = list_entry(cur, struct btrfs_device, dev_list);
1727 old_size = device->total_bytes; 1725 old_size = device->total_bytes;
1728 size_to_free = div_factor(old_size, 1); 1726 size_to_free = div_factor(old_size, 1);
1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1727 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
@@ -2904,10 +2902,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2904 free_extent_map(em); 2902 free_extent_map(em);
2905 } 2903 }
2906 2904
2907 map = kzalloc(sizeof(*map), GFP_NOFS);
2908 if (!map)
2909 return -ENOMEM;
2910
2911 em = alloc_extent_map(GFP_NOFS); 2905 em = alloc_extent_map(GFP_NOFS);
2912 if (!em) 2906 if (!em)
2913 return -ENOMEM; 2907 return -ENOMEM;
@@ -3116,6 +3110,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3116 if (!sb) 3110 if (!sb)
3117 return -ENOMEM; 3111 return -ENOMEM;
3118 btrfs_set_buffer_uptodate(sb); 3112 btrfs_set_buffer_uptodate(sb);
3113 btrfs_set_buffer_lockdep_class(sb, 0);
3114
3119 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3115 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3120 array_size = btrfs_super_sys_array_size(super_copy); 3116 array_size = btrfs_super_sys_array_size(super_copy);
3121 3117
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e270894..a9d3bf4d2689 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "btrfs_inode.h" 26#include "btrfs_inode.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
45 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0); 48 strlen(name), 0);
48 if (!di || IS_ERR(di)) { 49 if (!di) {
49 ret = -ENODATA; 50 ret = -ENODATA;
50 goto out; 51 goto out;
52 } else if (IS_ERR(di)) {
53 ret = PTR_ERR(di);
54 goto out;
51 } 55 }
52 56
53 leaf = path->nodes[0]; 57 leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
62 ret = -ERANGE; 66 ret = -ERANGE;
63 goto out; 67 goto out;
64 } 68 }
69
70 /*
71 * The way things are packed into the leaf is like this
72 * |struct btrfs_dir_item|name|data|
73 * where name is the xattr name, so security.foo, and data is the
74 * content of the xattr. data_ptr points to the location in memory
75 * where the data starts in the in memory leaf
76 */
65 data_ptr = (unsigned long)((char *)(di + 1) + 77 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di)); 78 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr, 79 read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
86 if (!path) 98 if (!path)
87 return -ENOMEM; 99 return -ENOMEM;
88 100
89 trans = btrfs_start_transaction(root, 1); 101 trans = btrfs_join_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode); 102 btrfs_set_trans_block_group(trans, inode);
91 103
92 /* first lets see if we already have this xattr */ 104 /* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0) 189 if (ret < 0)
178 goto err; 190 goto err;
179 ret = 0;
180 advance = 0; 191 advance = 0;
181 while (1) { 192 while (1) {
182 leaf = path->nodes[0]; 193 leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
320 return -EOPNOTSUPP; 331 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322} 333}
334
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
336{
337 int err;
338 size_t len;
339 void *value;
340 char *suffix;
341 char *name;
342
343 err = security_inode_init_security(inode, dir, &suffix, &value, &len);
344 if (err) {
345 if (err == -EOPNOTSUPP)
346 return 0;
347 return err;
348 }
349
350 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
351 GFP_NOFS);
352 if (!name) {
353 err = -ENOMEM;
354 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0);
358 kfree(name);
359 }
360
361 kfree(suffix);
362 kfree(value);
363 return err;
364}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68d..c71e9c3cf3f7 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
40
39#endif /* __XATTR__ */ 41#endif /* __XATTR__ */
diff --git a/fs/buffer.c b/fs/buffer.c
index b58208f1640a..891e1c78e4f1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -760,15 +760,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
760 * If warn is true, then emit a warning if the page is not uptodate and has 760 * If warn is true, then emit a warning if the page is not uptodate and has
761 * not been truncated. 761 * not been truncated.
762 */ 762 */
763static int __set_page_dirty(struct page *page, 763static void __set_page_dirty(struct page *page,
764 struct address_space *mapping, int warn) 764 struct address_space *mapping, int warn)
765{ 765{
766 if (unlikely(!mapping))
767 return !TestSetPageDirty(page);
768
769 if (TestSetPageDirty(page))
770 return 0;
771
772 spin_lock_irq(&mapping->tree_lock); 766 spin_lock_irq(&mapping->tree_lock);
773 if (page->mapping) { /* Race with truncate? */ 767 if (page->mapping) { /* Race with truncate? */
774 WARN_ON_ONCE(warn && !PageUptodate(page)); 768 WARN_ON_ONCE(warn && !PageUptodate(page));
@@ -777,6 +771,7 @@ static int __set_page_dirty(struct page *page,
777 __inc_zone_page_state(page, NR_FILE_DIRTY); 771 __inc_zone_page_state(page, NR_FILE_DIRTY);
778 __inc_bdi_stat(mapping->backing_dev_info, 772 __inc_bdi_stat(mapping->backing_dev_info,
779 BDI_RECLAIMABLE); 773 BDI_RECLAIMABLE);
774 task_dirty_inc(current);
780 task_io_account_write(PAGE_CACHE_SIZE); 775 task_io_account_write(PAGE_CACHE_SIZE);
781 } 776 }
782 radix_tree_tag_set(&mapping->page_tree, 777 radix_tree_tag_set(&mapping->page_tree,
@@ -784,8 +779,6 @@ static int __set_page_dirty(struct page *page,
784 } 779 }
785 spin_unlock_irq(&mapping->tree_lock); 780 spin_unlock_irq(&mapping->tree_lock);
786 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 781 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
787
788 return 1;
789} 782}
790 783
791/* 784/*
@@ -815,6 +808,7 @@ static int __set_page_dirty(struct page *page,
815 */ 808 */
816int __set_page_dirty_buffers(struct page *page) 809int __set_page_dirty_buffers(struct page *page)
817{ 810{
811 int newly_dirty;
818 struct address_space *mapping = page_mapping(page); 812 struct address_space *mapping = page_mapping(page);
819 813
820 if (unlikely(!mapping)) 814 if (unlikely(!mapping))
@@ -830,9 +824,12 @@ int __set_page_dirty_buffers(struct page *page)
830 bh = bh->b_this_page; 824 bh = bh->b_this_page;
831 } while (bh != head); 825 } while (bh != head);
832 } 826 }
827 newly_dirty = !TestSetPageDirty(page);
833 spin_unlock(&mapping->private_lock); 828 spin_unlock(&mapping->private_lock);
834 829
835 return __set_page_dirty(page, mapping, 1); 830 if (newly_dirty)
831 __set_page_dirty(page, mapping, 1);
832 return newly_dirty;
836} 833}
837EXPORT_SYMBOL(__set_page_dirty_buffers); 834EXPORT_SYMBOL(__set_page_dirty_buffers);
838 835
@@ -1261,8 +1258,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
1261 return; 1258 return;
1262 } 1259 }
1263 1260
1264 if (!test_set_buffer_dirty(bh)) 1261 if (!test_set_buffer_dirty(bh)) {
1265 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); 1262 struct page *page = bh->b_page;
1263 if (!TestSetPageDirty(page))
1264 __set_page_dirty(page, page_mapping(page), 0);
1265 }
1266} 1266}
1267 1267
1268/* 1268/*
@@ -2688,7 +2688,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
2688 struct buffer_head *bh; 2688 struct buffer_head *bh;
2689 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2689 BUG_ON(fsdata != NULL && page_has_buffers(page));
2690 2690
2691 if (unlikely(copied < len) && !page_has_buffers(page)) 2691 if (unlikely(copied < len) && head)
2692 attach_nobh_buffers(page, head); 2692 attach_nobh_buffers(page, head);
2693 if (page_has_buffers(page)) 2693 if (page_has_buffers(page))
2694 return generic_write_end(file, mapping, pos, len, 2694 return generic_write_end(file, mapping, pos, len,
@@ -3108,7 +3108,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3108 if (test_clear_buffer_dirty(bh)) { 3108 if (test_clear_buffer_dirty(bh)) {
3109 get_bh(bh); 3109 get_bh(bh);
3110 bh->b_end_io = end_buffer_write_sync; 3110 bh->b_end_io = end_buffer_write_sync;
3111 ret = submit_bh(WRITE_SYNC, bh); 3111 ret = submit_bh(WRITE, bh);
3112 wait_on_buffer(bh); 3112 wait_on_buffer(bh);
3113 if (buffer_eopnotsupp(bh)) { 3113 if (buffer_eopnotsupp(bh)) {
3114 clear_buffer_eopnotsupp(bh); 3114 clear_buffer_eopnotsupp(bh);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 080703a15f44..65984006192c 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,22 @@
1Version 1.57
2------------
3Improve support for multiple security contexts to the same server. We
4used to use the same "vcnumber" for all connections which could cause
5the server to treat subsequent connections, especially those that
6are authenticated as guest, as reconnections, invalidating the earlier
7user's smb session. This fix allows cifs to mount multiple times to the
8same server with different userids without risking invalidating earlier
9established security contexts. fsync now sends SMB Flush operation
10to better ensure that we wait for server to write all of the data to
11server disk (not just write it over the network). Add new mount
12parameter to allow user to disable sending the (slow) SMB flush on
13fsync if desired (fsync still flushes all cached write data to the server).
14Posix file open support added (turned off after one attempt if server
15fails to support it properly, as with Samba server versions prior to 3.3.2)
16Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
17little memory for the "nativeFileSystem" field returned by the server
18during mount).
19
1Version 1.56 20Version 1.56
2------------ 21------------
3Add "forcemandatorylock" mount option to allow user to use mandatory 22Add "forcemandatorylock" mount option to allow user to use mandatory
@@ -5,7 +24,12 @@ rather than posix (advisory) byte range locks, even though server would
5support posix byte range locks. Fix query of root inode when prefixpath 24support posix byte range locks. Fix query of root inode when prefixpath
6specified and user does not have access to query information about the 25specified and user does not have access to query information about the
7top of the share. Fix problem in 2.6.28 resolving DFS paths to 26top of the share. Fix problem in 2.6.28 resolving DFS paths to
8Samba servers (worked to Windows). 27Samba servers (worked to Windows). Fix rmdir so that pending search
28(readdir) requests do not get invalid results which include the now
29removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
30when using DFS. Add better file create support to servers which support
31the CIFS POSIX protocol extensions (this adds support for new flags
32on create, and improves semantics for write of locked ranges).
9 33
10Version 1.55 34Version 1.55
11------------ 35------------
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 341a98965bd0..6994a0f54f02 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -118,6 +118,18 @@ config CIFS_DEBUG2
118 option can be turned off unless you are debugging 118 option can be turned off unless you are debugging
119 cifs problems. If unsure, say N. 119 cifs problems. If unsure, say N.
120 120
121config CIFS_DFS_UPCALL
122 bool "DFS feature support"
123 depends on CIFS && KEYS
124 help
125 Distributed File System (DFS) support is used to access shares
126 transparently in an enterprise name space, even if the share
127 moves to a different server. This feature also enables
128 an upcall mechanism for CIFS which contacts userspace helper
129 utilities to provide server name resolution (host names to
130 IP addresses) which is needed for implicit mounts of DFS junction
131 points. If unsure, say N.
132
121config CIFS_EXPERIMENTAL 133config CIFS_EXPERIMENTAL
122 bool "CIFS Experimental Features (EXPERIMENTAL)" 134 bool "CIFS Experimental Features (EXPERIMENTAL)"
123 depends on CIFS && EXPERIMENTAL 135 depends on CIFS && EXPERIMENTAL
@@ -131,12 +143,3 @@ config CIFS_EXPERIMENTAL
131 (which is disabled by default). See the file fs/cifs/README 143 (which is disabled by default). See the file fs/cifs/README
132 for more details. If unsure, say N. 144 for more details. If unsure, say N.
133 145
134config CIFS_DFS_UPCALL
135 bool "DFS feature support (EXPERIMENTAL)"
136 depends on CIFS_EXPERIMENTAL
137 depends on KEYS
138 help
139 Enables an upcall mechanism for CIFS which contacts userspace
140 helper utilities to provide server name resolution (host names to
141 IP addresses) which is needed for implicit mounts of DFS junction
142 points. If unsure, say N.
diff --git a/fs/cifs/README b/fs/cifs/README
index da4515e3be20..07434181623b 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -472,6 +472,19 @@ A partial list of the supported mount options follows:
472 even if the cifs server would support posix advisory locks. 472 even if the cifs server would support posix advisory locks.
473 "forcemand" is accepted as a shorter form of this mount 473 "forcemand" is accepted as a shorter form of this mount
474 option. 474 option.
475 nostrictsync If this mount option is set, when an application does an
476 fsync call then the cifs client does not send an SMB Flush
477 to the server (to force the server to write all dirty data
478 for this file immediately to disk), although cifs still sends
479 all dirty (cached) file data to the server and waits for the
480 server to respond to the write. Since SMB Flush can be
481 very slow, and some servers may be reliable enough (to risk
482 delaying slightly flushing the data to disk on the server),
483 turning on this option may be useful to improve performance for
484 applications that fsync too much, at a small risk of server
485 crash. If this mount option is not set, by default cifs will
486 send an SMB flush request (and wait for a response) on every
487 fsync call.
475 nodfs Disable DFS (global name space support) even if the 488 nodfs Disable DFS (global name space support) even if the
476 server claims to support it. This can help work around 489 server claims to support it. This can help work around
477 a problem with parsing of DFS paths with Samba server 490 a problem with parsing of DFS paths with Samba server
@@ -692,13 +705,14 @@ require this helper. Note that NTLMv2 security (which does not require the
692cifs.upcall helper program), instead of using Kerberos, is sufficient for 705cifs.upcall helper program), instead of using Kerberos, is sufficient for
693some use cases. 706some use cases.
694 707
695Enabling DFS support (used to access shares transparently in an MS-DFS 708DFS support allows transparent redirection to shares in an MS-DFS name space.
696global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In 709In addition, DFS support for target shares which are specified as UNC
697addition, DFS support for target shares which are specified as UNC
698names which begin with host names (rather than IP addresses) requires 710names which begin with host names (rather than IP addresses) requires
699a user space helper (such as cifs.upcall) to be present in order to 711a user space helper (such as cifs.upcall) to be present in order to
700translate host names to ip address, and the user space helper must also 712translate host names to ip address, and the user space helper must also
701be configured in the file /etc/request-key.conf 713be configured in the file /etc/request-key.conf. Samba, Windows servers and
714many NAS appliances support DFS as a way of constructing a global name
715space to ease network configuration and improve reliability.
702 716
703To use cifs Kerberos and DFS support, the Linux keyutils package should be 717To use cifs Kerberos and DFS support, the Linux keyutils package should be
704installed and something like the following lines should be added to the 718installed and something like the following lines should be added to the
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 490e34bbf27a..877e4d9a1159 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -340,6 +340,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
340 seq_printf(m, "\nWrites: %d Bytes: %lld", 340 seq_printf(m, "\nWrites: %d Bytes: %lld",
341 atomic_read(&tcon->num_writes), 341 atomic_read(&tcon->num_writes),
342 (long long)(tcon->bytes_written)); 342 (long long)(tcon->bytes_written));
343 seq_printf(m, "\nFlushes: %d",
344 atomic_read(&tcon->num_flushes));
343 seq_printf(m, "\nLocks: %d HardLinks: %d " 345 seq_printf(m, "\nLocks: %d HardLinks: %d "
344 "Symlinks: %d", 346 "Symlinks: %d",
345 atomic_read(&tcon->num_locks), 347 atomic_read(&tcon->num_locks),
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 85c0a74d034d..5fdbf8a14472 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -104,9 +104,9 @@ static char *cifs_get_share_name(const char *node_name)
104 104
105 105
106/** 106/**
107 * compose_mount_options - creates mount options for refferral 107 * cifs_compose_mount_options - creates mount options for refferral
108 * @sb_mountdata: parent/root DFS mount options (template) 108 * @sb_mountdata: parent/root DFS mount options (template)
109 * @dentry: point where we are going to mount 109 * @fullpath: full path in UNC format
110 * @ref: server's referral 110 * @ref: server's referral
111 * @devname: pointer for saving device name 111 * @devname: pointer for saving device name
112 * 112 *
@@ -116,8 +116,8 @@ static char *cifs_get_share_name(const char *node_name)
116 * Returns: pointer to new mount options or ERR_PTR. 116 * Returns: pointer to new mount options or ERR_PTR.
117 * Caller is responcible for freeing retunrned value if it is not error. 117 * Caller is responcible for freeing retunrned value if it is not error.
118 */ 118 */
119static char *compose_mount_options(const char *sb_mountdata, 119char *cifs_compose_mount_options(const char *sb_mountdata,
120 struct dentry *dentry, 120 const char *fullpath,
121 const struct dfs_info3_param *ref, 121 const struct dfs_info3_param *ref,
122 char **devname) 122 char **devname)
123{ 123{
@@ -128,7 +128,6 @@ static char *compose_mount_options(const char *sb_mountdata,
128 char *srvIP = NULL; 128 char *srvIP = NULL;
129 char sep = ','; 129 char sep = ',';
130 int off, noff; 130 int off, noff;
131 char *fullpath;
132 131
133 if (sb_mountdata == NULL) 132 if (sb_mountdata == NULL)
134 return ERR_PTR(-EINVAL); 133 return ERR_PTR(-EINVAL);
@@ -202,17 +201,6 @@ static char *compose_mount_options(const char *sb_mountdata,
202 goto compose_mount_options_err; 201 goto compose_mount_options_err;
203 } 202 }
204 203
205 /*
206 * this function gives us a path with a double backslash prefix. We
207 * require a single backslash for DFS. Temporarily increment fullpath
208 * to put it in the proper form and decrement before freeing it.
209 */
210 fullpath = build_path_from_dentry(dentry);
211 if (!fullpath) {
212 rc = -ENOMEM;
213 goto compose_mount_options_err;
214 }
215 ++fullpath;
216 tkn_e = strchr(tkn_e + 1, '\\'); 204 tkn_e = strchr(tkn_e + 1, '\\');
217 if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { 205 if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
218 strncat(mountdata, &sep, 1); 206 strncat(mountdata, &sep, 1);
@@ -221,8 +209,6 @@ static char *compose_mount_options(const char *sb_mountdata,
221 strcat(mountdata, tkn_e + 1); 209 strcat(mountdata, tkn_e + 1);
222 strcat(mountdata, fullpath + ref->path_consumed); 210 strcat(mountdata, fullpath + ref->path_consumed);
223 } 211 }
224 --fullpath;
225 kfree(fullpath);
226 212
227 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 213 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
228 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 214 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -245,10 +231,20 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
245 struct vfsmount *mnt; 231 struct vfsmount *mnt;
246 char *mountdata; 232 char *mountdata;
247 char *devname = NULL; 233 char *devname = NULL;
234 char *fullpath;
248 235
249 cifs_sb = CIFS_SB(dentry->d_inode->i_sb); 236 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
250 mountdata = compose_mount_options(cifs_sb->mountdata, 237 /*
251 dentry, ref, &devname); 238 * this function gives us a path with a double backslash prefix. We
239 * require a single backslash for DFS.
240 */
241 fullpath = build_path_from_dentry(dentry);
242 if (!fullpath)
243 return ERR_PTR(-ENOMEM);
244
245 mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
246 fullpath + 1, ref, &devname);
247 kfree(fullpath);
252 248
253 if (IS_ERR(mountdata)) 249 if (IS_ERR(mountdata))
254 return (struct vfsmount *)mountdata; 250 return (struct vfsmount *)mountdata;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c4c306f7b06f..4797787c6a44 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -32,6 +32,7 @@
32#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */ 32#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */
33#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ 33#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
34#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 34#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
35#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
35 36
36struct cifs_sb_info { 37struct cifs_sb_info {
37 struct cifsTconInfo *tcon; /* primary mount */ 38 struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d4839cf0cb2c..7c9809523f42 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
48 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 48 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
49 return -EINVAL; 49 return -EINVAL;
50 50
51 MD5Init(&context); 51 cifs_MD5_init(&context);
52 MD5Update(&context, (char *)&key->data, key->len); 52 cifs_MD5_update(&context, (char *)&key->data, key->len);
53 MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 53 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
54 54
55 MD5Final(signature, &context); 55 cifs_MD5_final(signature, &context);
56 return 0; 56 return 0;
57} 57}
58 58
@@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
96 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 96 if ((iov == NULL) || (signature == NULL) || (key == NULL))
97 return -EINVAL; 97 return -EINVAL;
98 98
99 MD5Init(&context); 99 cifs_MD5_init(&context);
100 MD5Update(&context, (char *)&key->data, key->len); 100 cifs_MD5_update(&context, (char *)&key->data, key->len);
101 for (i = 0; i < n_vec; i++) { 101 for (i = 0; i < n_vec; i++) {
102 if (iov[i].iov_len == 0) 102 if (iov[i].iov_len == 0)
103 continue; 103 continue;
@@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
110 if (i == 0) { 110 if (i == 0) {
111 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 111 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
112 break; /* nothing to sign or corrupt header */ 112 break; /* nothing to sign or corrupt header */
113 MD5Update(&context, iov[0].iov_base+4, 113 cifs_MD5_update(&context, iov[0].iov_base+4,
114 iov[0].iov_len-4); 114 iov[0].iov_len-4);
115 } else 115 } else
116 MD5Update(&context, iov[i].iov_base, iov[i].iov_len); 116 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
117 } 117 }
118 118
119 MD5Final(signature, &context); 119 cifs_MD5_final(signature, &context);
120 120
121 return 0; 121 return 0;
122} 122}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac481841f87..2b1d28a9ee28 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
102 102
103#define CIFS_VERSION "1.56" 103#define CIFS_VERSION "1.57"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0ec953..9fbf4dff5da6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
164 /* multiplexed reads or writes */ 164 /* multiplexed reads or writes */
165 unsigned int maxBuf; /* maxBuf specifies the maximum */ 165 unsigned int maxBuf; /* maxBuf specifies the maximum */
166 /* message size the server can send or receive for non-raw SMBs */ 166 /* message size the server can send or receive for non-raw SMBs */
167 unsigned int maxRw; /* maxRw specifies the maximum */ 167 unsigned int max_rw; /* maxRw specifies the maximum */
168 /* message size the server can send or receive for */ 168 /* message size the server can send or receive for */
169 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 169 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
170 unsigned int max_vcs; /* maximum number of smb sessions, at least
171 those that can be specified uniquely with
172 vcnumbers */
170 char sessid[4]; /* unique token id for this session */ 173 char sessid[4]; /* unique token id for this session */
171 /* (returned on Negotiate */ 174 /* (returned on Negotiate */
172 int capabilities; /* allow selective disabling of caps by smb sess */ 175 int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
210 unsigned overrideSecFlg; /* if non-zero override global sec flags */ 213 unsigned overrideSecFlg; /* if non-zero override global sec flags */
211 __u16 ipc_tid; /* special tid for connection to IPC share */ 214 __u16 ipc_tid; /* special tid for connection to IPC share */
212 __u16 flags; 215 __u16 flags;
216 __u16 vcnum;
213 char *serverOS; /* name of operating system underlying server */ 217 char *serverOS; /* name of operating system underlying server */
214 char *serverNOS; /* name of network operating system of server */ 218 char *serverNOS; /* name of network operating system of server */
215 char *serverDomain; /* security realm of server */ 219 char *serverDomain; /* security realm of server */
@@ -250,6 +254,7 @@ struct cifsTconInfo {
250 atomic_t num_smbs_sent; 254 atomic_t num_smbs_sent;
251 atomic_t num_writes; 255 atomic_t num_writes;
252 atomic_t num_reads; 256 atomic_t num_reads;
257 atomic_t num_flushes;
253 atomic_t num_oplock_brks; 258 atomic_t num_oplock_brks;
254 atomic_t num_opens; 259 atomic_t num_opens;
255 atomic_t num_closes; 260 atomic_t num_closes;
@@ -294,6 +299,7 @@ struct cifsTconInfo {
294 bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol 299 bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol
295 for this mount even if server would support */ 300 for this mount even if server would support */
296 bool local_lease:1; /* check leases (only) on local system not remote */ 301 bool local_lease:1; /* check leases (only) on local system not remote */
302 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
297 bool need_reconnect:1; /* connection reset, tid now invalid */ 303 bool need_reconnect:1; /* connection reset, tid now invalid */
298 /* BB add field for back pointer to sb struct(s)? */ 304 /* BB add field for back pointer to sb struct(s)? */
299}; 305};
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b4e2e9f0ee3d..b370489c8da5 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifspdu.h 2 * fs/cifs/cifspdu.h
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2008 4 * Copyright (c) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include "smbfsctl.h"
26 27
27#ifdef CONFIG_CIFS_WEAK_PW_HASH 28#ifdef CONFIG_CIFS_WEAK_PW_HASH
28#define LANMAN_PROT 0 29#define LANMAN_PROT 0
@@ -34,15 +35,15 @@
34#define POSIX_PROT (CIFS_PROT+1) 35#define POSIX_PROT (CIFS_PROT+1)
35#define BAD_PROT 0xFFFF 36#define BAD_PROT 0xFFFF
36 37
37/* SMB command codes */ 38/* SMB command codes:
38/* 39 * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses
39 * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
40 * (ie which include no useful data other than the SMB error code itself). 40 * (ie which include no useful data other than the SMB error code itself).
41 * Knowing this helps avoid response buffer allocations and copy in some cases 41 * This can allow us to avoid response buffer allocations and copy in some cases
42 */ 42 */
43#define SMB_COM_CREATE_DIRECTORY 0x00 /* trivial response */ 43#define SMB_COM_CREATE_DIRECTORY 0x00 /* trivial response */
44#define SMB_COM_DELETE_DIRECTORY 0x01 /* trivial response */ 44#define SMB_COM_DELETE_DIRECTORY 0x01 /* trivial response */
45#define SMB_COM_CLOSE 0x04 /* triv req/rsp, timestamp ignored */ 45#define SMB_COM_CLOSE 0x04 /* triv req/rsp, timestamp ignored */
46#define SMB_COM_FLUSH 0x05 /* triv req/rsp */
46#define SMB_COM_DELETE 0x06 /* trivial response */ 47#define SMB_COM_DELETE 0x06 /* trivial response */
47#define SMB_COM_RENAME 0x07 /* trivial response */ 48#define SMB_COM_RENAME 0x07 /* trivial response */
48#define SMB_COM_QUERY_INFORMATION 0x08 /* aka getattr */ 49#define SMB_COM_QUERY_INFORMATION 0x08 /* aka getattr */
@@ -790,6 +791,12 @@ typedef struct smb_com_close_rsp {
790 __u16 ByteCount; /* bct = 0 */ 791 __u16 ByteCount; /* bct = 0 */
791} __attribute__((packed)) CLOSE_RSP; 792} __attribute__((packed)) CLOSE_RSP;
792 793
794typedef struct smb_com_flush_req {
795 struct smb_hdr hdr; /* wct = 1 */
796 __u16 FileID;
797 __u16 ByteCount; /* 0 */
798} __attribute__((packed)) FLUSH_REQ;
799
793typedef struct smb_com_findclose_req { 800typedef struct smb_com_findclose_req {
794 struct smb_hdr hdr; /* wct = 1 */ 801 struct smb_hdr hdr; /* wct = 1 */
795 __u16 FileID; 802 __u16 FileID;
@@ -1924,19 +1931,19 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
1924#define DFS_TYPE_ROOT 0x0001 1931#define DFS_TYPE_ROOT 0x0001
1925 1932
1926/* Referral Entry Flags */ 1933/* Referral Entry Flags */
1927#define DFS_NAME_LIST_REF 0x0200 1934#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */
1935#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */
1928 1936
1929typedef struct dfs_referral_level_3 { 1937typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */
1930 __le16 VersionNumber; 1938 __le16 VersionNumber; /* must be 3 or 4 */
1931 __le16 Size; 1939 __le16 Size;
1932 __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */ 1940 __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
1933 __le16 ReferralEntryFlags; /* 0x0200 bit set only for domain 1941 __le16 ReferralEntryFlags;
1934 or DC referral responce */
1935 __le32 TimeToLive; 1942 __le32 TimeToLive;
1936 __le16 DfsPathOffset; 1943 __le16 DfsPathOffset;
1937 __le16 DfsAlternatePathOffset; 1944 __le16 DfsAlternatePathOffset;
1938 __le16 NetworkAddressOffset; /* offset of the link target */ 1945 __le16 NetworkAddressOffset; /* offset of the link target */
1939 __le16 ServiceSiteGuid; 1946 __u8 ServiceSiteGuid[16]; /* MBZ, ignored */
1940} __attribute__((packed)) REFERRAL3; 1947} __attribute__((packed)) REFERRAL3;
1941 1948
1942typedef struct smb_com_transaction_get_dfs_refer_rsp { 1949typedef struct smb_com_transaction_get_dfs_refer_rsp {
@@ -1946,48 +1953,15 @@ typedef struct smb_com_transaction_get_dfs_refer_rsp {
1946 __u8 Pad; 1953 __u8 Pad;
1947 __le16 PathConsumed; 1954 __le16 PathConsumed;
1948 __le16 NumberOfReferrals; 1955 __le16 NumberOfReferrals;
1949 __le16 DFSFlags; 1956 __le32 DFSFlags;
1950 __u16 Pad2;
1951 REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ 1957 REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */
1952 /* followed by the strings pointed to by the referral structures */ 1958 /* followed by the strings pointed to by the referral structures */
1953} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP; 1959} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP;
1954 1960
1955/* DFS Flags */ 1961/* DFS Flags */
1956#define DFSREF_REFERRAL_SERVER 0x0001 1962#define DFSREF_REFERRAL_SERVER 0x00000001 /* all targets are DFS roots */
1957#define DFSREF_STORAGE_SERVER 0x0002 1963#define DFSREF_STORAGE_SERVER 0x00000002 /* no further ref requests needed */
1958 1964#define DFSREF_TARGET_FAILBACK 0x00000004 /* only for DFS referral version 4 */
1959/* IOCTL information */
1960/*
1961 * List of ioctl function codes that look to be of interest to remote clients
1962 * like this one. Need to do some experimentation to make sure they all work
1963 * remotely. Some of the following, such as the encryption/compression ones
1964 * would be invoked from tools via a specialized hook into the VFS rather
1965 * than via the standard vfs entry points
1966 */
1967#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
1968#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
1969#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
1970#define FSCTL_LOCK_VOLUME 0x00090018
1971#define FSCTL_UNLOCK_VOLUME 0x0009001C
1972#define FSCTL_GET_COMPRESSION 0x0009003C
1973#define FSCTL_SET_COMPRESSION 0x0009C040
1974#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
1975#define FSCTL_FILESYS_GET_STATISTICS 0x00090090
1976#define FSCTL_SET_REPARSE_POINT 0x000900A4
1977#define FSCTL_GET_REPARSE_POINT 0x000900A8
1978#define FSCTL_DELETE_REPARSE_POINT 0x000900AC
1979#define FSCTL_SET_SPARSE 0x000900C4
1980#define FSCTL_SET_ZERO_DATA 0x000900C8
1981#define FSCTL_SET_ENCRYPTION 0x000900D7
1982#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB
1983#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF
1984#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3
1985#define FSCTL_SIS_COPYFILE 0x00090100
1986#define FSCTL_SIS_LINK_FILES 0x0009C104
1987
1988#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
1989#define IO_REPARSE_TAG_HSM 0xC0000004
1990#define IO_REPARSE_TAG_SIS 0x80000007
1991 1965
1992/* 1966/*
1993 ************************************************************************ 1967 ************************************************************************
@@ -2508,8 +2482,6 @@ struct data_blob {
2508 6) Use nanosecond timestamps throughout all time fields if 2482 6) Use nanosecond timestamps throughout all time fields if
2509 corresponding attribute flag is set 2483 corresponding attribute flag is set
2510 7) sendfile - handle based copy 2484 7) sendfile - handle based copy
2511 8) Direct i/o
2512 9) Misc fcntls?
2513 2485
2514 what about fixing 64 bit alignment 2486 what about fixing 64 bit alignment
2515 2487
@@ -2628,7 +2600,5 @@ typedef struct file_chattr_info {
2628 __le64 mode; /* list of actual attribute bits on this inode */ 2600 __le64 mode; /* list of actual attribute bits on this inode */
2629} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes 2601} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes
2630 (chattr, chflags) level 0x206 */ 2602 (chattr, chflags) level 0x206 */
2631 2603#endif /* POSIX */
2632#endif
2633
2634#endif /* _CIFSPDU_H */ 2604#endif /* _CIFSPDU_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 06f6779988bf..4167716d32f2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -35,14 +35,18 @@ extern struct smb_hdr *cifs_buf_get(void);
35extern void cifs_buf_release(void *); 35extern void cifs_buf_release(void *);
36extern struct smb_hdr *cifs_small_buf_get(void); 36extern struct smb_hdr *cifs_small_buf_get(void);
37extern void cifs_small_buf_release(void *); 37extern void cifs_small_buf_release(void *);
38extern int smb_send(struct socket *, struct smb_hdr *, 38extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */ , struct sockaddr *, bool); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
44extern char *build_path_from_dentry(struct dentry *); 44extern char *build_path_from_dentry(struct dentry *);
45extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
45extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 46extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
47extern char *cifs_compose_mount_options(const char *sb_mountdata,
48 const char *fullpath, const struct dfs_info3_param *ref,
49 char **devname);
46/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 50/* extern void renew_parental_timestamps(struct dentry *direntry);*/
47extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 51extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
48 struct smb_hdr * /* input */ , 52 struct smb_hdr * /* input */ ,
@@ -91,6 +95,12 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
91extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); 95extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
92extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); 96extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
93 97
98extern int cifs_posix_open(char *full_path, struct inode **pinode,
99 struct super_block *sb, int mode, int oflags,
100 int *poplock, __u16 *pnetfid, int xid);
101extern void posix_fill_in_inode(struct inode *tmp_inode,
102 FILE_UNIX_BASIC_INFO *pData, int isNewInode);
103extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
94extern int cifs_get_inode_info(struct inode **pinode, 104extern int cifs_get_inode_info(struct inode **pinode,
95 const unsigned char *search_path, 105 const unsigned char *search_path,
96 FILE_ALL_INFO *pfile_info, 106 FILE_ALL_INFO *pfile_info,
@@ -277,6 +287,9 @@ extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
277extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, 287extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
278 const int smb_file_id); 288 const int smb_file_id);
279 289
290extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
291 const int smb_file_id);
292
280extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, 293extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
281 const int netfid, unsigned int count, 294 const int netfid, unsigned int count,
282 const __u64 lseek, unsigned int *nbytes, char **buf, 295 const __u64 lseek, unsigned int *nbytes, char **buf,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a507c4..bc09c998631f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
528 server->maxReq = le16_to_cpu(rsp->MaxMpxCount); 528 server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
529 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 529 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
530 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 530 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
531 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
531 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); 532 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
532 /* even though we do not use raw we might as well set this 533 /* even though we do not use raw we might as well set this
533 accurately, in case we ever find a need for it */ 534 accurately, in case we ever find a need for it */
534 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 535 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
535 server->maxRw = 0xFF00; 536 server->max_rw = 0xFF00;
536 server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; 537 server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
537 } else { 538 } else {
538 server->maxRw = 0;/* we do not need to use raw anyway */ 539 server->max_rw = 0;/* do not need to use raw anyway */
539 server->capabilities = CAP_MPX_MODE; 540 server->capabilities = CAP_MPX_MODE;
540 } 541 }
541 tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); 542 tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
638 /* probably no need to store and check maxvcs */ 639 /* probably no need to store and check maxvcs */
639 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 640 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
640 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 641 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
641 server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); 642 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
642 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); 643 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
643 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 644 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
644 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 645 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -1933,6 +1934,27 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1933} 1934}
1934 1935
1935int 1936int
1937CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1938{
1939 int rc = 0;
1940 FLUSH_REQ *pSMB = NULL;
1941 cFYI(1, ("In CIFSSMBFlush"));
1942
1943 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
1944 if (rc)
1945 return rc;
1946
1947 pSMB->FileID = (__u16) smb_file_id;
1948 pSMB->ByteCount = 0;
1949 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
1950 cifs_stats_inc(&tcon->num_flushes);
1951 if (rc)
1952 cERROR(1, ("Send error in Flush = %d", rc));
1953
1954 return rc;
1955}
1956
1957int
1936CIFSSMBRename(const int xid, struct cifsTconInfo *tcon, 1958CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
1937 const char *fromName, const char *toName, 1959 const char *fromName, const char *toName,
1938 const struct nls_table *nls_codepage, int remap) 1960 const struct nls_table *nls_codepage, int remap)
@@ -2355,8 +2377,10 @@ winCreateHardLinkRetry:
2355 PATH_MAX, nls_codepage, remap); 2377 PATH_MAX, nls_codepage, remap);
2356 name_len++; /* trailing null */ 2378 name_len++; /* trailing null */
2357 name_len *= 2; 2379 name_len *= 2;
2358 pSMB->OldFileName[name_len] = 0; /* pad */ 2380
2359 pSMB->OldFileName[name_len + 1] = 0x04; 2381 /* protocol specifies ASCII buffer format (0x04) for unicode */
2382 pSMB->OldFileName[name_len] = 0x04;
2383 pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
2360 name_len2 = 2384 name_len2 =
2361 cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], 2385 cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
2362 toName, PATH_MAX, nls_codepage, remap); 2386 toName, PATH_MAX, nls_codepage, remap);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9ea394ee075..0de3b5615a22 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/ipv6.h>
27#include <linux/pagemap.h> 26#include <linux/pagemap.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
29#include <linux/utsname.h> 28#include <linux/utsname.h>
@@ -35,6 +34,7 @@
35#include <linux/freezer.h> 34#include <linux/freezer.h>
36#include <asm/uaccess.h> 35#include <asm/uaccess.h>
37#include <asm/processor.h> 36#include <asm/processor.h>
37#include <net/ipv6.h>
38#include "cifspdu.h" 38#include "cifspdu.h"
39#include "cifsglob.h" 39#include "cifsglob.h"
40#include "cifsproto.h" 40#include "cifsproto.h"
@@ -95,6 +95,7 @@ struct smb_vol {
95 bool local_lease:1; /* check leases only on local system, not remote */ 95 bool local_lease:1; /* check leases only on local system, not remote */
96 bool noblocksnd:1; 96 bool noblocksnd:1;
97 bool noautotune:1; 97 bool noautotune:1;
98 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
98 unsigned int rsize; 99 unsigned int rsize;
99 unsigned int wsize; 100 unsigned int wsize;
100 unsigned int sockopt; 101 unsigned int sockopt;
@@ -1274,6 +1275,10 @@ cifs_parse_mount_options(char *options, const char *devname,
1274 vol->intr = 0; 1275 vol->intr = 0;
1275 } else if (strnicmp(data, "intr", 4) == 0) { 1276 } else if (strnicmp(data, "intr", 4) == 0) {
1276 vol->intr = 1; 1277 vol->intr = 1;
1278 } else if (strnicmp(data, "nostrictsync", 12) == 0) {
1279 vol->nostrictsync = 1;
1280 } else if (strnicmp(data, "strictsync", 10) == 0) {
1281 vol->nostrictsync = 0;
1277 } else if (strnicmp(data, "serverino", 7) == 0) { 1282 } else if (strnicmp(data, "serverino", 7) == 0) {
1278 vol->server_ino = 1; 1283 vol->server_ino = 1;
1279 } else if (strnicmp(data, "noserverino", 9) == 0) { 1284 } else if (strnicmp(data, "noserverino", 9) == 0) {
@@ -1354,7 +1359,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1354} 1359}
1355 1360
1356static struct TCP_Server_Info * 1361static struct TCP_Server_Info *
1357cifs_find_tcp_session(struct sockaddr *addr) 1362cifs_find_tcp_session(struct sockaddr_storage *addr)
1358{ 1363{
1359 struct list_head *tmp; 1364 struct list_head *tmp;
1360 struct TCP_Server_Info *server; 1365 struct TCP_Server_Info *server;
@@ -1374,13 +1379,13 @@ cifs_find_tcp_session(struct sockaddr *addr)
1374 if (server->tcpStatus == CifsNew) 1379 if (server->tcpStatus == CifsNew)
1375 continue; 1380 continue;
1376 1381
1377 if (addr->sa_family == AF_INET && 1382 if (addr->ss_family == AF_INET &&
1378 (addr4->sin_addr.s_addr != 1383 (addr4->sin_addr.s_addr !=
1379 server->addr.sockAddr.sin_addr.s_addr)) 1384 server->addr.sockAddr.sin_addr.s_addr))
1380 continue; 1385 continue;
1381 else if (addr->sa_family == AF_INET6 && 1386 else if (addr->ss_family == AF_INET6 &&
1382 memcmp(&server->addr.sockAddr6.sin6_addr, 1387 !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
1383 &addr6->sin6_addr, sizeof(addr6->sin6_addr))) 1388 &addr6->sin6_addr))
1384 continue; 1389 continue;
1385 1390
1386 ++server->srv_count; 1391 ++server->srv_count;
@@ -1419,12 +1424,12 @@ static struct TCP_Server_Info *
1419cifs_get_tcp_session(struct smb_vol *volume_info) 1424cifs_get_tcp_session(struct smb_vol *volume_info)
1420{ 1425{
1421 struct TCP_Server_Info *tcp_ses = NULL; 1426 struct TCP_Server_Info *tcp_ses = NULL;
1422 struct sockaddr addr; 1427 struct sockaddr_storage addr;
1423 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; 1428 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
1424 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; 1429 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
1425 int rc; 1430 int rc;
1426 1431
1427 memset(&addr, 0, sizeof(struct sockaddr)); 1432 memset(&addr, 0, sizeof(struct sockaddr_storage));
1428 1433
1429 if (volume_info->UNCip && volume_info->UNC) { 1434 if (volume_info->UNCip && volume_info->UNC) {
1430 rc = cifs_inet_pton(AF_INET, volume_info->UNCip, 1435 rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
@@ -1435,9 +1440,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1435 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip, 1440 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
1436 &sin_server6->sin6_addr.in6_u); 1441 &sin_server6->sin6_addr.in6_u);
1437 if (rc > 0) 1442 if (rc > 0)
1438 addr.sa_family = AF_INET6; 1443 addr.ss_family = AF_INET6;
1439 } else { 1444 } else {
1440 addr.sa_family = AF_INET; 1445 addr.ss_family = AF_INET;
1441 } 1446 }
1442 1447
1443 if (rc <= 0) { 1448 if (rc <= 0) {
@@ -1502,7 +1507,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1502 tcp_ses->tcpStatus = CifsNew; 1507 tcp_ses->tcpStatus = CifsNew;
1503 ++tcp_ses->srv_count; 1508 ++tcp_ses->srv_count;
1504 1509
1505 if (addr.sa_family == AF_INET6) { 1510 if (addr.ss_family == AF_INET6) {
1506 cFYI(1, ("attempting ipv6 connect")); 1511 cFYI(1, ("attempting ipv6 connect"));
1507 /* BB should we allow ipv6 on port 139? */ 1512 /* BB should we allow ipv6 on port 139? */
1508 /* other OS never observed in Wild doing 139 with v6 */ 1513 /* other OS never observed in Wild doing 139 with v6 */
@@ -1802,7 +1807,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1802 * user space buffer 1807 * user space buffer
1803 */ 1808 */
1804 socket->sk->sk_rcvtimeo = 7 * HZ; 1809 socket->sk->sk_rcvtimeo = 7 * HZ;
1805 socket->sk->sk_sndtimeo = 3 * HZ; 1810 socket->sk->sk_sndtimeo = 5 * HZ;
1806 1811
1807 /* make the bufsizes depend on wsize/rsize and max requests */ 1812 /* make the bufsizes depend on wsize/rsize and max requests */
1808 if (server->noautotune) { 1813 if (server->noautotune) {
@@ -1860,9 +1865,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1860 smb_buf = (struct smb_hdr *)ses_init_buf; 1865 smb_buf = (struct smb_hdr *)ses_init_buf;
1861 /* sizeof RFC1002_SESSION_REQUEST with no scope */ 1866 /* sizeof RFC1002_SESSION_REQUEST with no scope */
1862 smb_buf->smb_buf_length = 0x81000044; 1867 smb_buf->smb_buf_length = 0x81000044;
1863 rc = smb_send(socket, smb_buf, 0x44, 1868 rc = smb_send(server, smb_buf, 0x44);
1864 (struct sockaddr *) &server->addr.sockAddr,
1865 server->noblocksnd);
1866 kfree(ses_init_buf); 1869 kfree(ses_init_buf);
1867 msleep(1); /* RFC1001 layer in at least one server 1870 msleep(1); /* RFC1001 layer in at least one server
1868 requires very short break before negprot 1871 requires very short break before negprot
@@ -1955,7 +1958,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1955 * user space buffer 1958 * user space buffer
1956 */ 1959 */
1957 socket->sk->sk_rcvtimeo = 7 * HZ; 1960 socket->sk->sk_rcvtimeo = 7 * HZ;
1958 socket->sk->sk_sndtimeo = 3 * HZ; 1961 socket->sk->sk_sndtimeo = 5 * HZ;
1959 server->ssocket = socket; 1962 server->ssocket = socket;
1960 1963
1961 return rc; 1964 return rc;
@@ -2162,6 +2165,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2162 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; 2165 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
2163 if (pvolume_info->nobrl) 2166 if (pvolume_info->nobrl)
2164 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; 2167 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
2168 if (pvolume_info->nostrictsync)
2169 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
2165 if (pvolume_info->mand_lock) 2170 if (pvolume_info->mand_lock)
2166 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; 2171 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
2167 if (pvolume_info->cifs_acl) 2172 if (pvolume_info->cifs_acl)
@@ -2182,6 +2187,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2182 "mount option supported")); 2187 "mount option supported"));
2183} 2188}
2184 2189
2190static int
2191is_path_accessible(int xid, struct cifsTconInfo *tcon,
2192 struct cifs_sb_info *cifs_sb, const char *full_path)
2193{
2194 int rc;
2195 __u64 inode_num;
2196 FILE_ALL_INFO *pfile_info;
2197
2198 rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
2199 cifs_sb->local_nls,
2200 cifs_sb->mnt_cifs_flags &
2201 CIFS_MOUNT_MAP_SPECIAL_CHR);
2202 if (rc != -EOPNOTSUPP)
2203 return rc;
2204
2205 pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
2206 if (pfile_info == NULL)
2207 return -ENOMEM;
2208
2209 rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
2210 0 /* not legacy */, cifs_sb->local_nls,
2211 cifs_sb->mnt_cifs_flags &
2212 CIFS_MOUNT_MAP_SPECIAL_CHR);
2213 kfree(pfile_info);
2214 return rc;
2215}
2216
2185int 2217int
2186cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2218cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2187 char *mount_data, const char *devname) 2219 char *mount_data, const char *devname)
@@ -2192,6 +2224,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2192 struct cifsSesInfo *pSesInfo = NULL; 2224 struct cifsSesInfo *pSesInfo = NULL;
2193 struct cifsTconInfo *tcon = NULL; 2225 struct cifsTconInfo *tcon = NULL;
2194 struct TCP_Server_Info *srvTcp = NULL; 2226 struct TCP_Server_Info *srvTcp = NULL;
2227 char *full_path;
2195 2228
2196 xid = GetXid(); 2229 xid = GetXid();
2197 2230
@@ -2428,6 +2461,23 @@ mount_fail_check:
2428 cifs_sb->rsize = min(cifs_sb->rsize, 2461 cifs_sb->rsize = min(cifs_sb->rsize,
2429 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); 2462 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
2430 2463
2464 if (!rc && cifs_sb->prepathlen) {
2465 /* build_path_to_root works only when we have a valid tcon */
2466 full_path = cifs_build_path_to_root(cifs_sb);
2467 if (full_path == NULL) {
2468 rc = -ENOMEM;
2469 goto mount_fail_check;
2470 }
2471 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2472 if (rc) {
2473 cERROR(1, ("Path %s in not accessible: %d",
2474 full_path, rc));
2475 kfree(full_path);
2476 goto mount_fail_check;
2477 }
2478 kfree(full_path);
2479 }
2480
2431 /* volume_info->password is freed above when existing session found 2481 /* volume_info->password is freed above when existing session found
2432 (in which case it is not needed anymore) but when new sesion is created 2482 (in which case it is not needed anymore) but when new sesion is created
2433 the password ptr is put in the new session structure (in which case the 2483 the password ptr is put in the new session structure (in which case the
@@ -3624,7 +3674,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3624 BCC(smb_buffer_response)) { 3674 BCC(smb_buffer_response)) {
3625 kfree(tcon->nativeFileSystem); 3675 kfree(tcon->nativeFileSystem);
3626 tcon->nativeFileSystem = 3676 tcon->nativeFileSystem =
3627 kzalloc(length + 2, GFP_KERNEL); 3677 kzalloc(2*(length + 1), GFP_KERNEL);
3628 if (tcon->nativeFileSystem) 3678 if (tcon->nativeFileSystem)
3629 cifs_strfromUCS_le( 3679 cifs_strfromUCS_le(
3630 tcon->nativeFileSystem, 3680 tcon->nativeFileSystem,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 838d9c720a5c..f9b6f68be976 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with dentries 4 * vfs operations that deal with dentries
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2008 6 * Copyright (C) International Business Machines Corp., 2002,2009
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,91 @@ cifs_bp_rename_retry:
129 return full_path; 129 return full_path;
130} 130}
131 131
132int cifs_posix_open(char *full_path, struct inode **pinode,
133 struct super_block *sb, int mode, int oflags,
134 int *poplock, __u16 *pnetfid, int xid)
135{
136 int rc;
137 __u32 oplock;
138 FILE_UNIX_BASIC_INFO *presp_data;
139 __u32 posix_flags = 0;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
141
142 cFYI(1, ("posix open %s", full_path));
143
144 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
145 if (presp_data == NULL)
146 return -ENOMEM;
147
148/* So far cifs posix extensions can only map the following flags.
149 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
150 so far we do not seem to need them, and we can treat them as local only */
151 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
152 (FMODE_READ | FMODE_WRITE))
153 posix_flags = SMB_O_RDWR;
154 else if (oflags & FMODE_READ)
155 posix_flags = SMB_O_RDONLY;
156 else if (oflags & FMODE_WRITE)
157 posix_flags = SMB_O_WRONLY;
158 if (oflags & O_CREAT)
159 posix_flags |= SMB_O_CREAT;
160 if (oflags & O_EXCL)
161 posix_flags |= SMB_O_EXCL;
162 if (oflags & O_TRUNC)
163 posix_flags |= SMB_O_TRUNC;
164 if (oflags & O_APPEND)
165 posix_flags |= SMB_O_APPEND;
166 if (oflags & O_SYNC)
167 posix_flags |= SMB_O_SYNC;
168 if (oflags & O_DIRECTORY)
169 posix_flags |= SMB_O_DIRECTORY;
170 if (oflags & O_NOFOLLOW)
171 posix_flags |= SMB_O_NOFOLLOW;
172 if (oflags & O_DIRECT)
173 posix_flags |= SMB_O_DIRECT;
174
175
176 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
177 pnetfid, presp_data, &oplock, full_path,
178 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
179 CIFS_MOUNT_MAP_SPECIAL_CHR);
180 if (rc)
181 goto posix_open_ret;
182
183 if (presp_data->Type == cpu_to_le32(-1))
184 goto posix_open_ret; /* open ok, caller does qpathinfo */
185
186 /* get new inode and set it up */
187 if (!pinode)
188 goto posix_open_ret; /* caller does not need info */
189
190 if (*pinode == NULL)
191 *pinode = cifs_new_inode(sb, &presp_data->UniqueId);
192 /* else an inode was passed in. Update its info, don't create one */
193
194 /* We do not need to close the file if new_inode fails since
195 the caller will retry qpathinfo as long as inode is null */
196 if (*pinode == NULL)
197 goto posix_open_ret;
198
199 posix_fill_in_inode(*pinode, presp_data, 1);
200
201posix_open_ret:
202 kfree(presp_data);
203 return rc;
204}
205
206static void setup_cifs_dentry(struct cifsTconInfo *tcon,
207 struct dentry *direntry,
208 struct inode *newinode)
209{
210 if (tcon->nocase)
211 direntry->d_op = &cifs_ci_dentry_ops;
212 else
213 direntry->d_op = &cifs_dentry_ops;
214 d_instantiate(direntry, newinode);
215}
216
132/* Inode operations in similar order to how they appear in Linux file fs.h */ 217/* Inode operations in similar order to how they appear in Linux file fs.h */
133 218
134int 219int
@@ -139,14 +224,21 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
139 int xid; 224 int xid;
140 int create_options = CREATE_NOT_DIR; 225 int create_options = CREATE_NOT_DIR;
141 int oplock = 0; 226 int oplock = 0;
227 int oflags;
228 /*
229 * BB below access is probably too much for mknod to request
230 * but we have to do query and setpathinfo so requesting
231 * less could fail (unless we want to request getatr and setatr
232 * permissions (only). At least for POSIX we do not have to
233 * request so much.
234 */
142 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 235 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
143 __u16 fileHandle; 236 __u16 fileHandle;
144 struct cifs_sb_info *cifs_sb; 237 struct cifs_sb_info *cifs_sb;
145 struct cifsTconInfo *pTcon; 238 struct cifsTconInfo *tcon;
146 char *full_path = NULL; 239 char *full_path = NULL;
147 FILE_ALL_INFO *buf = NULL; 240 FILE_ALL_INFO *buf = NULL;
148 struct inode *newinode = NULL; 241 struct inode *newinode = NULL;
149 struct cifsFileInfo *pCifsFile = NULL;
150 struct cifsInodeInfo *pCifsInode; 242 struct cifsInodeInfo *pCifsInode;
151 int disposition = FILE_OVERWRITE_IF; 243 int disposition = FILE_OVERWRITE_IF;
152 bool write_only = false; 244 bool write_only = false;
@@ -154,7 +246,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
154 xid = GetXid(); 246 xid = GetXid();
155 247
156 cifs_sb = CIFS_SB(inode->i_sb); 248 cifs_sb = CIFS_SB(inode->i_sb);
157 pTcon = cifs_sb->tcon; 249 tcon = cifs_sb->tcon;
158 250
159 full_path = build_path_from_dentry(direntry); 251 full_path = build_path_from_dentry(direntry);
160 if (full_path == NULL) { 252 if (full_path == NULL) {
@@ -162,12 +254,44 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
162 return -ENOMEM; 254 return -ENOMEM;
163 } 255 }
164 256
165 if (nd && (nd->flags & LOOKUP_OPEN)) { 257 mode &= ~current->fs->umask;
166 int oflags = nd->intent.open.flags; 258 if (oplockEnabled)
259 oplock = REQ_OPLOCK;
260
261 if (nd && (nd->flags & LOOKUP_OPEN))
262 oflags = nd->intent.open.flags;
263 else
264 oflags = FMODE_READ;
265
266 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
267 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
268 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
269 rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
270 mode, oflags, &oplock, &fileHandle, xid);
271 /* EIO could indicate that (posix open) operation is not
272 supported, despite what server claimed in capability
273 negotation. EREMOTE indicates DFS junction, which is not
274 handled in posix open */
275
276 if ((rc == 0) && (newinode == NULL))
277 goto cifs_create_get_file_info; /* query inode info */
278 else if (rc == 0) /* success, no need to query */
279 goto cifs_create_set_dentry;
280 else if ((rc != -EIO) && (rc != -EREMOTE) &&
281 (rc != -EOPNOTSUPP)) /* path not found or net err */
282 goto cifs_create_out;
283 /* else fallthrough to retry, using older open call, this is
284 case where server does not support this SMB level, and
285 falsely claims capability (also get here for DFS case
286 which should be rare for path not covered on files) */
287 }
167 288
289 if (nd && (nd->flags & LOOKUP_OPEN)) {
290 /* if the file is going to stay open, then we
291 need to set the desired access properly */
168 desiredAccess = 0; 292 desiredAccess = 0;
169 if (oflags & FMODE_READ) 293 if (oflags & FMODE_READ)
170 desiredAccess |= GENERIC_READ; 294 desiredAccess |= GENERIC_READ; /* is this too little? */
171 if (oflags & FMODE_WRITE) { 295 if (oflags & FMODE_WRITE) {
172 desiredAccess |= GENERIC_WRITE; 296 desiredAccess |= GENERIC_WRITE;
173 if (!(oflags & FMODE_READ)) 297 if (!(oflags & FMODE_READ))
@@ -186,8 +310,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
186 310
187 /* BB add processing to set equivalent of mode - e.g. via CreateX with 311 /* BB add processing to set equivalent of mode - e.g. via CreateX with
188 ACLs */ 312 ACLs */
189 if (oplockEnabled)
190 oplock = REQ_OPLOCK;
191 313
192 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 314 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
193 if (buf == NULL) { 315 if (buf == NULL) {
@@ -196,17 +318,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
196 return -ENOMEM; 318 return -ENOMEM;
197 } 319 }
198 320
199 mode &= ~current->fs->umask;
200
201 /* 321 /*
202 * if we're not using unix extensions, see if we need to set 322 * if we're not using unix extensions, see if we need to set
203 * ATTR_READONLY on the create call 323 * ATTR_READONLY on the create call
204 */ 324 */
205 if (!pTcon->unix_ext && (mode & S_IWUGO) == 0) 325 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
206 create_options |= CREATE_OPTION_READONLY; 326 create_options |= CREATE_OPTION_READONLY;
207 327
208 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 328 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
209 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, 329 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
210 desiredAccess, create_options, 330 desiredAccess, create_options,
211 &fileHandle, &oplock, buf, cifs_sb->local_nls, 331 &fileHandle, &oplock, buf, cifs_sb->local_nls,
212 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 332 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -215,128 +335,119 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
215 335
216 if (rc == -EIO) { 336 if (rc == -EIO) {
217 /* old server, retry the open legacy style */ 337 /* old server, retry the open legacy style */
218 rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, 338 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
219 desiredAccess, create_options, 339 desiredAccess, create_options,
220 &fileHandle, &oplock, buf, cifs_sb->local_nls, 340 &fileHandle, &oplock, buf, cifs_sb->local_nls,
221 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 341 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
222 } 342 }
223 if (rc) { 343 if (rc) {
224 cFYI(1, ("cifs_create returned 0x%x", rc)); 344 cFYI(1, ("cifs_create returned 0x%x", rc));
225 } else { 345 goto cifs_create_out;
226 /* If Open reported that we actually created a file 346 }
227 then we now have to set the mode if possible */ 347
228 if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { 348 /* If Open reported that we actually created a file
229 struct cifs_unix_set_info_args args = { 349 then we now have to set the mode if possible */
350 if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
351 struct cifs_unix_set_info_args args = {
230 .mode = mode, 352 .mode = mode,
231 .ctime = NO_CHANGE_64, 353 .ctime = NO_CHANGE_64,
232 .atime = NO_CHANGE_64, 354 .atime = NO_CHANGE_64,
233 .mtime = NO_CHANGE_64, 355 .mtime = NO_CHANGE_64,
234 .device = 0, 356 .device = 0,
235 }; 357 };
236 358
237 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 359 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
238 args.uid = (__u64) current_fsuid(); 360 args.uid = (__u64) current_fsuid();
239 if (inode->i_mode & S_ISGID) 361 if (inode->i_mode & S_ISGID)
240 args.gid = (__u64) inode->i_gid; 362 args.gid = (__u64) inode->i_gid;
241 else 363 else
242 args.gid = (__u64) current_fsgid(); 364 args.gid = (__u64) current_fsgid();
243 } else {
244 args.uid = NO_CHANGE_64;
245 args.gid = NO_CHANGE_64;
246 }
247 CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
248 cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags &
250 CIFS_MOUNT_MAP_SPECIAL_CHR);
251 } else { 365 } else {
252 /* BB implement mode setting via Windows security 366 args.uid = NO_CHANGE_64;
253 descriptors e.g. */ 367 args.gid = NO_CHANGE_64;
254 /* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/
255
256 /* Could set r/o dos attribute if mode & 0222 == 0 */
257 } 368 }
369 CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
370 cifs_sb->local_nls,
371 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
372 } else {
373 /* BB implement mode setting via Windows security
374 descriptors e.g. */
375 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
258 376
259 /* server might mask mode so we have to query for it */ 377 /* Could set r/o dos attribute if mode & 0222 == 0 */
260 if (pTcon->unix_ext) 378 }
261 rc = cifs_get_inode_info_unix(&newinode, full_path, 379
262 inode->i_sb, xid); 380cifs_create_get_file_info:
263 else { 381 /* server might mask mode so we have to query for it */
264 rc = cifs_get_inode_info(&newinode, full_path, 382 if (tcon->unix_ext)
265 buf, inode->i_sb, xid, 383 rc = cifs_get_inode_info_unix(&newinode, full_path,
266 &fileHandle); 384 inode->i_sb, xid);
267 if (newinode) { 385 else {
268 if (cifs_sb->mnt_cifs_flags & 386 rc = cifs_get_inode_info(&newinode, full_path, buf,
269 CIFS_MOUNT_DYNPERM) 387 inode->i_sb, xid, &fileHandle);
270 newinode->i_mode = mode; 388 if (newinode) {
271 if ((oplock & CIFS_CREATE_ACTION) && 389 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
272 (cifs_sb->mnt_cifs_flags & 390 newinode->i_mode = mode;
273 CIFS_MOUNT_SET_UID)) { 391 if ((oplock & CIFS_CREATE_ACTION) &&
274 newinode->i_uid = current_fsuid(); 392 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
275 if (inode->i_mode & S_ISGID) 393 newinode->i_uid = current_fsuid();
276 newinode->i_gid = 394 if (inode->i_mode & S_ISGID)
277 inode->i_gid; 395 newinode->i_gid = inode->i_gid;
278 else 396 else
279 newinode->i_gid = 397 newinode->i_gid = current_fsgid();
280 current_fsgid();
281 }
282 } 398 }
283 } 399 }
400 }
284 401
285 if (rc != 0) { 402cifs_create_set_dentry:
286 cFYI(1, 403 if (rc == 0)
287 ("Create worked but get_inode_info failed rc = %d", 404 setup_cifs_dentry(tcon, direntry, newinode);
288 rc)); 405 else
289 } else { 406 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
290 if (pTcon->nocase) 407
291 direntry->d_op = &cifs_ci_dentry_ops; 408 /* nfsd case - nfs srv does not set nd */
292 else 409 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
293 direntry->d_op = &cifs_dentry_ops; 410 /* mknod case - do not leave file open */
294 d_instantiate(direntry, newinode); 411 CIFSSMBClose(xid, tcon, fileHandle);
295 } 412 } else if (newinode) {
296 if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || 413 struct cifsFileInfo *pCifsFile =
297 (!(nd->flags & LOOKUP_OPEN))) { 414 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
298 /* mknod case - do not leave file open */ 415
299 CIFSSMBClose(xid, pTcon, fileHandle); 416 if (pCifsFile == NULL)
300 } else if (newinode) { 417 goto cifs_create_out;
301 pCifsFile = 418 pCifsFile->netfid = fileHandle;
302 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 419 pCifsFile->pid = current->tgid;
303 420 pCifsFile->pInode = newinode;
304 if (pCifsFile == NULL) 421 pCifsFile->invalidHandle = false;
305 goto cifs_create_out; 422 pCifsFile->closePend = false;
306 pCifsFile->netfid = fileHandle; 423 init_MUTEX(&pCifsFile->fh_sem);
307 pCifsFile->pid = current->tgid; 424 mutex_init(&pCifsFile->lock_mutex);
308 pCifsFile->pInode = newinode; 425 INIT_LIST_HEAD(&pCifsFile->llist);
309 pCifsFile->invalidHandle = false; 426 atomic_set(&pCifsFile->wrtPending, 0);
310 pCifsFile->closePend = false; 427
311 init_MUTEX(&pCifsFile->fh_sem); 428 /* set the following in open now
312 mutex_init(&pCifsFile->lock_mutex);
313 INIT_LIST_HEAD(&pCifsFile->llist);
314 atomic_set(&pCifsFile->wrtPending, 0);
315
316 /* set the following in open now
317 pCifsFile->pfile = file; */ 429 pCifsFile->pfile = file; */
318 write_lock(&GlobalSMBSeslock); 430 write_lock(&GlobalSMBSeslock);
319 list_add(&pCifsFile->tlist, &pTcon->openFileList); 431 list_add(&pCifsFile->tlist, &tcon->openFileList);
320 pCifsInode = CIFS_I(newinode); 432 pCifsInode = CIFS_I(newinode);
321 if (pCifsInode) { 433 if (pCifsInode) {
322 /* if readable file instance put first in list*/ 434 /* if readable file instance put first in list*/
323 if (write_only) { 435 if (write_only) {
324 list_add_tail(&pCifsFile->flist, 436 list_add_tail(&pCifsFile->flist,
325 &pCifsInode->openFileList); 437 &pCifsInode->openFileList);
326 } else { 438 } else {
327 list_add(&pCifsFile->flist, 439 list_add(&pCifsFile->flist,
328 &pCifsInode->openFileList); 440 &pCifsInode->openFileList);
329 }
330 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
331 pCifsInode->clientCanCacheAll = true;
332 pCifsInode->clientCanCacheRead = true;
333 cFYI(1, ("Exclusive Oplock inode %p",
334 newinode));
335 } else if ((oplock & 0xF) == OPLOCK_READ)
336 pCifsInode->clientCanCacheRead = true;
337 } 441 }
338 write_unlock(&GlobalSMBSeslock); 442 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
443 pCifsInode->clientCanCacheAll = true;
444 pCifsInode->clientCanCacheRead = true;
445 cFYI(1, ("Exclusive Oplock inode %p",
446 newinode));
447 } else if ((oplock & 0xF) == OPLOCK_READ)
448 pCifsInode->clientCanCacheRead = true;
339 } 449 }
450 write_unlock(&GlobalSMBSeslock);
340 } 451 }
341cifs_create_out: 452cifs_create_out:
342 kfree(buf); 453 kfree(buf);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 12bb656fbe75..81747acca4c4 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -78,8 +78,36 @@ static inline int cifs_convert_flags(unsigned int flags)
78 return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES | 78 return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
79 FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA | 79 FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
80 FILE_READ_DATA); 80 FILE_READ_DATA);
81}
81 82
83static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
84{
85 fmode_t posix_flags = 0;
82 86
87 if ((flags & O_ACCMODE) == O_RDONLY)
88 posix_flags = FMODE_READ;
89 else if ((flags & O_ACCMODE) == O_WRONLY)
90 posix_flags = FMODE_WRITE;
91 else if ((flags & O_ACCMODE) == O_RDWR) {
92 /* GENERIC_ALL is too much permission to request
93 can cause unnecessary access denied on create */
94 /* return GENERIC_ALL; */
95 posix_flags = FMODE_READ | FMODE_WRITE;
96 }
97 /* can not map O_CREAT or O_EXCL or O_TRUNC flags when
98 reopening a file. They had their effect on the original open */
99 if (flags & O_APPEND)
100 posix_flags |= (fmode_t)O_APPEND;
101 if (flags & O_SYNC)
102 posix_flags |= (fmode_t)O_SYNC;
103 if (flags & O_DIRECTORY)
104 posix_flags |= (fmode_t)O_DIRECTORY;
105 if (flags & O_NOFOLLOW)
106 posix_flags |= (fmode_t)O_NOFOLLOW;
107 if (flags & O_DIRECT)
108 posix_flags |= (fmode_t)O_DIRECT;
109
110 return posix_flags;
83} 111}
84 112
85static inline int cifs_get_disposition(unsigned int flags) 113static inline int cifs_get_disposition(unsigned int flags)
@@ -97,6 +125,80 @@ static inline int cifs_get_disposition(unsigned int flags)
97} 125}
98 126
99/* all arguments to this function must be checked for validity in caller */ 127/* all arguments to this function must be checked for validity in caller */
128static inline int cifs_posix_open_inode_helper(struct inode *inode,
129 struct file *file, struct cifsInodeInfo *pCifsInode,
130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
131{
132 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
133/* struct timespec temp; */ /* BB REMOVEME BB */
134
135 file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
136 if (file->private_data == NULL)
137 return -ENOMEM;
138 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
139 write_lock(&GlobalSMBSeslock);
140 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
141
142 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
143 if (pCifsInode == NULL) {
144 write_unlock(&GlobalSMBSeslock);
145 return -EINVAL;
146 }
147
148 /* want handles we can use to read with first
149 in the list so we do not have to walk the
150 list to search for one in write_begin */
151 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
152 list_add_tail(&pCifsFile->flist,
153 &pCifsInode->openFileList);
154 } else {
155 list_add(&pCifsFile->flist,
156 &pCifsInode->openFileList);
157 }
158
159 if (pCifsInode->clientCanCacheRead) {
160 /* we have the inode open somewhere else
161 no need to discard cache data */
162 goto psx_client_can_cache;
163 }
164
165 /* BB FIXME need to fix this check to move it earlier into posix_open
166 BB fIX following section BB FIXME */
167
168 /* if not oplocked, invalidate inode pages if mtime or file
169 size changed */
170/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
171 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
172 (file->f_path.dentry->d_inode->i_size ==
173 (loff_t)le64_to_cpu(buf->EndOfFile))) {
174 cFYI(1, ("inode unchanged on server"));
175 } else {
176 if (file->f_path.dentry->d_inode->i_mapping) {
177 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
178 if (rc != 0)
179 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
180 }
181 cFYI(1, ("invalidating remote inode since open detected it "
182 "changed"));
183 invalidate_remote_inode(file->f_path.dentry->d_inode);
184 } */
185
186psx_client_can_cache:
187 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
188 pCifsInode->clientCanCacheAll = true;
189 pCifsInode->clientCanCacheRead = true;
190 cFYI(1, ("Exclusive Oplock granted on inode %p",
191 file->f_path.dentry->d_inode));
192 } else if ((oplock & 0xF) == OPLOCK_READ)
193 pCifsInode->clientCanCacheRead = true;
194
195 /* will have to change the unlock if we reenable the
196 filemap_fdatawrite (which does not seem necessary */
197 write_unlock(&GlobalSMBSeslock);
198 return 0;
199}
200
201/* all arguments to this function must be checked for validity in caller */
100static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, 202static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
101 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, 203 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
102 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 204 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
@@ -167,7 +269,7 @@ int cifs_open(struct inode *inode, struct file *file)
167 int rc = -EACCES; 269 int rc = -EACCES;
168 int xid, oplock; 270 int xid, oplock;
169 struct cifs_sb_info *cifs_sb; 271 struct cifs_sb_info *cifs_sb;
170 struct cifsTconInfo *pTcon; 272 struct cifsTconInfo *tcon;
171 struct cifsFileInfo *pCifsFile; 273 struct cifsFileInfo *pCifsFile;
172 struct cifsInodeInfo *pCifsInode; 274 struct cifsInodeInfo *pCifsInode;
173 struct list_head *tmp; 275 struct list_head *tmp;
@@ -180,7 +282,7 @@ int cifs_open(struct inode *inode, struct file *file)
180 xid = GetXid(); 282 xid = GetXid();
181 283
182 cifs_sb = CIFS_SB(inode->i_sb); 284 cifs_sb = CIFS_SB(inode->i_sb);
183 pTcon = cifs_sb->tcon; 285 tcon = cifs_sb->tcon;
184 286
185 if (file->f_flags & O_CREAT) { 287 if (file->f_flags & O_CREAT) {
186 /* search inode for this file and fill in file->private_data */ 288 /* search inode for this file and fill in file->private_data */
@@ -220,6 +322,45 @@ int cifs_open(struct inode *inode, struct file *file)
220 322
221 cFYI(1, ("inode = 0x%p file flags are 0x%x for %s", 323 cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
222 inode, file->f_flags, full_path)); 324 inode, file->f_flags, full_path));
325
326 if (oplockEnabled)
327 oplock = REQ_OPLOCK;
328 else
329 oplock = 0;
330
331 if (!tcon->broken_posix_open && tcon->unix_ext &&
332 (tcon->ses->capabilities & CAP_UNIX) &&
333 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
334 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
335 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
336 /* can not refresh inode info since size could be stale */
337 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
338 cifs_sb->mnt_file_mode /* ignored */,
339 oflags, &oplock, &netfid, xid);
340 if (rc == 0) {
341 cFYI(1, ("posix open succeeded"));
342 /* no need for special case handling of setting mode
343 on read only files needed here */
344
345 cifs_posix_open_inode_helper(inode, file, pCifsInode,
346 pCifsFile, oplock, netfid);
347 goto out;
348 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
349 if (tcon->ses->serverNOS)
350 cERROR(1, ("server %s of type %s returned"
351 " unexpected error on SMB posix open"
352 ", disabling posix open support."
353 " Check if server update available.",
354 tcon->ses->serverName,
355 tcon->ses->serverNOS));
356 tcon->broken_posix_open = true;
357 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
358 (rc != -EOPNOTSUPP)) /* path not found or net err */
359 goto out;
360 /* else fallthrough to retry open the old way on network i/o
361 or DFS errors */
362 }
363
223 desiredAccess = cifs_convert_flags(file->f_flags); 364 desiredAccess = cifs_convert_flags(file->f_flags);
224 365
225/********************************************************************* 366/*********************************************************************
@@ -248,11 +389,6 @@ int cifs_open(struct inode *inode, struct file *file)
248 389
249 disposition = cifs_get_disposition(file->f_flags); 390 disposition = cifs_get_disposition(file->f_flags);
250 391
251 if (oplockEnabled)
252 oplock = REQ_OPLOCK;
253 else
254 oplock = 0;
255
256 /* BB pass O_SYNC flag through on file attributes .. BB */ 392 /* BB pass O_SYNC flag through on file attributes .. BB */
257 393
258 /* Also refresh inode by passing in file_info buf returned by SMBOpen 394 /* Also refresh inode by passing in file_info buf returned by SMBOpen
@@ -269,7 +405,7 @@ int cifs_open(struct inode *inode, struct file *file)
269 } 405 }
270 406
271 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 407 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
272 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, 408 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
273 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, 409 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
274 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 410 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
275 & CIFS_MOUNT_MAP_SPECIAL_CHR); 411 & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -278,7 +414,7 @@ int cifs_open(struct inode *inode, struct file *file)
278 414
279 if (rc == -EIO) { 415 if (rc == -EIO) {
280 /* Old server, try legacy style OpenX */ 416 /* Old server, try legacy style OpenX */
281 rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, 417 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
282 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, 418 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
283 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 419 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
284 & CIFS_MOUNT_MAP_SPECIAL_CHR); 420 & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -295,12 +431,12 @@ int cifs_open(struct inode *inode, struct file *file)
295 } 431 }
296 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); 432 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
297 write_lock(&GlobalSMBSeslock); 433 write_lock(&GlobalSMBSeslock);
298 list_add(&pCifsFile->tlist, &pTcon->openFileList); 434 list_add(&pCifsFile->tlist, &tcon->openFileList);
299 435
300 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 436 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
301 if (pCifsInode) { 437 if (pCifsInode) {
302 rc = cifs_open_inode_helper(inode, file, pCifsInode, 438 rc = cifs_open_inode_helper(inode, file, pCifsInode,
303 pCifsFile, pTcon, 439 pCifsFile, tcon,
304 &oplock, buf, full_path, xid); 440 &oplock, buf, full_path, xid);
305 } else { 441 } else {
306 write_unlock(&GlobalSMBSeslock); 442 write_unlock(&GlobalSMBSeslock);
@@ -309,7 +445,7 @@ int cifs_open(struct inode *inode, struct file *file)
309 if (oplock & CIFS_CREATE_ACTION) { 445 if (oplock & CIFS_CREATE_ACTION) {
310 /* time to set mode which we can not set earlier due to 446 /* time to set mode which we can not set earlier due to
311 problems creating new read-only files */ 447 problems creating new read-only files */
312 if (pTcon->unix_ext) { 448 if (tcon->unix_ext) {
313 struct cifs_unix_set_info_args args = { 449 struct cifs_unix_set_info_args args = {
314 .mode = inode->i_mode, 450 .mode = inode->i_mode,
315 .uid = NO_CHANGE_64, 451 .uid = NO_CHANGE_64,
@@ -319,7 +455,7 @@ int cifs_open(struct inode *inode, struct file *file)
319 .mtime = NO_CHANGE_64, 455 .mtime = NO_CHANGE_64,
320 .device = 0, 456 .device = 0,
321 }; 457 };
322 CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, 458 CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
323 cifs_sb->local_nls, 459 cifs_sb->local_nls,
324 cifs_sb->mnt_cifs_flags & 460 cifs_sb->mnt_cifs_flags &
325 CIFS_MOUNT_MAP_SPECIAL_CHR); 461 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -349,7 +485,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
349 int rc = -EACCES; 485 int rc = -EACCES;
350 int xid, oplock; 486 int xid, oplock;
351 struct cifs_sb_info *cifs_sb; 487 struct cifs_sb_info *cifs_sb;
352 struct cifsTconInfo *pTcon; 488 struct cifsTconInfo *tcon;
353 struct cifsFileInfo *pCifsFile; 489 struct cifsFileInfo *pCifsFile;
354 struct cifsInodeInfo *pCifsInode; 490 struct cifsInodeInfo *pCifsInode;
355 struct inode *inode; 491 struct inode *inode;
@@ -387,7 +523,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
387 } 523 }
388 524
389 cifs_sb = CIFS_SB(inode->i_sb); 525 cifs_sb = CIFS_SB(inode->i_sb);
390 pTcon = cifs_sb->tcon; 526 tcon = cifs_sb->tcon;
391 527
392/* can not grab rename sem here because various ops, including 528/* can not grab rename sem here because various ops, including
393 those that already have the rename sem can end up causing writepage 529 those that already have the rename sem can end up causing writepage
@@ -404,20 +540,37 @@ reopen_error_exit:
404 540
405 cFYI(1, ("inode = 0x%p file flags 0x%x for %s", 541 cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
406 inode, file->f_flags, full_path)); 542 inode, file->f_flags, full_path));
407 desiredAccess = cifs_convert_flags(file->f_flags);
408 543
409 if (oplockEnabled) 544 if (oplockEnabled)
410 oplock = REQ_OPLOCK; 545 oplock = REQ_OPLOCK;
411 else 546 else
412 oplock = 0; 547 oplock = 0;
413 548
549 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
550 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
551 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
552 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
553 /* can not refresh inode info since size could be stale */
554 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
555 cifs_sb->mnt_file_mode /* ignored */,
556 oflags, &oplock, &netfid, xid);
557 if (rc == 0) {
558 cFYI(1, ("posix reopen succeeded"));
559 goto reopen_success;
560 }
561 /* fallthrough to retry open the old way on errors, especially
562 in the reconnect path it is important to retry hard */
563 }
564
565 desiredAccess = cifs_convert_flags(file->f_flags);
566
414 /* Can not refresh inode by passing in file_info buf to be returned 567 /* Can not refresh inode by passing in file_info buf to be returned
415 by SMBOpen and then calling get_inode_info with returned buf 568 by SMBOpen and then calling get_inode_info with returned buf
416 since file might have write behind data that needs to be flushed 569 since file might have write behind data that needs to be flushed
417 and server version of file size can be stale. If we knew for sure 570 and server version of file size can be stale. If we knew for sure
418 that inode was not dirty locally we could do this */ 571 that inode was not dirty locally we could do this */
419 572
420 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, 573 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
421 CREATE_NOT_DIR, &netfid, &oplock, NULL, 574 CREATE_NOT_DIR, &netfid, &oplock, NULL,
422 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 575 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
423 CIFS_MOUNT_MAP_SPECIAL_CHR); 576 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -426,6 +579,7 @@ reopen_error_exit:
426 cFYI(1, ("cifs_open returned 0x%x", rc)); 579 cFYI(1, ("cifs_open returned 0x%x", rc));
427 cFYI(1, ("oplock: %d", oplock)); 580 cFYI(1, ("oplock: %d", oplock));
428 } else { 581 } else {
582reopen_success:
429 pCifsFile->netfid = netfid; 583 pCifsFile->netfid = netfid;
430 pCifsFile->invalidHandle = false; 584 pCifsFile->invalidHandle = false;
431 up(&pCifsFile->fh_sem); 585 up(&pCifsFile->fh_sem);
@@ -439,7 +593,7 @@ reopen_error_exit:
439 go to server to get inode info */ 593 go to server to get inode info */
440 pCifsInode->clientCanCacheAll = false; 594 pCifsInode->clientCanCacheAll = false;
441 pCifsInode->clientCanCacheRead = false; 595 pCifsInode->clientCanCacheRead = false;
442 if (pTcon->unix_ext) 596 if (tcon->unix_ext)
443 rc = cifs_get_inode_info_unix(&inode, 597 rc = cifs_get_inode_info_unix(&inode,
444 full_path, inode->i_sb, xid); 598 full_path, inode->i_sb, xid);
445 else 599 else
@@ -467,7 +621,6 @@ reopen_error_exit:
467 cifs_relock_file(pCifsFile); 621 cifs_relock_file(pCifsFile);
468 } 622 }
469 } 623 }
470
471 kfree(full_path); 624 kfree(full_path);
472 FreeXid(xid); 625 FreeXid(xid);
473 return rc; 626 return rc;
@@ -1523,6 +1676,9 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1523{ 1676{
1524 int xid; 1677 int xid;
1525 int rc = 0; 1678 int rc = 0;
1679 struct cifsTconInfo *tcon;
1680 struct cifsFileInfo *smbfile =
1681 (struct cifsFileInfo *)file->private_data;
1526 struct inode *inode = file->f_path.dentry->d_inode; 1682 struct inode *inode = file->f_path.dentry->d_inode;
1527 1683
1528 xid = GetXid(); 1684 xid = GetXid();
@@ -1534,7 +1690,12 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1534 if (rc == 0) { 1690 if (rc == 0) {
1535 rc = CIFS_I(inode)->write_behind_rc; 1691 rc = CIFS_I(inode)->write_behind_rc;
1536 CIFS_I(inode)->write_behind_rc = 0; 1692 CIFS_I(inode)->write_behind_rc = 0;
1693 tcon = CIFS_SB(inode->i_sb)->tcon;
1694 if (!rc && tcon && smbfile &&
1695 !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1696 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
1537 } 1697 }
1698
1538 FreeXid(xid); 1699 FreeXid(xid);
1539 return rc; 1700 return rc;
1540} 1701}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5ab9896fdcb2..a8797cc60805 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
199 pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); 199 pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
200} 200}
201 201
202/**
203 * cifs_new inode - create new inode, initialize, and hash it
204 * @sb - pointer to superblock
205 * @inum - if valid pointer and serverino is enabled, replace i_ino with val
206 *
207 * Create a new inode, initialize it for CIFS and hash it. Returns the new
208 * inode or NULL if one couldn't be allocated.
209 *
210 * If the share isn't mounted with "serverino" or inum is a NULL pointer then
211 * we'll just use the inode number assigned by new_inode(). Note that this can
212 * mean i_ino collisions since the i_ino assigned by new_inode is not
213 * guaranteed to be unique.
214 */
215struct inode *
216cifs_new_inode(struct super_block *sb, __u64 *inum)
217{
218 struct inode *inode;
219
220 inode = new_inode(sb);
221 if (inode == NULL)
222 return NULL;
223
224 /*
225 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
226 * stop passing inum as ptr. Are there sanity checks we can use to
227 * ensure that the server is really filling in that field? Also,
228 * if serverino is disabled, perhaps we should be using iunique()?
229 */
230 if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
231 inode->i_ino = (unsigned long) *inum;
232
233 /*
234 * must set this here instead of cifs_alloc_inode since VFS will
235 * clobber i_flags
236 */
237 if (sb->s_flags & MS_NOATIME)
238 inode->i_flags |= S_NOATIME | S_NOCMTIME;
239
240 insert_inode_hash(inode);
241
242 return inode;
243}
244
202int cifs_get_inode_info_unix(struct inode **pinode, 245int cifs_get_inode_info_unix(struct inode **pinode,
203 const unsigned char *full_path, struct super_block *sb, int xid) 246 const unsigned char *full_path, struct super_block *sb, int xid)
204{ 247{
@@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode,
233 276
234 /* get new inode */ 277 /* get new inode */
235 if (*pinode == NULL) { 278 if (*pinode == NULL) {
236 *pinode = new_inode(sb); 279 *pinode = cifs_new_inode(sb, &find_data.UniqueId);
237 if (*pinode == NULL) { 280 if (*pinode == NULL) {
238 rc = -ENOMEM; 281 rc = -ENOMEM;
239 goto cgiiu_exit; 282 goto cgiiu_exit;
240 } 283 }
241 /* Is an i_ino of zero legal? */
242 /* note ino incremented to unique num in new_inode */
243 /* Are there sanity checks we can use to ensure that
244 the server is really filling in that field? */
245 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
246 (*pinode)->i_ino = (unsigned long)find_data.UniqueId;
247
248 if (sb->s_flags & MS_NOATIME)
249 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
250
251 insert_inode_hash(*pinode);
252 } 284 }
253 285
254 inode = *pinode; 286 inode = *pinode;
@@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode,
465 497
466 /* get new inode */ 498 /* get new inode */
467 if (*pinode == NULL) { 499 if (*pinode == NULL) {
468 *pinode = new_inode(sb); 500 __u64 inode_num;
469 if (*pinode == NULL) { 501 __u64 *pinum = &inode_num;
470 rc = -ENOMEM; 502
471 goto cgii_exit;
472 }
473 /* Is an i_ino of zero legal? Can we use that to check 503 /* Is an i_ino of zero legal? Can we use that to check
474 if the server supports returning inode numbers? Are 504 if the server supports returning inode numbers? Are
475 there other sanity checks we can use to ensure that 505 there other sanity checks we can use to ensure that
@@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode,
486 516
487 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 517 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
488 int rc1 = 0; 518 int rc1 = 0;
489 __u64 inode_num;
490 519
491 rc1 = CIFSGetSrvInodeNumber(xid, pTcon, 520 rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
492 full_path, &inode_num, 521 full_path, pinum,
493 cifs_sb->local_nls, 522 cifs_sb->local_nls,
494 cifs_sb->mnt_cifs_flags & 523 cifs_sb->mnt_cifs_flags &
495 CIFS_MOUNT_MAP_SPECIAL_CHR); 524 CIFS_MOUNT_MAP_SPECIAL_CHR);
496 if (rc1) { 525 if (rc1) {
497 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 526 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
527 pinum = NULL;
498 /* BB EOPNOSUPP disable SERVER_INUM? */ 528 /* BB EOPNOSUPP disable SERVER_INUM? */
499 } else /* do we need cast or hash to ino? */ 529 }
500 (*pinode)->i_ino = inode_num; 530 } else {
501 } /* else ino incremented to unique num in new_inode*/ 531 pinum = NULL;
502 if (sb->s_flags & MS_NOATIME) 532 }
503 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; 533
504 insert_inode_hash(*pinode); 534 *pinode = cifs_new_inode(sb, pinum);
535 if (*pinode == NULL) {
536 rc = -ENOMEM;
537 goto cgii_exit;
538 }
505 } 539 }
506 inode = *pinode; 540 inode = *pinode;
507 cifsInfo = CIFS_I(inode); 541 cifsInfo = CIFS_I(inode);
@@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
621 .lookup = cifs_lookup, 655 .lookup = cifs_lookup,
622}; 656};
623 657
624static char *build_path_to_root(struct cifs_sb_info *cifs_sb) 658char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
625{ 659{
626 int pplen = cifs_sb->prepathlen; 660 int pplen = cifs_sb->prepathlen;
627 int dfsplen; 661 int dfsplen;
@@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
678 return inode; 712 return inode;
679 713
680 cifs_sb = CIFS_SB(inode->i_sb); 714 cifs_sb = CIFS_SB(inode->i_sb);
681 full_path = build_path_to_root(cifs_sb); 715 full_path = cifs_build_path_to_root(cifs_sb);
682 if (full_path == NULL) 716 if (full_path == NULL)
683 return ERR_PTR(-ENOMEM); 717 return ERR_PTR(-ENOMEM);
684 718
@@ -729,6 +763,9 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
729 struct cifsTconInfo *pTcon = cifs_sb->tcon; 763 struct cifsTconInfo *pTcon = cifs_sb->tcon;
730 FILE_BASIC_INFO info_buf; 764 FILE_BASIC_INFO info_buf;
731 765
766 if (attrs == NULL)
767 return -EINVAL;
768
732 if (attrs->ia_valid & ATTR_ATIME) { 769 if (attrs->ia_valid & ATTR_ATIME) {
733 set_time = true; 770 set_time = true;
734 info_buf.LastAccessTime = 771 info_buf.LastAccessTime =
@@ -1017,7 +1054,7 @@ out_reval:
1017 return rc; 1054 return rc;
1018} 1055}
1019 1056
1020static void posix_fill_in_inode(struct inode *tmp_inode, 1057void posix_fill_in_inode(struct inode *tmp_inode,
1021 FILE_UNIX_BASIC_INFO *pData, int isNewInode) 1058 FILE_UNIX_BASIC_INFO *pData, int isNewInode)
1022{ 1059{
1023 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); 1060 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
@@ -1114,24 +1151,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1114 else 1151 else
1115 direntry->d_op = &cifs_dentry_ops; 1152 direntry->d_op = &cifs_dentry_ops;
1116 1153
1117 newinode = new_inode(inode->i_sb); 1154 newinode = cifs_new_inode(inode->i_sb,
1155 &pInfo->UniqueId);
1118 if (newinode == NULL) { 1156 if (newinode == NULL) {
1119 kfree(pInfo); 1157 kfree(pInfo);
1120 goto mkdir_get_info; 1158 goto mkdir_get_info;
1121 } 1159 }
1122 1160
1123 /* Is an i_ino of zero legal? */
1124 /* Are there sanity checks we can use to ensure that
1125 the server is really filling in that field? */
1126 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
1127 newinode->i_ino =
1128 (unsigned long)pInfo->UniqueId;
1129 } /* note ino incremented to unique num in new_inode */
1130 if (inode->i_sb->s_flags & MS_NOATIME)
1131 newinode->i_flags |= S_NOATIME | S_NOCMTIME;
1132 newinode->i_nlink = 2; 1161 newinode->i_nlink = 2;
1133
1134 insert_inode_hash(newinode);
1135 d_instantiate(direntry, newinode); 1162 d_instantiate(direntry, newinode);
1136 1163
1137 /* we already checked in POSIXCreate whether 1164 /* we already checked in POSIXCreate whether
@@ -1285,6 +1312,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1285 cifsInode = CIFS_I(direntry->d_inode); 1312 cifsInode = CIFS_I(direntry->d_inode);
1286 cifsInode->time = 0; /* force revalidate to go get info when 1313 cifsInode->time = 0; /* force revalidate to go get info when
1287 needed */ 1314 needed */
1315
1316 cifsInode = CIFS_I(inode);
1317 cifsInode->time = 0; /* force revalidate to get parent dir info
1318 since cached search results now invalid */
1319
1288 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = 1320 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
1289 current_fs_time(inode->i_sb); 1321 current_fs_time(inode->i_sb);
1290 1322
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 462bbfefd4b6..98b66a54c319 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -10,8 +10,8 @@
10 * with every copy. 10 * with every copy.
11 * 11 *
12 * To compute the message digest of a chunk of bytes, declare an 12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to MD5Init, call MD5Update as 13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call MD5Final, which 14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest. 15 * will fill a supplied 16-byte array with the digest.
16 */ 16 */
17 17
@@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs)
45 * initialization constants. 45 * initialization constants.
46 */ 46 */
47void 47void
48MD5Init(struct MD5Context *ctx) 48cifs_MD5_init(struct MD5Context *ctx)
49{ 49{
50 ctx->buf[0] = 0x67452301; 50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89; 51 ctx->buf[1] = 0xefcdab89;
@@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx)
61 * of bytes. 61 * of bytes.
62 */ 62 */
63void 63void
64MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) 64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{ 65{
66 register __u32 t; 66 register __u32 t;
67 67
@@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
110 * 1 0* (64-bit count of bits processed, MSB-first) 110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */ 111 */
112void 112void
113MD5Final(unsigned char digest[16], struct MD5Context *ctx) 113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{ 114{
115 unsigned int count; 115 unsigned int count;
116 unsigned char *p; 116 unsigned char *p;
@@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
165 165
166/* 166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to 167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. MD5Update blocks 168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine. 169 * the data and converts bytes into longwords for this routine.
170 */ 170 */
171static void 171static void
@@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
267 unsigned char tk[16]; 267 unsigned char tk[16];
268 struct MD5Context tctx; 268 struct MD5Context tctx;
269 269
270 MD5Init(&tctx); 270 cifs_MD5_init(&tctx);
271 MD5Update(&tctx, key, key_len); 271 cifs_MD5_update(&tctx, key, key_len);
272 MD5Final(tk, &tctx); 272 cifs_MD5_final(tk, &tctx);
273 273
274 key = tk; 274 key = tk;
275 key_len = 16; 275 key_len = 16;
@@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
287 ctx->k_opad[i] ^= 0x5c; 287 ctx->k_opad[i] ^= 0x5c;
288 } 288 }
289 289
290 MD5Init(&ctx->ctx); 290 cifs_MD5_init(&ctx->ctx);
291 MD5Update(&ctx->ctx, ctx->k_ipad, 64); 291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292} 292}
293#endif 293#endif
294 294
@@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
317 ctx->k_opad[i] ^= 0x5c; 317 ctx->k_opad[i] ^= 0x5c;
318 } 318 }
319 319
320 MD5Init(&ctx->ctx); 320 cifs_MD5_init(&ctx->ctx);
321 MD5Update(&ctx->ctx, ctx->k_ipad, 64); 321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322} 322}
323 323
324/*********************************************************************** 324/***********************************************************************
@@ -328,7 +328,7 @@ void
328hmac_md5_update(const unsigned char *text, int text_len, 328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx) 329 struct HMACMD5Context *ctx)
330{ 330{
331 MD5Update(&ctx->ctx, text, text_len); /* then text of datagram */ 331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332} 332}
333 333
334/*********************************************************************** 334/***********************************************************************
@@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{ 339{
340 struct MD5Context ctx_o; 340 struct MD5Context ctx_o;
341 341
342 MD5Final(digest, &ctx->ctx); 342 cifs_MD5_final(digest, &ctx->ctx);
343 343
344 MD5Init(&ctx_o); 344 cifs_MD5_init(&ctx_o);
345 MD5Update(&ctx_o, ctx->k_opad, 64); 345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 MD5Update(&ctx_o, digest, 16); 346 cifs_MD5_update(&ctx_o, digest, 16);
347 MD5Final(digest, &ctx_o); 347 cifs_MD5_final(digest, &ctx_o);
348} 348}
349 349
350/*********************************************************** 350/***********************************************************
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index f7d4f4197bac..6fba8cb402fd 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -20,10 +20,10 @@ struct HMACMD5Context {
20}; 20};
21#endif /* _HMAC_MD5_H */ 21#endif /* _HMAC_MD5_H */
22 22
23void MD5Init(struct MD5Context *context); 23void cifs_MD5_init(struct MD5Context *context);
24void MD5Update(struct MD5Context *context, unsigned char const *buf, 24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len); 25 unsigned len);
26void MD5Final(unsigned char digest[16], struct MD5Context *context); 26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27 27
28/* The following definitions come from lib/hmacmd5.c */ 28/* The following definitions come from lib/hmacmd5.c */
29 29
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9bf0292..c2c01ff4c32c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
56} 56}
57#endif /* DEBUG2 */ 57#endif /* DEBUG2 */
58 58
59/* Returns one if new inode created (which therefore needs to be hashed) */ 59/* Returns 1 if new inode created, 2 if both dentry and inode were */
60/* Might check in the future if inode number changed so we can rehash inode */ 60/* Might check in the future if inode number changed so we can rehash inode */
61static int construct_dentry(struct qstr *qstring, struct file *file, 61static int
62 struct inode **ptmp_inode, struct dentry **pnew_dentry) 62construct_dentry(struct qstr *qstring, struct file *file,
63 struct inode **ptmp_inode, struct dentry **pnew_dentry,
64 __u64 *inum)
63{ 65{
64 struct dentry *tmp_dentry; 66 struct dentry *tmp_dentry = NULL;
65 struct cifs_sb_info *cifs_sb; 67 struct super_block *sb = file->f_path.dentry->d_sb;
66 struct cifsTconInfo *pTcon;
67 int rc = 0; 68 int rc = 0;
68 69
69 cFYI(1, ("For %s", qstring->name)); 70 cFYI(1, ("For %s", qstring->name));
70 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
71 pTcon = cifs_sb->tcon;
72 71
73 qstring->hash = full_name_hash(qstring->name, qstring->len); 72 qstring->hash = full_name_hash(qstring->name, qstring->len);
74 tmp_dentry = d_lookup(file->f_path.dentry, qstring); 73 tmp_dentry = d_lookup(file->f_path.dentry, qstring);
75 if (tmp_dentry) { 74 if (tmp_dentry) {
75 /* BB: overwrite old name? i.e. tmp_dentry->d_name and
76 * tmp_dentry->d_name.len??
77 */
76 cFYI(0, ("existing dentry with inode 0x%p", 78 cFYI(0, ("existing dentry with inode 0x%p",
77 tmp_dentry->d_inode)); 79 tmp_dentry->d_inode));
78 *ptmp_inode = tmp_dentry->d_inode; 80 *ptmp_inode = tmp_dentry->d_inode;
79/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
80 if (*ptmp_inode == NULL) { 81 if (*ptmp_inode == NULL) {
81 *ptmp_inode = new_inode(file->f_path.dentry->d_sb); 82 *ptmp_inode = cifs_new_inode(sb, inum);
82 if (*ptmp_inode == NULL) 83 if (*ptmp_inode == NULL)
83 return rc; 84 return rc;
84 rc = 1; 85 rc = 1;
85 } 86 }
86 if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
87 (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
88 } else { 87 } else {
89 tmp_dentry = d_alloc(file->f_path.dentry, qstring); 88 tmp_dentry = d_alloc(file->f_path.dentry, qstring);
90 if (tmp_dentry == NULL) { 89 if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
93 return rc; 92 return rc;
94 } 93 }
95 94
96 *ptmp_inode = new_inode(file->f_path.dentry->d_sb); 95 if (CIFS_SB(sb)->tcon->nocase)
97 if (pTcon->nocase)
98 tmp_dentry->d_op = &cifs_ci_dentry_ops; 96 tmp_dentry->d_op = &cifs_ci_dentry_ops;
99 else 97 else
100 tmp_dentry->d_op = &cifs_dentry_ops; 98 tmp_dentry->d_op = &cifs_dentry_ops;
99
100 *ptmp_inode = cifs_new_inode(sb, inum);
101 if (*ptmp_inode == NULL) 101 if (*ptmp_inode == NULL)
102 return rc; 102 return rc;
103 if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
104 (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
105 rc = 2; 103 rc = 2;
106 } 104 }
107 105
@@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
822/* inode num, inode type and filename returned */ 820/* inode num, inode type and filename returned */
823static int cifs_get_name_from_search_buf(struct qstr *pqst, 821static int cifs_get_name_from_search_buf(struct qstr *pqst,
824 char *current_entry, __u16 level, unsigned int unicode, 822 char *current_entry, __u16 level, unsigned int unicode,
825 struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum) 823 struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
826{ 824{
827 int rc = 0; 825 int rc = 0;
828 unsigned int len = 0; 826 unsigned int len = 0;
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
842 len = strnlen(filename, PATH_MAX); 840 len = strnlen(filename, PATH_MAX);
843 } 841 }
844 842
845 /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */ 843 *pinum = pFindData->UniqueId;
846 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
847 *pinum = pFindData->UniqueId;
848 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 844 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
849 FILE_DIRECTORY_INFO *pFindData = 845 FILE_DIRECTORY_INFO *pFindData =
850 (FILE_DIRECTORY_INFO *)current_entry; 846 (FILE_DIRECTORY_INFO *)current_entry;
@@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
907 struct qstr qstring; 903 struct qstr qstring;
908 struct cifsFileInfo *pCifsF; 904 struct cifsFileInfo *pCifsF;
909 unsigned int obj_type; 905 unsigned int obj_type;
910 ino_t inum; 906 __u64 inum;
911 struct cifs_sb_info *cifs_sb; 907 struct cifs_sb_info *cifs_sb;
912 struct inode *tmp_inode; 908 struct inode *tmp_inode;
913 struct dentry *tmp_dentry; 909 struct dentry *tmp_dentry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
940 if (rc) 936 if (rc)
941 return rc; 937 return rc;
942 938
943 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry); 939 /* only these two infolevels return valid inode numbers */
940 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
941 pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
942 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
943 &inum);
944 else
945 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
946 NULL);
947
944 if ((tmp_inode == NULL) || (tmp_dentry == NULL)) 948 if ((tmp_inode == NULL) || (tmp_dentry == NULL))
945 return -ENOMEM; 949 return -ENOMEM;
946 950
947 if (rc) {
948 /* inode created, we need to hash it with right inode number */
949 if (inum != 0) {
950 /* BB fixme - hash the 2 32 quantities bits together if
951 * necessary BB */
952 tmp_inode->i_ino = inum;
953 }
954 insert_inode_hash(tmp_inode);
955 }
956
957 /* we pass in rc below, indicating whether it is a new inode, 951 /* we pass in rc below, indicating whether it is a new inode,
958 so we can figure out whether to invalidate the inode cached 952 so we can figure out whether to invalidate the inode cached
959 data if the file has changed */ 953 data if the file has changed */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7b79a9..5c68b4282be9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
35 unsigned char *p24); 35 unsigned char *p24);
36 36
37/* Checks if this is the first smb session to be reconnected after
38 the socket has been reestablished (so we know whether to use vc 0).
39 Called while holding the cifs_tcp_ses_lock, so do not block */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
41{
42 struct list_head *tmp;
43 struct cifsSesInfo *tmp_ses;
44
45 list_for_each(tmp, &ses->server->smb_ses_list) {
46 tmp_ses = list_entry(tmp, struct cifsSesInfo,
47 smb_ses_list);
48 if (tmp_ses->need_reconnect == false)
49 return false;
50 }
51 /* could not find a session that was already connected,
52 this must be the first one we are reconnecting */
53 return true;
54}
55
56/*
57 * vc number 0 is treated specially by some servers, and should be the
58 * first one we request. After that we can use vcnumbers up to maxvcs,
59 * one for each smb session (some Windows versions set maxvcs incorrectly
60 * so maxvc=1 can be ignored). If we have too many vcs, we can reuse
61 * any vc but zero (some servers reset the connection on vcnum zero)
62 *
63 */
64static __le16 get_next_vcnum(struct cifsSesInfo *ses)
65{
66 __u16 vcnum = 0;
67 struct list_head *tmp;
68 struct cifsSesInfo *tmp_ses;
69 __u16 max_vcs = ses->server->max_vcs;
70 __u16 i;
71 int free_vc_found = 0;
72
73 /* Quoting the MS-SMB specification: "Windows-based SMB servers set this
74 field to one but do not enforce this limit, which allows an SMB client
75 to establish more virtual circuits than allowed by this value ... but
76 other server implementations can enforce this limit." */
77 if (max_vcs < 2)
78 max_vcs = 0xFFFF;
79
80 write_lock(&cifs_tcp_ses_lock);
81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
82 goto get_vc_num_exit; /* vcnum will be zero */
83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
84 if (i == 0) /* this is the only connection, use vc 0 */
85 break;
86
87 free_vc_found = 1;
88
89 list_for_each(tmp, &ses->server->smb_ses_list) {
90 tmp_ses = list_entry(tmp, struct cifsSesInfo,
91 smb_ses_list);
92 if (tmp_ses->vcnum == i) {
93 free_vc_found = 0;
94 break; /* found duplicate, try next vcnum */
95 }
96 }
97 if (free_vc_found)
98 break; /* we found a vcnumber that will work - use it */
99 }
100
101 if (i == 0)
102 vcnum = 0; /* for most common case, ie if one smb session, use
103 vc zero. Also for case when no free vcnum, zero
104 is safest to send (some clients only send zero) */
105 else if (free_vc_found == 0)
106 vcnum = 1; /* we can not reuse vc=0 safely, since some servers
107 reset all uids on that, but 1 is ok. */
108 else
109 vcnum = i;
110 ses->vcnum = vcnum;
111get_vc_num_exit:
112 write_unlock(&cifs_tcp_ses_lock);
113
114 return le16_to_cpu(vcnum);
115}
116
37static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) 117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
38{ 118{
39 __u32 capabilities = 0; 119 __u32 capabilities = 0;
40 120
41 /* init fields common to all four types of SessSetup */ 121 /* init fields common to all four types of SessSetup */
42 /* note that header is initialized to zero in header_assemble */ 122 /* Note that offsets for first seven fields in req struct are same */
123 /* in CIFS Specs so does not matter which of 3 forms of struct */
124 /* that we use in next few lines */
125 /* Note that header is initialized to zero in header_assemble */
43 pSMB->req.AndXCommand = 0xFF; 126 pSMB->req.AndXCommand = 0xFF;
44 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); 127 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
45 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); 128 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
129 pSMB->req.VcNumber = get_next_vcnum(ses);
46 130
47 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ 131 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
48 132
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
71 if (ses->capabilities & CAP_UNIX) 155 if (ses->capabilities & CAP_UNIX)
72 capabilities |= CAP_UNIX; 156 capabilities |= CAP_UNIX;
73 157
74 /* BB check whether to init vcnum BB */
75 return capabilities; 158 return capabilities;
76} 159}
77 160
@@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
228 311
229 kfree(ses->serverOS); 312 kfree(ses->serverOS);
230 /* UTF-8 string will not grow more than four times as big as UCS-16 */ 313 /* UTF-8 string will not grow more than four times as big as UCS-16 */
231 ses->serverOS = kzalloc(4 * len, GFP_KERNEL); 314 ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
232 if (ses->serverOS != NULL) 315 if (ses->serverOS != NULL)
233 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); 316 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
234 data += 2 * (len + 1); 317 data += 2 * (len + 1);
@@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
241 return rc; 324 return rc;
242 325
243 kfree(ses->serverNOS); 326 kfree(ses->serverNOS);
244 ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */ 327 ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
245 if (ses->serverNOS != NULL) { 328 if (ses->serverNOS != NULL) {
246 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, 329 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
247 nls_cp); 330 nls_cp);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
new file mode 100644
index 000000000000..7056b891e087
--- /dev/null
+++ b/fs/cifs/smbfsctl.h
@@ -0,0 +1,84 @@
1/*
2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
3 *
4 * Copyright (c) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22/* IOCTL information */
23/*
24 * List of ioctl/fsctl function codes that are or could be useful in the
25 * future to remote clients like cifs or SMB2 client. There is probably
26 * a slightly larger set of fsctls that NTFS local filesystem could handle,
27 * including the seven below that we do not have struct definitions for.
28 * Even with protocol definitions for most of these now available, we still
29 * need to do some experimentation to identify which are practical to do
30 * remotely. Some of the following, such as the encryption/compression ones
31 * could be invoked from tools via a specialized hook into the VFS rather
32 * than via the standard vfs entry points
33 */
34#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
35#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
36#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
37#define FSCTL_LOCK_VOLUME 0x00090018
38#define FSCTL_UNLOCK_VOLUME 0x0009001C
39#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */
40#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */
41#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */
42#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */
43/* Verify the next FSCTL number, we had it as 0x00090090 before */
44#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */
45#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */
46#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
47#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */
48#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
49#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
50#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */
51#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */
52#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */
53#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */
54#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */
55#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */
56#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */
57#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
58#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
59#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
60#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */
61#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
62#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
63#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
64#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */
65#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */
66#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
67#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */
68#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */
69#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
70#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
71#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
72#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
73#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
74#define FSCTL_SIS_LINK_FILES 0x0009C104
75#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
76#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
77/* strange that the number for this op is not sequential with previous op */
78#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
79#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
80#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
81
82#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
83#define IO_REPARSE_TAG_HSM 0xC0000004
84#define IO_REPARSE_TAG_SIS 0x80000007
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ebe6599ed3a..0ad3e2d116a6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
154 spin_unlock(&GlobalMid_Lock); 154 spin_unlock(&GlobalMid_Lock);
155} 155}
156 156
157int
158smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
159 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
160{
161 int rc = 0;
162 int i = 0;
163 struct msghdr smb_msg;
164 struct kvec iov;
165 unsigned len = smb_buf_length + 4;
166
167 if (ssocket == NULL)
168 return -ENOTSOCK; /* BB eventually add reconnect code here */
169 iov.iov_base = smb_buffer;
170 iov.iov_len = len;
171
172 smb_msg.msg_name = sin;
173 smb_msg.msg_namelen = sizeof(struct sockaddr);
174 smb_msg.msg_control = NULL;
175 smb_msg.msg_controllen = 0;
176 if (noblocksnd)
177 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
178 else
179 smb_msg.msg_flags = MSG_NOSIGNAL;
180
181 /* smb header is converted in header_assemble. bcc and rest of SMB word
182 area, and byte area if necessary, is converted to littleendian in
183 cifssmb.c and RFC1001 len is converted to bigendian in smb_send
184 Flags2 is converted in SendReceive */
185
186 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
187 cFYI(1, ("Sending smb of length %d", smb_buf_length));
188 dump_smb(smb_buffer, len);
189
190 while (len > 0) {
191 rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len);
192 if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
193 i++;
194 /* smaller timeout here than send2 since smaller size */
195 /* Although it may not be required, this also is smaller
196 oplock break time */
197 if (i > 12) {
198 cERROR(1,
199 ("sends on sock %p stuck for 7 seconds",
200 ssocket));
201 rc = -EAGAIN;
202 break;
203 }
204 msleep(1 << i);
205 continue;
206 }
207 if (rc < 0)
208 break;
209 else
210 i = 0; /* reset i after each successful send */
211 iov.iov_base += rc;
212 iov.iov_len -= rc;
213 len -= rc;
214 }
215
216 if (rc < 0) {
217 cERROR(1, ("Error %d sending data on socket to server", rc));
218 } else {
219 rc = 0;
220 }
221
222 /* Don't want to modify the buffer as a
223 side effect of this call. */
224 smb_buffer->smb_buf_length = smb_buf_length;
225
226 return rc;
227}
228
229static int 157static int
230smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, 158smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
231 struct sockaddr *sin, bool noblocksnd)
232{ 159{
233 int rc = 0; 160 int rc = 0;
234 int i = 0; 161 int i = 0;
@@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
243 if (ssocket == NULL) 170 if (ssocket == NULL)
244 return -ENOTSOCK; /* BB eventually add reconnect code here */ 171 return -ENOTSOCK; /* BB eventually add reconnect code here */
245 172
246 smb_msg.msg_name = sin; 173 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
247 smb_msg.msg_namelen = sizeof(struct sockaddr); 174 smb_msg.msg_namelen = sizeof(struct sockaddr);
248 smb_msg.msg_control = NULL; 175 smb_msg.msg_control = NULL;
249 smb_msg.msg_controllen = 0; 176 smb_msg.msg_controllen = 0;
250 if (noblocksnd) 177 if (server->noblocksnd)
251 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; 178 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
252 else 179 else
253 smb_msg.msg_flags = MSG_NOSIGNAL; 180 smb_msg.msg_flags = MSG_NOSIGNAL;
@@ -272,7 +199,25 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
272 n_vec - first_vec, total_len); 199 n_vec - first_vec, total_len);
273 if ((rc == -ENOSPC) || (rc == -EAGAIN)) { 200 if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
274 i++; 201 i++;
275 if (i >= 14) { 202 /* if blocking send we try 3 times, since each can block
203 for 5 seconds. For nonblocking we have to try more
204 but wait increasing amounts of time allowing time for
205 socket to clear. The overall time we wait in either
206 case to send on the socket is about 15 seconds.
207 Similarly we wait for 15 seconds for
208 a response from the server in SendReceive[2]
209 for the server to send a response back for
210 most types of requests (except SMB Write
211 past end of file which can be slow, and
212 blocking lock operations). NFS waits slightly longer
213 than CIFS, but this can make it take longer for
214 nonresponsive servers to be detected and 15 seconds
215 is more than enough time for modern networks to
216 send a packet. In most cases if we fail to send
217 after the retries we will kill the socket and
218 reconnect which may clear the network problem.
219 */
220 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
276 cERROR(1, 221 cERROR(1,
277 ("sends on sock %p stuck for 15 seconds", 222 ("sends on sock %p stuck for 15 seconds",
278 ssocket)); 223 ssocket));
@@ -339,6 +284,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
339 return rc; 284 return rc;
340} 285}
341 286
287int
288smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
289 unsigned int smb_buf_length)
290{
291 struct kvec iov;
292
293 iov.iov_base = smb_buffer;
294 iov.iov_len = smb_buf_length + 4;
295
296 return smb_sendv(server, &iov, 1);
297}
298
342static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 299static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
343{ 300{
344 if (long_op == CIFS_ASYNC_OP) { 301 if (long_op == CIFS_ASYNC_OP) {
@@ -540,9 +497,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
540#ifdef CONFIG_CIFS_STATS2 497#ifdef CONFIG_CIFS_STATS2
541 atomic_inc(&ses->server->inSend); 498 atomic_inc(&ses->server->inSend);
542#endif 499#endif
543 rc = smb_send2(ses->server, iov, n_vec, 500 rc = smb_sendv(ses->server, iov, n_vec);
544 (struct sockaddr *) &(ses->server->addr.sockAddr),
545 ses->server->noblocksnd);
546#ifdef CONFIG_CIFS_STATS2 501#ifdef CONFIG_CIFS_STATS2
547 atomic_dec(&ses->server->inSend); 502 atomic_dec(&ses->server->inSend);
548 midQ->when_sent = jiffies; 503 midQ->when_sent = jiffies;
@@ -736,9 +691,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
736#ifdef CONFIG_CIFS_STATS2 691#ifdef CONFIG_CIFS_STATS2
737 atomic_inc(&ses->server->inSend); 692 atomic_inc(&ses->server->inSend);
738#endif 693#endif
739 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 694 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
740 (struct sockaddr *) &(ses->server->addr.sockAddr),
741 ses->server->noblocksnd);
742#ifdef CONFIG_CIFS_STATS2 695#ifdef CONFIG_CIFS_STATS2
743 atomic_dec(&ses->server->inSend); 696 atomic_dec(&ses->server->inSend);
744 midQ->when_sent = jiffies; 697 midQ->when_sent = jiffies;
@@ -879,9 +832,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
879 mutex_unlock(&ses->server->srv_mutex); 832 mutex_unlock(&ses->server->srv_mutex);
880 return rc; 833 return rc;
881 } 834 }
882 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 835 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
883 (struct sockaddr *) &(ses->server->addr.sockAddr),
884 ses->server->noblocksnd);
885 mutex_unlock(&ses->server->srv_mutex); 836 mutex_unlock(&ses->server->srv_mutex);
886 return rc; 837 return rc;
887} 838}
@@ -973,9 +924,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
973#ifdef CONFIG_CIFS_STATS2 924#ifdef CONFIG_CIFS_STATS2
974 atomic_inc(&ses->server->inSend); 925 atomic_inc(&ses->server->inSend);
975#endif 926#endif
976 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 927 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
977 (struct sockaddr *) &(ses->server->addr.sockAddr),
978 ses->server->noblocksnd);
979#ifdef CONFIG_CIFS_STATS2 928#ifdef CONFIG_CIFS_STATS2
980 atomic_dec(&ses->server->inSend); 929 atomic_dec(&ses->server->inSend);
981 midQ->when_sent = jiffies; 930 midQ->when_sent = jiffies;
diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig
new file mode 100644
index 000000000000..c0e5a7fad06d
--- /dev/null
+++ b/fs/coda/Kconfig
@@ -0,0 +1,21 @@
1config CODA_FS
2 tristate "Coda file system support (advanced network fs)"
3 depends on INET
4 help
5 Coda is an advanced network file system, similar to NFS in that it
6 enables you to mount file systems of a remote server and access them
7 with regular Unix commands as if they were sitting on your hard
8 disk. Coda has several advantages over NFS: support for
9 disconnected operation (e.g. for laptops), read/write server
10 replication, security model for authentication and encryption,
11 persistent client caches and write back caching.
12
13 If you say Y here, your Linux box will be able to act as a Coda
14 *client*. You will need user level code as well, both for the
15 client and server. Servers are currently user level, i.e. they need
16 no kernel support. Please read
17 <file:Documentation/filesystems/coda.txt> and check out the Coda
18 home page <http://www.coda.cs.cmu.edu/>.
19
20 To compile the coda client support as a module, choose M here: the
21 module will be called coda.
diff --git a/fs/compat.c b/fs/compat.c
index 65a070e705ab..0949b43794a4 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1402,12 +1402,13 @@ int compat_do_execve(char * filename,
1402 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1402 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
1403 if (retval < 0) 1403 if (retval < 0)
1404 goto out_free; 1404 goto out_free;
1405 current->in_execve = 1;
1405 1406
1406 retval = -ENOMEM; 1407 retval = -ENOMEM;
1407 bprm->cred = prepare_exec_creds(); 1408 bprm->cred = prepare_exec_creds();
1408 if (!bprm->cred) 1409 if (!bprm->cred)
1409 goto out_unlock; 1410 goto out_unlock;
1410 check_unsafe_exec(bprm); 1411 check_unsafe_exec(bprm, current->files);
1411 1412
1412 file = open_exec(filename); 1413 file = open_exec(filename);
1413 retval = PTR_ERR(file); 1414 retval = PTR_ERR(file);
@@ -1454,6 +1455,7 @@ int compat_do_execve(char * filename,
1454 goto out; 1455 goto out;
1455 1456
1456 /* execve succeeded */ 1457 /* execve succeeded */
1458 current->in_execve = 0;
1457 mutex_unlock(&current->cred_exec_mutex); 1459 mutex_unlock(&current->cred_exec_mutex);
1458 acct_update_integrals(current); 1460 acct_update_integrals(current);
1459 free_bprm(bprm); 1461 free_bprm(bprm);
@@ -1470,6 +1472,7 @@ out_file:
1470 } 1472 }
1471 1473
1472out_unlock: 1474out_unlock:
1475 current->in_execve = 0;
1473 mutex_unlock(&current->cred_exec_mutex); 1476 mutex_unlock(&current->cred_exec_mutex);
1474 1477
1475out_free: 1478out_free:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5235c67e7594..ff786687e93b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -522,6 +522,11 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
522 if (err) 522 if (err)
523 return -EFAULT; 523 return -EFAULT;
524 break; 524 break;
525 case SIOCSHWTSTAMP:
526 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
527 return -EFAULT;
528 ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
529 break;
525 default: 530 default:
526 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32))) 531 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
527 return -EFAULT; 532 return -EFAULT;
@@ -538,6 +543,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
538 * cannot be fixed without breaking all existing apps. 543 * cannot be fixed without breaking all existing apps.
539 */ 544 */
540 case TUNSETIFF: 545 case TUNSETIFF:
546 case TUNGETIFF:
541 case SIOCGIFFLAGS: 547 case SIOCGIFFLAGS:
542 case SIOCGIFMETRIC: 548 case SIOCGIFMETRIC:
543 case SIOCGIFMTU: 549 case SIOCGIFMTU:
@@ -784,7 +790,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
784 790
785 if (copy_in_user(&sgio->status, &sgio32->status, 791 if (copy_in_user(&sgio->status, &sgio32->status,
786 (4 * sizeof(unsigned char)) + 792 (4 * sizeof(unsigned char)) +
787 (2 * sizeof(unsigned (short))) + 793 (2 * sizeof(unsigned short)) +
788 (3 * sizeof(int)))) 794 (3 * sizeof(int))))
789 return -EFAULT; 795 return -EFAULT;
790 796
@@ -1912,6 +1918,9 @@ COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
1912/* 0x00 */ 1918/* 0x00 */
1913COMPATIBLE_IOCTL(FIBMAP) 1919COMPATIBLE_IOCTL(FIBMAP)
1914COMPATIBLE_IOCTL(FIGETBSZ) 1920COMPATIBLE_IOCTL(FIGETBSZ)
1921/* 'X' - originally XFS but some now in the VFS */
1922COMPATIBLE_IOCTL(FIFREEZE)
1923COMPATIBLE_IOCTL(FITHAW)
1915/* RAID */ 1924/* RAID */
1916COMPATIBLE_IOCTL(RAID_VERSION) 1925COMPATIBLE_IOCTL(RAID_VERSION)
1917COMPATIBLE_IOCTL(GET_ARRAY_INFO) 1926COMPATIBLE_IOCTL(GET_ARRAY_INFO)
@@ -1937,6 +1946,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
1937/* Big K */ 1946/* Big K */
1938COMPATIBLE_IOCTL(PIO_FONT) 1947COMPATIBLE_IOCTL(PIO_FONT)
1939COMPATIBLE_IOCTL(GIO_FONT) 1948COMPATIBLE_IOCTL(GIO_FONT)
1949COMPATIBLE_IOCTL(PIO_CMAP)
1950COMPATIBLE_IOCTL(GIO_CMAP)
1940ULONG_IOCTL(KDSIGACCEPT) 1951ULONG_IOCTL(KDSIGACCEPT)
1941COMPATIBLE_IOCTL(KDGETKEYCODE) 1952COMPATIBLE_IOCTL(KDGETKEYCODE)
1942COMPATIBLE_IOCTL(KDSETKEYCODE) 1953COMPATIBLE_IOCTL(KDSETKEYCODE)
@@ -1982,6 +1993,13 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM)
1982COMPATIBLE_IOCTL(TUNSETDEBUG) 1993COMPATIBLE_IOCTL(TUNSETDEBUG)
1983COMPATIBLE_IOCTL(TUNSETPERSIST) 1994COMPATIBLE_IOCTL(TUNSETPERSIST)
1984COMPATIBLE_IOCTL(TUNSETOWNER) 1995COMPATIBLE_IOCTL(TUNSETOWNER)
1996COMPATIBLE_IOCTL(TUNSETLINK)
1997COMPATIBLE_IOCTL(TUNSETGROUP)
1998COMPATIBLE_IOCTL(TUNGETFEATURES)
1999COMPATIBLE_IOCTL(TUNSETOFFLOAD)
2000COMPATIBLE_IOCTL(TUNSETTXFILTER)
2001COMPATIBLE_IOCTL(TUNGETSNDBUF)
2002COMPATIBLE_IOCTL(TUNSETSNDBUF)
1985/* Big V */ 2003/* Big V */
1986COMPATIBLE_IOCTL(VT_SETMODE) 2004COMPATIBLE_IOCTL(VT_SETMODE)
1987COMPATIBLE_IOCTL(VT_GETMODE) 2005COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2555,6 +2573,7 @@ HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
2555HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc) 2573HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
2556HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc) 2574HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
2557HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc) 2575HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
2576HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
2558 2577
2559/* ioctls used by appletalk ddp.c */ 2578/* ioctls used by appletalk ddp.c */
2560HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc) 2579HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
@@ -2573,6 +2592,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
2573HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc) 2592HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
2574HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc) 2593HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
2575HANDLE_IOCTL(TUNSETIFF, dev_ifsioc) 2594HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
2595HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
2576HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl) 2596HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
2577HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl) 2597HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
2578HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl) 2598HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
new file mode 100644
index 000000000000..13587cc97a0b
--- /dev/null
+++ b/fs/configfs/Kconfig
@@ -0,0 +1,11 @@
1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS
4 help
5 configfs is a ram-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items.
9
10 Both sysfs and configfs can and should exist together on the
11 same system. One is not a replacement for the other.
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
new file mode 100644
index 000000000000..cd06466f365e
--- /dev/null
+++ b/fs/cramfs/Kconfig
@@ -0,0 +1,19 @@
1config CRAMFS
2 tristate "Compressed ROM file system support (cramfs)"
3 depends on BLOCK
4 select ZLIB_INFLATE
5 help
6 Saying Y here includes support for CramFs (Compressed ROM File
7 System). CramFs is designed to be a simple, small, and compressed
8 file system for ROM based embedded systems. CramFs is read-only,
9 limited to 256MB file systems (with 16MB files), and doesn't support
10 16/32 bits uid/gid, hard links and timestamps.
11
12 See <file:Documentation/filesystems/cramfs.txt> and
13 <file:fs/cramfs/README> for further information.
14
15 To compile this as a module, choose M here: the module will be called
16 cramfs. Note that the root file system (the one containing the
17 directory /) cannot be compiled as a module.
18
19 If unsure, say N.
diff --git a/fs/dcache.c b/fs/dcache.c
index 937df0fb0da5..07e2d4a44bda 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1180 iput(inode); 1180 iput(inode);
1181 return res; 1181 return res;
1182} 1182}
1183EXPORT_SYMBOL_GPL(d_obtain_alias); 1183EXPORT_SYMBOL(d_obtain_alias);
1184 1184
1185/** 1185/**
1186 * d_splice_alias - splice a disconnected dentry into the tree if one exists 1186 * d_splice_alias - splice a disconnected dentry into the tree if one exists
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b9633f..bff4052b05e7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
198 198
199 fsi->ptmx_dentry = dentry; 199 fsi->ptmx_dentry = dentry;
200 rc = 0; 200 rc = 0;
201
202 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
203 inode->i_ino);
204out: 201out:
205 mutex_unlock(&root->d_inode->i_mutex); 202 mutex_unlock(&root->d_inode->i_mutex);
206 return rc; 203 return rc;
@@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags,
369 struct pts_fs_info *fsi; 366 struct pts_fs_info *fsi;
370 struct pts_mount_opts *opts; 367 struct pts_mount_opts *opts;
371 368
372 printk(KERN_NOTICE "devpts: newinstance mount\n");
373
374 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt); 369 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
375 if (err) 370 if (err)
376 return err; 371 return err;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index eba87ff3177b..894a32d438d5 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -168,7 +168,7 @@ static int dlm_plock_callback(struct plock_op *op)
168 notify = xop->callback; 168 notify = xop->callback;
169 169
170 if (op->info.rv) { 170 if (op->info.rv) {
171 notify(flc, NULL, op->info.rv); 171 notify(fl, NULL, op->info.rv);
172 goto out; 172 goto out;
173 } 173 }
174 174
@@ -187,7 +187,7 @@ static int dlm_plock_callback(struct plock_op *op)
187 (unsigned long long)op->info.number, file, fl); 187 (unsigned long long)op->info.number, file, fl);
188 } 188 }
189 189
190 rv = notify(flc, NULL, 0); 190 rv = notify(fl, NULL, 0);
191 if (rv) { 191 if (rv) {
192 /* XXX: We need to cancel the fs lock here: */ 192 /* XXX: We need to cancel the fs lock here: */
193 log_print("dlm_plock_callback: lock granted after lock request " 193 log_print("dlm_plock_callback: lock granted after lock request "
@@ -304,7 +304,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
304 if (rv == -ENOENT) 304 if (rv == -ENOENT)
305 rv = 0; 305 rv = 0;
306 else if (rv > 0) { 306 else if (rv > 0) {
307 locks_init_lock(fl);
307 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; 308 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
309 fl->fl_flags = FL_POSIX;
308 fl->fl_pid = op->info.pid; 310 fl->fl_pid = op->info.pid;
309 fl->fl_start = op->info.start; 311 fl->fl_start = op->info.start;
310 fl->fl_end = op->info.end; 312 fl->fl_end = op->info.end;
diff --git a/fs/dquot.c b/fs/dquot.c
index 48c0571f831d..d6add0bf5ad3 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -87,14 +87,17 @@
87#define __DQUOT_PARANOIA 87#define __DQUOT_PARANOIA
88 88
89/* 89/*
90 * There are two quota SMP locks. dq_list_lock protects all lists with quotas 90 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
91 * and quota formats and also dqstats structure containing statistics about the 91 * and quota formats, dqstats structure containing statistics about the lists
92 * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures 92 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
93 * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 93 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
94 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 94 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
95 * in inode_add_bytes() and inode_sub_bytes(). 95 * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
96 * modifications of quota state (on quotaon and quotaoff) and readers who care
97 * about latest values take it as well.
96 * 98 *
97 * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock 99 * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
100 * dq_list_lock > dq_state_lock
98 * 101 *
99 * Note that some things (eg. sb pointer, type, id) doesn't change during 102 * Note that some things (eg. sb pointer, type, id) doesn't change during
100 * the life of the dquot structure and so needn't to be protected by a lock 103 * the life of the dquot structure and so needn't to be protected by a lock
@@ -103,12 +106,7 @@
103 * operation is just reading pointers from inode (or not using them at all) the 106 * operation is just reading pointers from inode (or not using them at all) the
104 * read lock is enough. If pointers are altered function must hold write lock 107 * read lock is enough. If pointers are altered function must hold write lock
105 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 108 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
106 * for altering the flag i_mutex is also needed). If operation is holding 109 * for altering the flag i_mutex is also needed).
107 * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
108 * dqonoff_mutex.
109 * This locking assures that:
110 * a) update/access to dquot pointers in inode is serialized
111 * b) everyone is guarded against invalidate_dquots()
112 * 110 *
113 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced 111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
114 * from inodes (dquot_alloc_space() and such don't check the dq_lock). 112 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -122,10 +120,17 @@
122 * Lock ordering (including related VFS locks) is the following: 120 * Lock ordering (including related VFS locks) is the following:
123 * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > 121 * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
124 * dqio_mutex 122 * dqio_mutex
123 * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
124 * dqptr_sem. But filesystem has to count with the fact that functions such as
125 * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
126 * from inside a transaction to keep filesystem consistency after a crash. Also
127 * filesystems usually want to do some IO on dquot from ->mark_dirty which is
128 * called with dqptr_sem held.
125 * i_mutex on quota files is special (it's below dqio_mutex) 129 * i_mutex on quota files is special (it's below dqio_mutex)
126 */ 130 */
127 131
128static DEFINE_SPINLOCK(dq_list_lock); 132static DEFINE_SPINLOCK(dq_list_lock);
133static DEFINE_SPINLOCK(dq_state_lock);
129DEFINE_SPINLOCK(dq_data_lock); 134DEFINE_SPINLOCK(dq_data_lock);
130 135
131static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
@@ -428,7 +433,7 @@ static inline void do_destroy_dquot(struct dquot *dquot)
428 * quota is disabled and pointers from inodes removed so there cannot be new 433 * quota is disabled and pointers from inodes removed so there cannot be new
429 * quota users. There can still be some users of quotas due to inodes being 434 * quota users. There can still be some users of quotas due to inodes being
430 * just deleted or pruned by prune_icache() (those are not attached to any 435 * just deleted or pruned by prune_icache() (those are not attached to any
431 * list). We have to wait for such users. 436 * list) or parallel quotactl call. We have to wait for such users.
432 */ 437 */
433static void invalidate_dquots(struct super_block *sb, int type) 438static void invalidate_dquots(struct super_block *sb, int type)
434{ 439{
@@ -600,7 +605,6 @@ static struct shrinker dqcache_shrinker = {
600/* 605/*
601 * Put reference to dquot 606 * Put reference to dquot
602 * NOTE: If you change this function please check whether dqput_blocks() works right... 607 * NOTE: If you change this function please check whether dqput_blocks() works right...
603 * MUST be called with either dqptr_sem or dqonoff_mutex held
604 */ 608 */
605void dqput(struct dquot *dquot) 609void dqput(struct dquot *dquot)
606{ 610{
@@ -697,36 +701,30 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
697} 701}
698 702
699/* 703/*
700 * Check whether dquot is in memory.
701 * MUST be called with either dqptr_sem or dqonoff_mutex held
702 */
703int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
704{
705 unsigned int hashent = hashfn(sb, id, type);
706 int ret = 0;
707
708 if (!sb_has_quota_active(sb, type))
709 return 0;
710 spin_lock(&dq_list_lock);
711 if (find_dquot(hashent, sb, id, type) != NODQUOT)
712 ret = 1;
713 spin_unlock(&dq_list_lock);
714 return ret;
715}
716
717/*
718 * Get reference to dquot 704 * Get reference to dquot
719 * MUST be called with either dqptr_sem or dqonoff_mutex held 705 *
706 * Locking is slightly tricky here. We are guarded from parallel quotaoff()
707 * destroying our dquot by:
708 * a) checking for quota flags under dq_list_lock and
709 * b) getting a reference to dquot before we release dq_list_lock
720 */ 710 */
721struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 711struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
722{ 712{
723 unsigned int hashent = hashfn(sb, id, type); 713 unsigned int hashent = hashfn(sb, id, type);
724 struct dquot *dquot, *empty = NODQUOT; 714 struct dquot *dquot = NODQUOT, *empty = NODQUOT;
725 715
726 if (!sb_has_quota_active(sb, type)) 716 if (!sb_has_quota_active(sb, type))
727 return NODQUOT; 717 return NODQUOT;
728we_slept: 718we_slept:
729 spin_lock(&dq_list_lock); 719 spin_lock(&dq_list_lock);
720 spin_lock(&dq_state_lock);
721 if (!sb_has_quota_active(sb, type)) {
722 spin_unlock(&dq_state_lock);
723 spin_unlock(&dq_list_lock);
724 goto out;
725 }
726 spin_unlock(&dq_state_lock);
727
730 if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { 728 if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
731 if (empty == NODQUOT) { 729 if (empty == NODQUOT) {
732 spin_unlock(&dq_list_lock); 730 spin_unlock(&dq_list_lock);
@@ -735,6 +733,7 @@ we_slept:
735 goto we_slept; 733 goto we_slept;
736 } 734 }
737 dquot = empty; 735 dquot = empty;
736 empty = NODQUOT;
738 dquot->dq_id = id; 737 dquot->dq_id = id;
739 /* all dquots go on the inuse_list */ 738 /* all dquots go on the inuse_list */
740 put_inuse(dquot); 739 put_inuse(dquot);
@@ -749,8 +748,6 @@ we_slept:
749 dqstats.cache_hits++; 748 dqstats.cache_hits++;
750 dqstats.lookups++; 749 dqstats.lookups++;
751 spin_unlock(&dq_list_lock); 750 spin_unlock(&dq_list_lock);
752 if (empty)
753 do_destroy_dquot(empty);
754 } 751 }
755 /* Wait for dq_lock - after this we know that either dquot_release() is already 752 /* Wait for dq_lock - after this we know that either dquot_release() is already
756 * finished or it will be canceled due to dq_count > 1 test */ 753 * finished or it will be canceled due to dq_count > 1 test */
@@ -758,11 +755,15 @@ we_slept:
758 /* Read the dquot and instantiate it (everything done only if needed) */ 755 /* Read the dquot and instantiate it (everything done only if needed) */
759 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) { 756 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
760 dqput(dquot); 757 dqput(dquot);
761 return NODQUOT; 758 dquot = NODQUOT;
759 goto out;
762 } 760 }
763#ifdef __DQUOT_PARANOIA 761#ifdef __DQUOT_PARANOIA
764 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 762 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
765#endif 763#endif
764out:
765 if (empty)
766 do_destroy_dquot(empty);
766 767
767 return dquot; 768 return dquot;
768} 769}
@@ -1056,10 +1057,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
1056 goto attr_err_out; 1057 goto attr_err_out;
1057 genlmsg_end(skb, msg_head); 1058 genlmsg_end(skb, msg_head);
1058 1059
1059 ret = genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); 1060 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
1060 if (ret < 0 && ret != -ESRCH)
1061 printk(KERN_ERR
1062 "VFS: Failed to send notification message: %d\n", ret);
1063 return; 1061 return;
1064attr_err_out: 1062attr_err_out:
1065 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); 1063 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -1198,63 +1196,76 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1198} 1196}
1199/* 1197/*
1200 * Initialize quota pointers in inode 1198 * Initialize quota pointers in inode
1201 * Transaction must be started at entry 1199 * We do things in a bit complicated way but by that we avoid calling
1200 * dqget() and thus filesystem callbacks under dqptr_sem.
1202 */ 1201 */
1203int dquot_initialize(struct inode *inode, int type) 1202int dquot_initialize(struct inode *inode, int type)
1204{ 1203{
1205 unsigned int id = 0; 1204 unsigned int id = 0;
1206 int cnt, ret = 0; 1205 int cnt, ret = 0;
1206 struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
1207 struct super_block *sb = inode->i_sb;
1207 1208
1208 /* First test before acquiring mutex - solves deadlocks when we 1209 /* First test before acquiring mutex - solves deadlocks when we
1209 * re-enter the quota code and are already holding the mutex */ 1210 * re-enter the quota code and are already holding the mutex */
1210 if (IS_NOQUOTA(inode)) 1211 if (IS_NOQUOTA(inode))
1211 return 0; 1212 return 0;
1212 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1213
1214 /* First get references to structures we might need. */
1215 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1216 if (type != -1 && cnt != type)
1217 continue;
1218 switch (cnt) {
1219 case USRQUOTA:
1220 id = inode->i_uid;
1221 break;
1222 case GRPQUOTA:
1223 id = inode->i_gid;
1224 break;
1225 }
1226 got[cnt] = dqget(sb, id, cnt);
1227 }
1228
1229 down_write(&sb_dqopt(sb)->dqptr_sem);
1213 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ 1230 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
1214 if (IS_NOQUOTA(inode)) 1231 if (IS_NOQUOTA(inode))
1215 goto out_err; 1232 goto out_err;
1216 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1233 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1217 if (type != -1 && cnt != type) 1234 if (type != -1 && cnt != type)
1218 continue; 1235 continue;
1236 /* Avoid races with quotaoff() */
1237 if (!sb_has_quota_active(sb, cnt))
1238 continue;
1219 if (inode->i_dquot[cnt] == NODQUOT) { 1239 if (inode->i_dquot[cnt] == NODQUOT) {
1220 switch (cnt) { 1240 inode->i_dquot[cnt] = got[cnt];
1221 case USRQUOTA: 1241 got[cnt] = NODQUOT;
1222 id = inode->i_uid;
1223 break;
1224 case GRPQUOTA:
1225 id = inode->i_gid;
1226 break;
1227 }
1228 inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt);
1229 } 1242 }
1230 } 1243 }
1231out_err: 1244out_err:
1232 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1245 up_write(&sb_dqopt(sb)->dqptr_sem);
1246 /* Drop unused references */
1247 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1248 dqput(got[cnt]);
1233 return ret; 1249 return ret;
1234} 1250}
1235 1251
1236/* 1252/*
1237 * Release all quotas referenced by inode 1253 * Release all quotas referenced by inode
1238 * Transaction must be started at an entry
1239 */ 1254 */
1240int dquot_drop_locked(struct inode *inode) 1255int dquot_drop(struct inode *inode)
1241{ 1256{
1242 int cnt; 1257 int cnt;
1258 struct dquot *put[MAXQUOTAS];
1243 1259
1260 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1244 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1261 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1245 if (inode->i_dquot[cnt] != NODQUOT) { 1262 put[cnt] = inode->i_dquot[cnt];
1246 dqput(inode->i_dquot[cnt]); 1263 inode->i_dquot[cnt] = NODQUOT;
1247 inode->i_dquot[cnt] = NODQUOT;
1248 }
1249 } 1264 }
1250 return 0;
1251}
1252
1253int dquot_drop(struct inode *inode)
1254{
1255 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1256 dquot_drop_locked(inode);
1257 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1265 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1266
1267 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1268 dqput(put[cnt]);
1258 return 0; 1269 return 0;
1259} 1270}
1260 1271
@@ -1470,8 +1481,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1470 qsize_t space; 1481 qsize_t space;
1471 struct dquot *transfer_from[MAXQUOTAS]; 1482 struct dquot *transfer_from[MAXQUOTAS];
1472 struct dquot *transfer_to[MAXQUOTAS]; 1483 struct dquot *transfer_to[MAXQUOTAS];
1473 int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, 1484 int cnt, ret = QUOTA_OK;
1474 chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; 1485 int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
1486 chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
1475 char warntype_to[MAXQUOTAS]; 1487 char warntype_to[MAXQUOTAS];
1476 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1488 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1477 1489
@@ -1479,21 +1491,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1479 * re-enter the quota code and are already holding the mutex */ 1491 * re-enter the quota code and are already holding the mutex */
1480 if (IS_NOQUOTA(inode)) 1492 if (IS_NOQUOTA(inode))
1481 return QUOTA_OK; 1493 return QUOTA_OK;
1482 /* Clear the arrays */ 1494 /* Initialize the arrays */
1483 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1495 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1484 transfer_to[cnt] = transfer_from[cnt] = NODQUOT; 1496 transfer_from[cnt] = NODQUOT;
1497 transfer_to[cnt] = NODQUOT;
1485 warntype_to[cnt] = QUOTA_NL_NOWARN; 1498 warntype_to[cnt] = QUOTA_NL_NOWARN;
1486 }
1487 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1488 /* Now recheck reliably when holding dqptr_sem */
1489 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1490 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1491 return QUOTA_OK;
1492 }
1493 /* First build the transfer_to list - here we can block on
1494 * reading/instantiating of dquots. We know that the transaction for
1495 * us was already started so we don't violate lock ranking here */
1496 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1497 switch (cnt) { 1499 switch (cnt) {
1498 case USRQUOTA: 1500 case USRQUOTA:
1499 if (!chuid) 1501 if (!chuid)
@@ -1507,6 +1509,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1507 break; 1509 break;
1508 } 1510 }
1509 } 1511 }
1512
1513 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1514 /* Now recheck reliably when holding dqptr_sem */
1515 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1516 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1517 goto put_all;
1518 }
1510 spin_lock(&dq_data_lock); 1519 spin_lock(&dq_data_lock);
1511 space = inode_get_bytes(inode); 1520 space = inode_get_bytes(inode);
1512 /* Build the transfer_from list and check the limits */ 1521 /* Build the transfer_from list and check the limits */
@@ -1517,7 +1526,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1517 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1526 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
1518 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, 1527 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
1519 warntype_to + cnt) == NO_QUOTA) 1528 warntype_to + cnt) == NO_QUOTA)
1520 goto warn_put_all; 1529 goto over_quota;
1521 } 1530 }
1522 1531
1523 /* 1532 /*
@@ -1545,28 +1554,37 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1545 1554
1546 inode->i_dquot[cnt] = transfer_to[cnt]; 1555 inode->i_dquot[cnt] = transfer_to[cnt];
1547 } 1556 }
1548 ret = QUOTA_OK;
1549warn_put_all:
1550 spin_unlock(&dq_data_lock); 1557 spin_unlock(&dq_data_lock);
1558 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1559
1551 /* Dirtify all the dquots - this can block when journalling */ 1560 /* Dirtify all the dquots - this can block when journalling */
1552 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1561 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1553 if (transfer_from[cnt]) 1562 if (transfer_from[cnt])
1554 mark_dquot_dirty(transfer_from[cnt]); 1563 mark_dquot_dirty(transfer_from[cnt]);
1555 if (transfer_to[cnt]) 1564 if (transfer_to[cnt]) {
1556 mark_dquot_dirty(transfer_to[cnt]); 1565 mark_dquot_dirty(transfer_to[cnt]);
1566 /* The reference we got is transferred to the inode */
1567 transfer_to[cnt] = NODQUOT;
1568 }
1557 } 1569 }
1570warn_put_all:
1558 flush_warnings(transfer_to, warntype_to); 1571 flush_warnings(transfer_to, warntype_to);
1559 flush_warnings(transfer_from, warntype_from_inodes); 1572 flush_warnings(transfer_from, warntype_from_inodes);
1560 flush_warnings(transfer_from, warntype_from_space); 1573 flush_warnings(transfer_from, warntype_from_space);
1561 1574put_all:
1562 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1575 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1563 if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) 1576 dqput(transfer_from[cnt]);
1564 dqput(transfer_from[cnt]); 1577 dqput(transfer_to[cnt]);
1565 if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
1566 dqput(transfer_to[cnt]);
1567 } 1578 }
1568 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1569 return ret; 1579 return ret;
1580over_quota:
1581 spin_unlock(&dq_data_lock);
1582 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1583 /* Clear dquot pointers we don't want to dqput() */
1584 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1585 transfer_from[cnt] = NODQUOT;
1586 ret = NO_QUOTA;
1587 goto warn_put_all;
1570} 1588}
1571 1589
1572/* Wrapper for transferring ownership of an inode */ 1590/* Wrapper for transferring ownership of an inode */
@@ -1651,19 +1669,24 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1651 continue; 1669 continue;
1652 1670
1653 if (flags & DQUOT_SUSPENDED) { 1671 if (flags & DQUOT_SUSPENDED) {
1672 spin_lock(&dq_state_lock);
1654 dqopt->flags |= 1673 dqopt->flags |=
1655 dquot_state_flag(DQUOT_SUSPENDED, cnt); 1674 dquot_state_flag(DQUOT_SUSPENDED, cnt);
1675 spin_unlock(&dq_state_lock);
1656 } else { 1676 } else {
1677 spin_lock(&dq_state_lock);
1657 dqopt->flags &= ~dquot_state_flag(flags, cnt); 1678 dqopt->flags &= ~dquot_state_flag(flags, cnt);
1658 /* Turning off suspended quotas? */ 1679 /* Turning off suspended quotas? */
1659 if (!sb_has_quota_loaded(sb, cnt) && 1680 if (!sb_has_quota_loaded(sb, cnt) &&
1660 sb_has_quota_suspended(sb, cnt)) { 1681 sb_has_quota_suspended(sb, cnt)) {
1661 dqopt->flags &= ~dquot_state_flag( 1682 dqopt->flags &= ~dquot_state_flag(
1662 DQUOT_SUSPENDED, cnt); 1683 DQUOT_SUSPENDED, cnt);
1684 spin_unlock(&dq_state_lock);
1663 iput(dqopt->files[cnt]); 1685 iput(dqopt->files[cnt]);
1664 dqopt->files[cnt] = NULL; 1686 dqopt->files[cnt] = NULL;
1665 continue; 1687 continue;
1666 } 1688 }
1689 spin_unlock(&dq_state_lock);
1667 } 1690 }
1668 1691
1669 /* We still have to keep quota loaded? */ 1692 /* We still have to keep quota loaded? */
@@ -1830,7 +1853,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1830 } 1853 }
1831 mutex_unlock(&dqopt->dqio_mutex); 1854 mutex_unlock(&dqopt->dqio_mutex);
1832 mutex_unlock(&inode->i_mutex); 1855 mutex_unlock(&inode->i_mutex);
1856 spin_lock(&dq_state_lock);
1833 dqopt->flags |= dquot_state_flag(flags, type); 1857 dqopt->flags |= dquot_state_flag(flags, type);
1858 spin_unlock(&dq_state_lock);
1834 1859
1835 add_dquot_ref(sb, type); 1860 add_dquot_ref(sb, type);
1836 mutex_unlock(&dqopt->dqonoff_mutex); 1861 mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1872,9 +1897,11 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
1872 } 1897 }
1873 inode = dqopt->files[type]; 1898 inode = dqopt->files[type];
1874 dqopt->files[type] = NULL; 1899 dqopt->files[type] = NULL;
1900 spin_lock(&dq_state_lock);
1875 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | 1901 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
1876 DQUOT_LIMITS_ENABLED, type); 1902 DQUOT_LIMITS_ENABLED, type);
1877 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); 1903 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
1904 spin_unlock(&dq_state_lock);
1878 mutex_unlock(&dqopt->dqonoff_mutex); 1905 mutex_unlock(&dqopt->dqonoff_mutex);
1879 1906
1880 flags = dquot_generic_flag(flags, type); 1907 flags = dquot_generic_flag(flags, type);
@@ -1952,7 +1979,9 @@ int vfs_quota_enable(struct inode *inode, int type, int format_id,
1952 ret = -EBUSY; 1979 ret = -EBUSY;
1953 goto out_lock; 1980 goto out_lock;
1954 } 1981 }
1982 spin_lock(&dq_state_lock);
1955 sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); 1983 sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
1984 spin_unlock(&dq_state_lock);
1956out_lock: 1985out_lock:
1957 mutex_unlock(&dqopt->dqonoff_mutex); 1986 mutex_unlock(&dqopt->dqonoff_mutex);
1958 return ret; 1987 return ret;
@@ -2039,14 +2068,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
2039{ 2068{
2040 struct dquot *dquot; 2069 struct dquot *dquot;
2041 2070
2042 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2071 dquot = dqget(sb, id, type);
2043 if (!(dquot = dqget(sb, id, type))) { 2072 if (dquot == NODQUOT)
2044 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2045 return -ESRCH; 2073 return -ESRCH;
2046 }
2047 do_get_dqblk(dquot, di); 2074 do_get_dqblk(dquot, di);
2048 dqput(dquot); 2075 dqput(dquot);
2049 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2076
2050 return 0; 2077 return 0;
2051} 2078}
2052 2079
@@ -2130,7 +2157,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
2130 struct dquot *dquot; 2157 struct dquot *dquot;
2131 int rc; 2158 int rc;
2132 2159
2133 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
2134 dquot = dqget(sb, id, type); 2160 dquot = dqget(sb, id, type);
2135 if (!dquot) { 2161 if (!dquot) {
2136 rc = -ESRCH; 2162 rc = -ESRCH;
@@ -2139,7 +2165,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
2139 rc = do_set_dqblk(dquot, di); 2165 rc = do_set_dqblk(dquot, di);
2140 dqput(dquot); 2166 dqput(dquot);
2141out: 2167out:
2142 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2143 return rc; 2168 return rc;
2144} 2169}
2145 2170
@@ -2370,11 +2395,9 @@ EXPORT_SYMBOL(dquot_release);
2370EXPORT_SYMBOL(dquot_mark_dquot_dirty); 2395EXPORT_SYMBOL(dquot_mark_dquot_dirty);
2371EXPORT_SYMBOL(dquot_initialize); 2396EXPORT_SYMBOL(dquot_initialize);
2372EXPORT_SYMBOL(dquot_drop); 2397EXPORT_SYMBOL(dquot_drop);
2373EXPORT_SYMBOL(dquot_drop_locked);
2374EXPORT_SYMBOL(vfs_dq_drop); 2398EXPORT_SYMBOL(vfs_dq_drop);
2375EXPORT_SYMBOL(dqget); 2399EXPORT_SYMBOL(dqget);
2376EXPORT_SYMBOL(dqput); 2400EXPORT_SYMBOL(dqput);
2377EXPORT_SYMBOL(dquot_is_cached);
2378EXPORT_SYMBOL(dquot_alloc_space); 2401EXPORT_SYMBOL(dquot_alloc_space);
2379EXPORT_SYMBOL(dquot_alloc_inode); 2402EXPORT_SYMBOL(dquot_alloc_inode);
2380EXPORT_SYMBOL(dquot_free_space); 2403EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
new file mode 100644
index 000000000000..0c754e64232b
--- /dev/null
+++ b/fs/ecryptfs/Kconfig
@@ -0,0 +1,11 @@
1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO && NET
4 help
5 Encrypted filesystem that operates on the VFS layer. See
6 <file:Documentation/filesystems/ecryptfs.txt> to learn more about
7 eCryptfs. Userspace components are required and can be
8 obtained from <http://ecryptfs.sf.net>.
9
10 To compile this file system support as a module, choose M here: the
11 module will be called ecryptfs.
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c01e043670e2..8b65f289ee00 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
946 list_for_each_entry(global_auth_tok, 946 list_for_each_entry(global_auth_tok,
947 &mount_crypt_stat->global_auth_tok_list, 947 &mount_crypt_stat->global_auth_tok_list,
948 mount_crypt_stat_list) { 948 mount_crypt_stat_list) {
949 if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
950 continue;
949 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); 951 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
950 if (rc) { 952 if (rc) {
951 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); 953 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
@@ -1322,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1322} 1324}
1323 1325
1324static int 1326static int
1325ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, 1327ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
1326 struct dentry *ecryptfs_dentry, 1328 char *virt, size_t virt_len)
1327 char *virt)
1328{ 1329{
1329 int rc; 1330 int rc;
1330 1331
1331 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1332 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
1332 0, crypt_stat->num_header_bytes_at_front); 1333 0, virt_len);
1333 if (rc) 1334 if (rc)
1334 printk(KERN_ERR "%s: Error attempting to write header " 1335 printk(KERN_ERR "%s: Error attempting to write header "
1335 "information to lower file; rc = [%d]\n", __func__, 1336 "information to lower file; rc = [%d]\n", __func__,
@@ -1339,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat,
1339 1340
1340static int 1341static int
1341ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, 1342ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
1342 struct ecryptfs_crypt_stat *crypt_stat,
1343 char *page_virt, size_t size) 1343 char *page_virt, size_t size)
1344{ 1344{
1345 int rc; 1345 int rc;
@@ -1349,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
1349 return rc; 1349 return rc;
1350} 1350}
1351 1351
1352static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1353 unsigned int order)
1354{
1355 struct page *page;
1356
1357 page = alloc_pages(gfp_mask | __GFP_ZERO, order);
1358 if (page)
1359 return (unsigned long) page_address(page);
1360 return 0;
1361}
1362
1352/** 1363/**
1353 * ecryptfs_write_metadata 1364 * ecryptfs_write_metadata
1354 * @ecryptfs_dentry: The eCryptfs dentry 1365 * @ecryptfs_dentry: The eCryptfs dentry
@@ -1365,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1365{ 1376{
1366 struct ecryptfs_crypt_stat *crypt_stat = 1377 struct ecryptfs_crypt_stat *crypt_stat =
1367 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 1378 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
1379 unsigned int order;
1368 char *virt; 1380 char *virt;
1381 size_t virt_len;
1369 size_t size = 0; 1382 size_t size = 0;
1370 int rc = 0; 1383 int rc = 0;
1371 1384
@@ -1381,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1394 rc = -EINVAL;
1382 goto out; 1395 goto out;
1383 } 1396 }
1397 virt_len = crypt_stat->num_header_bytes_at_front;
1398 order = get_order(virt_len);
1384 /* Released in this function */ 1399 /* Released in this function */
1385 virt = (char *)get_zeroed_page(GFP_KERNEL); 1400 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
1386 if (!virt) { 1401 if (!virt) {
1387 printk(KERN_ERR "%s: Out of memory\n", __func__); 1402 printk(KERN_ERR "%s: Out of memory\n", __func__);
1388 rc = -ENOMEM; 1403 rc = -ENOMEM;
1389 goto out; 1404 goto out;
1390 } 1405 }
1391 rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, 1406 rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
1392 crypt_stat, ecryptfs_dentry); 1407 ecryptfs_dentry);
1393 if (unlikely(rc)) { 1408 if (unlikely(rc)) {
1394 printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", 1409 printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
1395 __func__, rc); 1410 __func__, rc);
1396 goto out_free; 1411 goto out_free;
1397 } 1412 }
1398 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 1413 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
1399 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, 1414 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
1400 crypt_stat, virt, size); 1415 size);
1401 else 1416 else
1402 rc = ecryptfs_write_metadata_to_contents(crypt_stat, 1417 rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt,
1403 ecryptfs_dentry, virt); 1418 virt_len);
1404 if (rc) { 1419 if (rc) {
1405 printk(KERN_ERR "%s: Error writing metadata out to lower file; " 1420 printk(KERN_ERR "%s: Error writing metadata out to lower file; "
1406 "rc = [%d]\n", __func__, rc); 1421 "rc = [%d]\n", __func__, rc);
1407 goto out_free; 1422 goto out_free;
1408 } 1423 }
1409out_free: 1424out_free:
1410 free_page((unsigned long)virt); 1425 free_pages((unsigned long)virt, order);
1411out: 1426out:
1412 return rc; 1427 return rc;
1413} 1428}
@@ -1716,7 +1731,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1716{ 1731{
1717 int rc = 0; 1732 int rc = 0;
1718 1733
1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); 1734 (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
1720 if (!(*copied_name)) { 1735 if (!(*copied_name)) {
1721 rc = -ENOMEM; 1736 rc = -ENOMEM;
1722 goto out; 1737 goto out;
@@ -1726,7 +1741,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1726 * in printing out the 1741 * in printing out the
1727 * string in debug 1742 * string in debug
1728 * messages */ 1743 * messages */
1729 (*copied_name_size) = (name_size + 1); 1744 (*copied_name_size) = name_size;
1730out: 1745out:
1731 return rc; 1746 return rc;
1732} 1747}
@@ -2206,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2206 struct dentry *ecryptfs_dir_dentry, 2221 struct dentry *ecryptfs_dir_dentry,
2207 const char *name, size_t name_size) 2222 const char *name, size_t name_size)
2208{ 2223{
2224 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2225 &ecryptfs_superblock_to_private(
2226 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2209 char *decoded_name; 2227 char *decoded_name;
2210 size_t decoded_name_size; 2228 size_t decoded_name_size;
2211 size_t packet_size; 2229 size_t packet_size;
2212 int rc = 0; 2230 int rc = 0;
2213 2231
2214 if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) 2232 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
2233 && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
2234 && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
2215 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, 2235 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2216 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { 2236 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
2217 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2218 &ecryptfs_superblock_to_private(
2219 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2220 const char *orig_name = name; 2237 const char *orig_name = name;
2221 size_t orig_name_size = name_size; 2238 size_t orig_name_size = name_size;
2222 2239
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c11fc95714ab..ac749d4d644f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -328,6 +328,7 @@ struct ecryptfs_dentry_info {
328 */ 328 */
329struct ecryptfs_global_auth_tok { 329struct ecryptfs_global_auth_tok {
330#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 330#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
331#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002
331 u32 flags; 332 u32 flags;
332 struct list_head mount_crypt_stat_list; 333 struct list_head mount_crypt_stat_list;
333 struct key *global_auth_tok_key; 334 struct key *global_auth_tok_key;
@@ -619,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
619 u32 flags); 620 u32 flags);
620int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 621int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
621 struct dentry *lower_dentry, 622 struct dentry *lower_dentry,
622 struct ecryptfs_crypt_stat *crypt_stat,
623 struct inode *ecryptfs_dir_inode, 623 struct inode *ecryptfs_dir_inode,
624 struct nameidata *ecryptfs_nd); 624 struct nameidata *ecryptfs_nd);
625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
@@ -696,7 +696,7 @@ ecryptfs_write_header_metadata(char *virt,
696int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); 696int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
697int 697int
698ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 698ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
699 char *sig); 699 char *sig, u32 global_auth_tok_flags);
700int ecryptfs_get_global_auth_tok_for_sig( 700int ecryptfs_get_global_auth_tok_for_sig(
701 struct ecryptfs_global_auth_tok **global_auth_tok, 701 struct ecryptfs_global_auth_tok **global_auth_tok,
702 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); 702 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5697899a168d..55b3145b8072 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -246,7 +246,6 @@ out:
246 */ 246 */
247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
248 struct dentry *lower_dentry, 248 struct dentry *lower_dentry,
249 struct ecryptfs_crypt_stat *crypt_stat,
250 struct inode *ecryptfs_dir_inode, 249 struct inode *ecryptfs_dir_inode,
251 struct nameidata *ecryptfs_nd) 250 struct nameidata *ecryptfs_nd)
252{ 251{
@@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
254 struct vfsmount *lower_mnt; 253 struct vfsmount *lower_mnt;
255 struct inode *lower_inode; 254 struct inode *lower_inode;
256 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 255 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
256 struct ecryptfs_crypt_stat *crypt_stat;
257 char *page_virt = NULL; 257 char *page_virt = NULL;
258 u64 file_size; 258 u64 file_size;
259 int rc = 0; 259 int rc = 0;
@@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
314 goto out_free_kmem; 314 goto out_free_kmem;
315 } 315 }
316 } 316 }
317 crypt_stat = &ecryptfs_inode_to_private(
318 ecryptfs_dentry->d_inode)->crypt_stat;
319 /* TODO: lock for crypt_stat comparison */
320 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
321 ecryptfs_set_default_sizes(crypt_stat);
317 rc = ecryptfs_read_and_validate_header_region(page_virt, 322 rc = ecryptfs_read_and_validate_header_region(page_virt,
318 ecryptfs_dentry->d_inode); 323 ecryptfs_dentry->d_inode);
319 if (rc) { 324 if (rc) {
@@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
362{ 367{
363 char *encrypted_and_encoded_name = NULL; 368 char *encrypted_and_encoded_name = NULL;
364 size_t encrypted_and_encoded_name_size; 369 size_t encrypted_and_encoded_name_size;
365 struct ecryptfs_crypt_stat *crypt_stat = NULL;
366 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 370 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
367 struct ecryptfs_inode_info *inode_info;
368 struct dentry *lower_dir_dentry, *lower_dentry; 371 struct dentry *lower_dir_dentry, *lower_dentry;
369 int rc = 0; 372 int rc = 0;
370 373
@@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
388 } 391 }
389 if (lower_dentry->d_inode) 392 if (lower_dentry->d_inode)
390 goto lookup_and_interpose; 393 goto lookup_and_interpose;
391 inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 394 mount_crypt_stat = &ecryptfs_superblock_to_private(
392 if (inode_info) { 395 ecryptfs_dentry->d_sb)->mount_crypt_stat;
393 crypt_stat = &inode_info->crypt_stat; 396 if (!(mount_crypt_stat
394 /* TODO: lock for crypt_stat comparison */ 397 && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
395 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
396 ecryptfs_set_default_sizes(crypt_stat);
397 }
398 if (crypt_stat)
399 mount_crypt_stat = crypt_stat->mount_crypt_stat;
400 else
401 mount_crypt_stat = &ecryptfs_superblock_to_private(
402 ecryptfs_dentry->d_sb)->mount_crypt_stat;
403 if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
404 && !(mount_crypt_stat && (mount_crypt_stat->flags
405 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
406 goto lookup_and_interpose; 398 goto lookup_and_interpose;
407 dput(lower_dentry); 399 dput(lower_dentry);
408 rc = ecryptfs_encrypt_and_encode_filename( 400 rc = ecryptfs_encrypt_and_encode_filename(
409 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, 401 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
410 crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, 402 NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
411 ecryptfs_dentry->d_name.len); 403 ecryptfs_dentry->d_name.len);
412 if (rc) { 404 if (rc) {
413 printk(KERN_ERR "%s: Error attempting to encrypt and encode " 405 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
@@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
426 } 418 }
427lookup_and_interpose: 419lookup_and_interpose:
428 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 420 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
429 crypt_stat, ecryptfs_dir_inode, 421 ecryptfs_dir_inode,
430 ecryptfs_nd); 422 ecryptfs_nd);
431 goto out; 423 goto out;
432out_d_drop: 424out_d_drop:
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ff539420cc6f..e4a6223c3145 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache;
2375 2375
2376int 2376int
2377ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 2377ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2378 char *sig) 2378 char *sig, u32 global_auth_tok_flags)
2379{ 2379{
2380 struct ecryptfs_global_auth_tok *new_auth_tok; 2380 struct ecryptfs_global_auth_tok *new_auth_tok;
2381 int rc = 0; 2381 int rc = 0;
@@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2389 goto out; 2389 goto out;
2390 } 2390 }
2391 memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); 2391 memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
2392 new_auth_tok->flags = global_auth_tok_flags;
2392 new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; 2393 new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
2393 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 2394 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
2394 list_add(&new_auth_tok->mount_crypt_stat_list, 2395 list_add(&new_auth_tok->mount_crypt_stat_list,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 789cf2e1be1e..aed56c25539b 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
319 case ecryptfs_opt_ecryptfs_sig: 319 case ecryptfs_opt_ecryptfs_sig:
320 sig_src = args[0].from; 320 sig_src = args[0].from;
321 rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, 321 rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
322 sig_src); 322 sig_src, 0);
323 if (rc) { 323 if (rc) {
324 printk(KERN_ERR "Error attempting to register " 324 printk(KERN_ERR "Error attempting to register "
325 "global sig; rc = [%d]\n", rc); 325 "global sig; rc = [%d]\n", rc);
@@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
370 ECRYPTFS_SIG_SIZE_HEX] = '\0'; 370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok( 371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat, 372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig); 373 mount_crypt_stat->global_default_fnek_sig,
374 ECRYPTFS_AUTH_TOK_FNEK);
374 if (rc) { 375 if (rc) {
375 printk(KERN_ERR "Error attempting to register " 376 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n", 377 "global fnek sig [%s]; rc = [%d]\n",
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
new file mode 100644
index 000000000000..6ebfc1c207a8
--- /dev/null
+++ b/fs/efs/Kconfig
@@ -0,0 +1,14 @@
1config EFS_FS
2 tristate "EFS file system support (read only) (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 EFS is an older file system used for non-ISO9660 CD-ROMs and hard
6 disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
7 uses the XFS file system for hard disk partitions however).
8
9 This implementation only offers read-only access. If you don't know
10 what all this is about, it's safe to say N. For more information
11 about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
12
13 To compile the EFS file system support as a module, choose M here: the
14 module will be called efs.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ba2f9ec71192..c5c424f23fd5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
234/* 234/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 235 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 236 */
237/* Maximum number of epoll devices, per user */
238static int max_user_instances __read_mostly;
239/* Maximum number of epoll watched descriptors, per user */ 237/* Maximum number of epoll watched descriptors, per user */
240static int max_user_watches __read_mostly; 238static int max_user_watches __read_mostly;
241 239
@@ -261,14 +259,6 @@ static int zero;
261 259
262ctl_table epoll_table[] = { 260ctl_table epoll_table[] = {
263 { 261 {
264 .procname = "max_user_instances",
265 .data = &max_user_instances,
266 .maxlen = sizeof(int),
267 .mode = 0644,
268 .proc_handler = &proc_dointvec_minmax,
269 .extra1 = &zero,
270 },
271 {
272 .procname = "max_user_watches", 262 .procname = "max_user_watches",
273 .data = &max_user_watches, 263 .data = &max_user_watches,
274 .maxlen = sizeof(int), 264 .maxlen = sizeof(int),
@@ -427,10 +417,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
427 ep_unregister_pollwait(ep, epi); 417 ep_unregister_pollwait(ep, epi);
428 418
429 /* Remove the current item from the list of epoll hooks */ 419 /* Remove the current item from the list of epoll hooks */
430 spin_lock(&file->f_ep_lock); 420 spin_lock(&file->f_lock);
431 if (ep_is_linked(&epi->fllink)) 421 if (ep_is_linked(&epi->fllink))
432 list_del_init(&epi->fllink); 422 list_del_init(&epi->fllink);
433 spin_unlock(&file->f_ep_lock); 423 spin_unlock(&file->f_lock);
434 424
435 rb_erase(&epi->rbn, &ep->rbr); 425 rb_erase(&epi->rbn, &ep->rbr);
436 426
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
491 481
492 mutex_unlock(&epmutex); 482 mutex_unlock(&epmutex);
493 mutex_destroy(&ep->mtx); 483 mutex_destroy(&ep->mtx);
494 atomic_dec(&ep->user->epoll_devs);
495 free_uid(ep->user); 484 free_uid(ep->user);
496 kfree(ep); 485 kfree(ep);
497} 486}
@@ -549,7 +538,7 @@ void eventpoll_release_file(struct file *file)
549 struct epitem *epi; 538 struct epitem *epi;
550 539
551 /* 540 /*
552 * We don't want to get "file->f_ep_lock" because it is not 541 * We don't want to get "file->f_lock" because it is not
553 * necessary. It is not necessary because we're in the "struct file" 542 * necessary. It is not necessary because we're in the "struct file"
554 * cleanup path, and this means that noone is using this file anymore. 543 * cleanup path, and this means that noone is using this file anymore.
555 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
@@ -558,6 +547,8 @@ void eventpoll_release_file(struct file *file)
558 * will correctly serialize the operation. We do need to acquire 547 * will correctly serialize the operation. We do need to acquire
559 * "ep->mtx" after "epmutex" because ep_remove() requires it when called 548 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
560 * from anywhere but ep_free(). 549 * from anywhere but ep_free().
550 *
551 * Besides, ep_remove() acquires the lock, so we can't hold it here.
561 */ 552 */
562 mutex_lock(&epmutex); 553 mutex_lock(&epmutex);
563 554
@@ -581,10 +572,6 @@ static int ep_alloc(struct eventpoll **pep)
581 struct eventpoll *ep; 572 struct eventpoll *ep;
582 573
583 user = get_current_user(); 574 user = get_current_user();
584 error = -EMFILE;
585 if (unlikely(atomic_read(&user->epoll_devs) >=
586 max_user_instances))
587 goto free_uid;
588 error = -ENOMEM; 575 error = -ENOMEM;
589 ep = kzalloc(sizeof(*ep), GFP_KERNEL); 576 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
590 if (unlikely(!ep)) 577 if (unlikely(!ep))
@@ -800,9 +787,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
800 goto error_unregister; 787 goto error_unregister;
801 788
802 /* Add the current item to the list of active epoll hook for this file */ 789 /* Add the current item to the list of active epoll hook for this file */
803 spin_lock(&tfile->f_ep_lock); 790 spin_lock(&tfile->f_lock);
804 list_add_tail(&epi->fllink, &tfile->f_ep_links); 791 list_add_tail(&epi->fllink, &tfile->f_ep_links);
805 spin_unlock(&tfile->f_ep_lock); 792 spin_unlock(&tfile->f_lock);
806 793
807 /* 794 /*
808 * Add the current item to the RB tree. All RB tree operations are 795 * Add the current item to the RB tree. All RB tree operations are
@@ -1141,7 +1128,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1141 flags & O_CLOEXEC); 1128 flags & O_CLOEXEC);
1142 if (fd < 0) 1129 if (fd < 0)
1143 ep_free(ep); 1130 ep_free(ep);
1144 atomic_inc(&ep->user->epoll_devs);
1145 1131
1146error_return: 1132error_return:
1147 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1366,8 +1352,10 @@ static int __init eventpoll_init(void)
1366 struct sysinfo si; 1352 struct sysinfo si;
1367 1353
1368 si_meminfo(&si); 1354 si_meminfo(&si);
1369 max_user_instances = 128; 1355 /*
1370 max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / 1356 * Allows top 4% of lomem to be allocated for epoll watches (per user).
1357 */
1358 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1371 EP_ITEM_COST; 1359 EP_ITEM_COST;
1372 1360
1373 /* Initialize the structure used to perform safe poll wait head wake ups */ 1361 /* Initialize the structure used to perform safe poll wait head wake ups */
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a01f1b4..b9f1c144b7a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -45,6 +45,7 @@
45#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
46#include <linux/mount.h> 46#include <linux/mount.h>
47#include <linux/security.h> 47#include <linux/security.h>
48#include <linux/ima.h>
48#include <linux/syscalls.h> 49#include <linux/syscalls.h>
49#include <linux/tsacct_kern.h> 50#include <linux/tsacct_kern.h>
50#include <linux/cn_proc.h> 51#include <linux/cn_proc.h>
@@ -127,6 +128,9 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
127 MAY_READ | MAY_EXEC | MAY_OPEN); 128 MAY_READ | MAY_EXEC | MAY_OPEN);
128 if (error) 129 if (error)
129 goto exit; 130 goto exit;
131 error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
132 if (error)
133 goto exit;
130 134
131 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); 135 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
132 error = PTR_ERR(file); 136 error = PTR_ERR(file);
@@ -674,6 +678,9 @@ struct file *open_exec(const char *name)
674 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); 678 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
675 if (err) 679 if (err)
676 goto out_path_put; 680 goto out_path_put;
681 err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
682 if (err)
683 goto out_path_put;
677 684
678 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); 685 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
679 if (IS_ERR(file)) 686 if (IS_ERR(file))
@@ -1049,16 +1056,32 @@ EXPORT_SYMBOL(install_exec_creds);
1049 * - the caller must hold current->cred_exec_mutex to protect against 1056 * - the caller must hold current->cred_exec_mutex to protect against
1050 * PTRACE_ATTACH 1057 * PTRACE_ATTACH
1051 */ 1058 */
1052void check_unsafe_exec(struct linux_binprm *bprm) 1059void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
1053{ 1060{
1054 struct task_struct *p = current; 1061 struct task_struct *p = current, *t;
1062 unsigned long flags;
1063 unsigned n_fs, n_files, n_sighand;
1055 1064
1056 bprm->unsafe = tracehook_unsafe_exec(p); 1065 bprm->unsafe = tracehook_unsafe_exec(p);
1057 1066
1058 if (atomic_read(&p->fs->count) > 1 || 1067 n_fs = 1;
1059 atomic_read(&p->files->count) > 1 || 1068 n_files = 1;
1060 atomic_read(&p->sighand->count) > 1) 1069 n_sighand = 1;
1070 lock_task_sighand(p, &flags);
1071 for (t = next_thread(p); t != p; t = next_thread(t)) {
1072 if (t->fs == p->fs)
1073 n_fs++;
1074 if (t->files == files)
1075 n_files++;
1076 n_sighand++;
1077 }
1078
1079 if (atomic_read(&p->fs->count) > n_fs ||
1080 atomic_read(&p->files->count) > n_files ||
1081 atomic_read(&p->sighand->count) > n_sighand)
1061 bprm->unsafe |= LSM_UNSAFE_SHARE; 1082 bprm->unsafe |= LSM_UNSAFE_SHARE;
1083
1084 unlock_task_sighand(p, &flags);
1062} 1085}
1063 1086
1064/* 1087/*
@@ -1168,6 +1191,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1168 retval = security_bprm_check(bprm); 1191 retval = security_bprm_check(bprm);
1169 if (retval) 1192 if (retval)
1170 return retval; 1193 return retval;
1194 retval = ima_bprm_check(bprm);
1195 if (retval)
1196 return retval;
1171 1197
1172 /* kernel module loader fixup */ 1198 /* kernel module loader fixup */
1173 /* so we don't try to load run modprobe in kernel space. */ 1199 /* so we don't try to load run modprobe in kernel space. */
@@ -1268,12 +1294,13 @@ int do_execve(char * filename,
1268 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1294 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
1269 if (retval < 0) 1295 if (retval < 0)
1270 goto out_free; 1296 goto out_free;
1297 current->in_execve = 1;
1271 1298
1272 retval = -ENOMEM; 1299 retval = -ENOMEM;
1273 bprm->cred = prepare_exec_creds(); 1300 bprm->cred = prepare_exec_creds();
1274 if (!bprm->cred) 1301 if (!bprm->cred)
1275 goto out_unlock; 1302 goto out_unlock;
1276 check_unsafe_exec(bprm); 1303 check_unsafe_exec(bprm, displaced);
1277 1304
1278 file = open_exec(filename); 1305 file = open_exec(filename);
1279 retval = PTR_ERR(file); 1306 retval = PTR_ERR(file);
@@ -1321,6 +1348,7 @@ int do_execve(char * filename,
1321 goto out; 1348 goto out;
1322 1349
1323 /* execve succeeded */ 1350 /* execve succeeded */
1351 current->in_execve = 0;
1324 mutex_unlock(&current->cred_exec_mutex); 1352 mutex_unlock(&current->cred_exec_mutex);
1325 acct_update_integrals(current); 1353 acct_update_integrals(current);
1326 free_bprm(bprm); 1354 free_bprm(bprm);
@@ -1339,6 +1367,7 @@ out_file:
1339 } 1367 }
1340 1368
1341out_unlock: 1369out_unlock:
1370 current->in_execve = 0;
1342 mutex_unlock(&current->cred_exec_mutex); 1371 mutex_unlock(&current->cred_exec_mutex);
1343 1372
1344out_free: 1373out_free:
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6d..7c6e3606f0ec 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1185 es = sbi->s_es; 1185 es = sbi->s_es;
1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1187 (old_mount_opt & EXT2_MOUNT_XIP)) && 1187 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1188 invalidate_inodes(sb)) 1188 invalidate_inodes(sb)) {
1189 ext2_warning(sb, __func__, "busy inodes while remounting "\ 1189 ext2_warning(sb, __func__, "refusing change of xip flag "
1190 "xip remain in cache (no functional problem)"); 1190 "with busy inodes while remounting");
1191 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1192 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1193 }
1191 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1194 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1192 return 0; 1195 return 0;
1193 if (*flags & MS_RDONLY) { 1196 if (*flags & MS_RDONLY) {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5fa453b49a64..05e5c2e5c0d7 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1435,6 +1435,10 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1435 return 0; 1435 return 0;
1436} 1436}
1437 1437
1438static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1439{
1440 return !buffer_mapped(bh);
1441}
1438/* 1442/*
1439 * Note that we always start a transaction even if we're not journalling 1443 * Note that we always start a transaction even if we're not journalling
1440 * data. This is to preserve ordering: any hole instantiation within 1444 * data. This is to preserve ordering: any hole instantiation within
@@ -1505,6 +1509,15 @@ static int ext3_ordered_writepage(struct page *page,
1505 if (ext3_journal_current_handle()) 1509 if (ext3_journal_current_handle())
1506 goto out_fail; 1510 goto out_fail;
1507 1511
1512 if (!page_has_buffers(page)) {
1513 create_empty_buffers(page, inode->i_sb->s_blocksize,
1514 (1 << BH_Dirty)|(1 << BH_Uptodate));
1515 } else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
1516 /* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
1517 return block_write_full_page(page, NULL, wbc);
1518 }
1519 page_bufs = page_buffers(page);
1520
1508 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1521 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1509 1522
1510 if (IS_ERR(handle)) { 1523 if (IS_ERR(handle)) {
@@ -1512,11 +1525,6 @@ static int ext3_ordered_writepage(struct page *page,
1512 goto out_fail; 1525 goto out_fail;
1513 } 1526 }
1514 1527
1515 if (!page_has_buffers(page)) {
1516 create_empty_buffers(page, inode->i_sb->s_blocksize,
1517 (1 << BH_Dirty)|(1 << BH_Uptodate));
1518 }
1519 page_bufs = page_buffers(page);
1520 walk_page_buffers(handle, page_bufs, 0, 1528 walk_page_buffers(handle, page_bufs, 0,
1521 PAGE_CACHE_SIZE, NULL, bget_one); 1529 PAGE_CACHE_SIZE, NULL, bget_one);
1522 1530
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 69a3d19ca9fd..4db4ffa1edad 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1358,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1358 struct fake_dirent *fde; 1358 struct fake_dirent *fde;
1359 1359
1360 blocksize = dir->i_sb->s_blocksize; 1360 blocksize = dir->i_sb->s_blocksize;
1361 dxtrace(printk("Creating index\n")); 1361 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1362 retval = ext3_journal_get_write_access(handle, bh); 1362 retval = ext3_journal_get_write_access(handle, bh);
1363 if (retval) { 1363 if (retval) {
1364 ext3_std_error(dir->i_sb, retval); 1364 ext3_std_error(dir->i_sb, retval);
@@ -1367,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1367 } 1367 }
1368 root = (struct dx_root *) bh->b_data; 1368 root = (struct dx_root *) bh->b_data;
1369 1369
1370 /* The 0th block becomes the root, move the dirents out */
1371 fde = &root->dotdot;
1372 de = (struct ext3_dir_entry_2 *)((char *)fde +
1373 ext3_rec_len_from_disk(fde->rec_len));
1374 if ((char *) de >= (((char *) root) + blocksize)) {
1375 ext3_error(dir->i_sb, __func__,
1376 "invalid rec_len for '..' in inode %lu",
1377 dir->i_ino);
1378 brelse(bh);
1379 return -EIO;
1380 }
1381 len = ((char *) root) + blocksize - (char *) de;
1382
1370 bh2 = ext3_append (handle, dir, &block, &retval); 1383 bh2 = ext3_append (handle, dir, &block, &retval);
1371 if (!(bh2)) { 1384 if (!(bh2)) {
1372 brelse(bh); 1385 brelse(bh);
@@ -1375,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1375 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; 1388 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1376 data1 = bh2->b_data; 1389 data1 = bh2->b_data;
1377 1390
1378 /* The 0th block becomes the root, move the dirents out */
1379 fde = &root->dotdot;
1380 de = (struct ext3_dir_entry_2 *)((char *)fde +
1381 ext3_rec_len_from_disk(fde->rec_len));
1382 len = ((char *) root) + blocksize - (char *) de;
1383 memcpy (data1, de, len); 1391 memcpy (data1, de, len);
1384 de = (struct ext3_dir_entry_2 *) data1; 1392 de = (struct ext3_dir_entry_2 *) data1;
1385 top = data1 + len; 1393 top = data1 + len;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e08a3c..4a970411a458 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb)
2428 2428
2429static int ext3_sync_fs(struct super_block *sb, int wait) 2429static int ext3_sync_fs(struct super_block *sb, int wait)
2430{ 2430{
2431 sb->s_dirt = 0; 2431 tid_t target;
2432 if (wait)
2433 ext3_force_commit(sb);
2434 else
2435 journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
2436 2432
2433 sb->s_dirt = 0;
2434 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2435 if (wait)
2436 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2437 }
2437 return 0; 2438 return 0;
2438} 2439}
2439 2440
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6bba06b09dd1..de9459b4cb94 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
609 */ 609 */
610int ext4_should_retry_alloc(struct super_block *sb, int *retries) 610int ext4_should_retry_alloc(struct super_block *sb, int *retries)
611{ 611{
612 if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) 612 if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
613 (*retries)++ > 3 ||
614 !EXT4_SB(sb)->s_journal)
613 return 0; 615 return 0;
614 616
615 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 617 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -684,15 +686,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
684 gdp = ext4_get_group_desc(sb, i, NULL); 686 gdp = ext4_get_group_desc(sb, i, NULL);
685 if (!gdp) 687 if (!gdp)
686 continue; 688 continue;
687 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 689 desc_count += ext4_free_blks_count(sb, gdp);
688 brelse(bitmap_bh); 690 brelse(bitmap_bh);
689 bitmap_bh = ext4_read_block_bitmap(sb, i); 691 bitmap_bh = ext4_read_block_bitmap(sb, i);
690 if (bitmap_bh == NULL) 692 if (bitmap_bh == NULL)
691 continue; 693 continue;
692 694
693 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 695 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
694 printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", 696 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
695 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 697 i, ext4_free_blks_count(sb, gdp), x);
696 bitmap_count += x; 698 bitmap_count += x;
697 } 699 }
698 brelse(bitmap_bh); 700 brelse(bitmap_bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c668e4377d76..b0c87dce66a3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
868{ 868{
869 unsigned len = le16_to_cpu(dlen); 869 unsigned len = le16_to_cpu(dlen);
870 870
871 if (len == EXT4_MAX_REC_LEN) 871 if (len == EXT4_MAX_REC_LEN || len == 0)
872 return 1 << 16; 872 return 1 << 16;
873 return len; 873 return len;
874} 874}
@@ -1206,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
1206 1206
1207static inline loff_t ext4_isize(struct ext4_inode *raw_inode) 1207static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
1208{ 1208{
1209 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | 1209 if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1210 le32_to_cpu(raw_inode->i_size_lo); 1210 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
1211 le32_to_cpu(raw_inode->i_size_lo);
1212 else
1213 return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1211} 1214}
1212 1215
1213static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) 1216static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54bf0623a9ae..e0aa4fe4f596 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1122 struct ext4_extent_idx *ix; 1122 struct ext4_extent_idx *ix;
1123 struct ext4_extent *ex; 1123 struct ext4_extent *ex;
1124 ext4_fsblk_t block; 1124 ext4_fsblk_t block;
1125 int depth, ee_len; 1125 int depth; /* Note, NOT eh_depth; depth from top of tree */
1126 int ee_len;
1126 1127
1127 BUG_ON(path == NULL); 1128 BUG_ON(path == NULL);
1128 depth = path->p_depth; 1129 depth = path->p_depth;
@@ -1179,7 +1180,8 @@ got_index:
1179 if (bh == NULL) 1180 if (bh == NULL)
1180 return -EIO; 1181 return -EIO;
1181 eh = ext_block_hdr(bh); 1182 eh = ext_block_hdr(bh);
1182 if (ext4_ext_check_header(inode, eh, depth)) { 1183 /* subtract from p_depth to get proper eh_depth */
1184 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1183 put_bh(bh); 1185 put_bh(bh);
1184 return -EIO; 1186 return -EIO;
1185 } 1187 }
@@ -3048,7 +3050,7 @@ retry:
3048 WARN_ON(ret <= 0); 3050 WARN_ON(ret <= 0);
3049 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3051 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3050 "returned error inode#%lu, block=%u, " 3052 "returned error inode#%lu, block=%u, "
3051 "max_blocks=%lu", __func__, 3053 "max_blocks=%u", __func__,
3052 inode->i_ino, block, max_blocks); 3054 inode->i_ino, block, max_blocks);
3053#endif 3055#endif
3054 ext4_mark_inode_dirty(handle, inode); 3056 ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4fb86a0061d0..2d2b3585ee91 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
188 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group; 192 ext4_group_t flex_group;
193 193
194 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 goto error_return; 248 goto error_return;
249 249
250 /* Ok, now we can actually update the inode bitmaps.. */ 250 /* Ok, now we can actually update the inode bitmaps.. */
251 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 251 spin_lock(sb_bgl_lock(sbi, block_group));
252 bit, bitmap_bh->b_data)) 252 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
253 spin_unlock(sb_bgl_lock(sbi, block_group));
254 if (!cleared)
253 ext4_error(sb, "ext4_free_inode", 255 ext4_error(sb, "ext4_free_inode",
254 "bit already cleared for inode %lu", ino); 256 "bit already cleared for inode %lu", ino);
255 else { 257 else {
@@ -696,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
696 struct inode *ret; 698 struct inode *ret;
697 ext4_group_t i; 699 ext4_group_t i;
698 int free = 0; 700 int free = 0;
701 static int once = 1;
699 ext4_group_t flex_group; 702 ext4_group_t flex_group;
700 703
701 /* Cannot create files in a deleted directory */ 704 /* Cannot create files in a deleted directory */
@@ -715,6 +718,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
715 718
716 if (sbi->s_log_groups_per_flex) { 719 if (sbi->s_log_groups_per_flex) {
717 ret2 = find_group_flex(sb, dir, &group); 720 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group);
723 if (ret2 == 0 && once)
724 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex "
726 "failed, fallback succeeded dir %lu\n",
727 dir->i_ino);
728 }
718 goto got_group; 729 goto got_group;
719 } 730 }
720 731
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a6444cee0c7e..c7fed5b18745 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
47static inline int ext4_begin_ordered_truncate(struct inode *inode, 47static inline int ext4_begin_ordered_truncate(struct inode *inode,
48 loff_t new_size) 48 loff_t new_size)
49{ 49{
50 return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, 50 return jbd2_journal_begin_ordered_truncate(
51 new_size); 51 EXT4_SB(inode->i_sb)->s_journal,
52 &EXT4_I(inode)->jinode,
53 new_size);
52} 54}
53 55
54static void ext4_invalidatepage(struct page *page, unsigned long offset); 56static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -360,9 +362,9 @@ static int ext4_block_to_path(struct inode *inode,
360 final = ptrs; 362 final = ptrs;
361 } else { 363 } else {
362 ext4_warning(inode->i_sb, "ext4_block_to_path", 364 ext4_warning(inode->i_sb, "ext4_block_to_path",
363 "block %lu > max", 365 "block %lu > max in inode %lu",
364 i_block + direct_blocks + 366 i_block + direct_blocks +
365 indirect_blocks + double_blocks); 367 indirect_blocks + double_blocks, inode->i_ino);
366 } 368 }
367 if (boundary) 369 if (boundary)
368 *boundary = final - 1 - (i_block & (ptrs - 1)); 370 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -1366,6 +1368,10 @@ retry:
1366 goto out; 1368 goto out;
1367 } 1369 }
1368 1370
1371 /* We cannot recurse into the filesystem as the transaction is already
1372 * started */
1373 flags |= AOP_FLAG_NOFS;
1374
1369 page = grab_cache_page_write_begin(mapping, index, flags); 1375 page = grab_cache_page_write_begin(mapping, index, flags);
1370 if (!page) { 1376 if (!page) {
1371 ext4_journal_stop(handle); 1377 ext4_journal_stop(handle);
@@ -1375,7 +1381,7 @@ retry:
1375 *pagep = page; 1381 *pagep = page;
1376 1382
1377 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1383 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1378 ext4_get_block); 1384 ext4_get_block);
1379 1385
1380 if (!ret && ext4_should_journal_data(inode)) { 1386 if (!ret && ext4_should_journal_data(inode)) {
1381 ret = walk_page_buffers(handle, page_buffers(page), 1387 ret = walk_page_buffers(handle, page_buffers(page),
@@ -2437,6 +2443,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2437 int no_nrwrite_index_update; 2443 int no_nrwrite_index_update;
2438 int pages_written = 0; 2444 int pages_written = 0;
2439 long pages_skipped; 2445 long pages_skipped;
2446 int range_cyclic, cycled = 1, io_done = 0;
2440 int needed_blocks, ret = 0, nr_to_writebump = 0; 2447 int needed_blocks, ret = 0, nr_to_writebump = 0;
2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2442 2449
@@ -2488,9 +2495,15 @@ static int ext4_da_writepages(struct address_space *mapping,
2488 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2495 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2489 range_whole = 1; 2496 range_whole = 1;
2490 2497
2491 if (wbc->range_cyclic) 2498 range_cyclic = wbc->range_cyclic;
2499 if (wbc->range_cyclic) {
2492 index = mapping->writeback_index; 2500 index = mapping->writeback_index;
2493 else 2501 if (index)
2502 cycled = 0;
2503 wbc->range_start = index << PAGE_CACHE_SHIFT;
2504 wbc->range_end = LLONG_MAX;
2505 wbc->range_cyclic = 0;
2506 } else
2494 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2507 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2495 2508
2496 mpd.wbc = wbc; 2509 mpd.wbc = wbc;
@@ -2504,6 +2517,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2504 wbc->no_nrwrite_index_update = 1; 2517 wbc->no_nrwrite_index_update = 1;
2505 pages_skipped = wbc->pages_skipped; 2518 pages_skipped = wbc->pages_skipped;
2506 2519
2520retry:
2507 while (!ret && wbc->nr_to_write > 0) { 2521 while (!ret && wbc->nr_to_write > 0) {
2508 2522
2509 /* 2523 /*
@@ -2530,7 +2544,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2530 2544
2531 ext4_journal_stop(handle); 2545 ext4_journal_stop(handle);
2532 2546
2533 if (mpd.retval == -ENOSPC) { 2547 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2534 /* commit the transaction which would 2548 /* commit the transaction which would
2535 * free blocks released in the transaction 2549 * free blocks released in the transaction
2536 * and try again 2550 * and try again
@@ -2546,6 +2560,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2546 pages_written += mpd.pages_written; 2560 pages_written += mpd.pages_written;
2547 wbc->pages_skipped = pages_skipped; 2561 wbc->pages_skipped = pages_skipped;
2548 ret = 0; 2562 ret = 0;
2563 io_done = 1;
2549 } else if (wbc->nr_to_write) 2564 } else if (wbc->nr_to_write)
2550 /* 2565 /*
2551 * There is no more writeout needed 2566 * There is no more writeout needed
@@ -2554,6 +2569,13 @@ static int ext4_da_writepages(struct address_space *mapping,
2554 */ 2569 */
2555 break; 2570 break;
2556 } 2571 }
2572 if (!io_done && !cycled) {
2573 cycled = 1;
2574 index = 0;
2575 wbc->range_start = index << PAGE_CACHE_SHIFT;
2576 wbc->range_end = mapping->writeback_index - 1;
2577 goto retry;
2578 }
2557 if (pages_skipped != wbc->pages_skipped) 2579 if (pages_skipped != wbc->pages_skipped)
2558 printk(KERN_EMERG "This should not happen leaving %s " 2580 printk(KERN_EMERG "This should not happen leaving %s "
2559 "with nr_to_write = %ld ret = %d\n", 2581 "with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2583,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2561 2583
2562 /* Update index */ 2584 /* Update index */
2563 index += pages_written; 2585 index += pages_written;
2586 wbc->range_cyclic = range_cyclic;
2564 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2587 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2565 /* 2588 /*
2566 * set the writeback_index so that range_cyclic 2589 * set the writeback_index so that range_cyclic
@@ -2648,6 +2671,9 @@ retry:
2648 ret = PTR_ERR(handle); 2671 ret = PTR_ERR(handle);
2649 goto out; 2672 goto out;
2650 } 2673 }
2674 /* We cannot recurse into the filesystem as the transaction is already
2675 * started */
2676 flags |= AOP_FLAG_NOFS;
2651 2677
2652 page = grab_cache_page_write_begin(mapping, index, flags); 2678 page = grab_cache_page_write_begin(mapping, index, flags);
2653 if (!page) { 2679 if (!page) {
@@ -2821,9 +2847,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2821 filemap_write_and_wait(mapping); 2847 filemap_write_and_wait(mapping);
2822 } 2848 }
2823 2849
2824 BUG_ON(!EXT4_JOURNAL(inode) &&
2825 EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
2826
2827 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2850 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2828 /* 2851 /*
2829 * This is a REALLY heavyweight approach, but the use of 2852 * This is a REALLY heavyweight approach, but the use of
@@ -3622,7 +3645,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3622 * block pointed to itself, it would have been detached when 3645 * block pointed to itself, it would have been detached when
3623 * the block was cleared. Check for this instead of OOPSing. 3646 * the block was cleared. Check for this instead of OOPSing.
3624 */ 3647 */
3625 if (bh2jh(this_bh)) 3648 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
3626 ext4_handle_dirty_metadata(handle, inode, this_bh); 3649 ext4_handle_dirty_metadata(handle, inode, this_bh);
3627 else 3650 else
3628 ext4_error(inode->i_sb, __func__, 3651 ext4_error(inode->i_sb, __func__,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 918aec0c8a11..9f61e62f435f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1447 struct ext4_free_extent *gex = &ac->ac_g_ex; 1447 struct ext4_free_extent *gex = &ac->ac_g_ex;
1448 1448
1449 BUG_ON(ex->fe_len <= 0); 1449 BUG_ON(ex->fe_len <= 0);
1450 BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1450 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1451 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1451 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1452 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1452 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1453 1453
@@ -3025,7 +3025,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3025 goto out_err; 3025 goto out_err;
3026 3026
3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
3028 gdp->bg_free_blocks_count); 3028 ext4_free_blks_count(sb, gdp));
3029 3029
3030 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
3031 if (err) 3031 if (err)
@@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3292 } 3292 }
3293 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3293 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3294 start > ac->ac_o_ex.fe_logical); 3294 start > ac->ac_o_ex.fe_logical);
3295 BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3295 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3296 3296
3297 /* now prepare goal request */ 3297 /* now prepare goal request */
3298 3298
@@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3589 struct super_block *sb, struct ext4_prealloc_space *pa) 3589 struct super_block *sb, struct ext4_prealloc_space *pa)
3590{ 3590{
3591 ext4_group_t grp; 3591 ext4_group_t grp;
3592 ext4_fsblk_t grp_blk;
3592 3593
3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3594 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3594 return; 3595 return;
@@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3603 pa->pa_deleted = 1; 3604 pa->pa_deleted = 1;
3604 spin_unlock(&pa->pa_lock); 3605 spin_unlock(&pa->pa_lock);
3605 3606
3606 /* -1 is to protect from crossing allocation group */ 3607 grp_blk = pa->pa_pstart;
3607 ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); 3608 /* If linear, pa_pstart may be in the next group when pa is used up */
3609 if (pa->pa_linear)
3610 grp_blk--;
3611
3612 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
3608 3613
3609 /* 3614 /*
3610 * possible race: 3615 * possible race:
@@ -3693,6 +3698,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3693 pa->pa_free = pa->pa_len; 3698 pa->pa_free = pa->pa_len;
3694 atomic_set(&pa->pa_count, 1); 3699 atomic_set(&pa->pa_count, 1);
3695 spin_lock_init(&pa->pa_lock); 3700 spin_lock_init(&pa->pa_lock);
3701 INIT_LIST_HEAD(&pa->pa_inode_list);
3702 INIT_LIST_HEAD(&pa->pa_group_list);
3696 pa->pa_deleted = 0; 3703 pa->pa_deleted = 0;
3697 pa->pa_linear = 0; 3704 pa->pa_linear = 0;
3698 3705
@@ -3755,6 +3762,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3755 atomic_set(&pa->pa_count, 1); 3762 atomic_set(&pa->pa_count, 1);
3756 spin_lock_init(&pa->pa_lock); 3763 spin_lock_init(&pa->pa_lock);
3757 INIT_LIST_HEAD(&pa->pa_inode_list); 3764 INIT_LIST_HEAD(&pa->pa_inode_list);
3765 INIT_LIST_HEAD(&pa->pa_group_list);
3758 pa->pa_deleted = 0; 3766 pa->pa_deleted = 0;
3759 pa->pa_linear = 1; 3767 pa->pa_linear = 1;
3760 3768
@@ -4476,23 +4484,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4476 pa->pa_free -= ac->ac_b_ex.fe_len; 4484 pa->pa_free -= ac->ac_b_ex.fe_len;
4477 pa->pa_len -= ac->ac_b_ex.fe_len; 4485 pa->pa_len -= ac->ac_b_ex.fe_len;
4478 spin_unlock(&pa->pa_lock); 4486 spin_unlock(&pa->pa_lock);
4479 /*
4480 * We want to add the pa to the right bucket.
4481 * Remove it from the list and while adding
4482 * make sure the list to which we are adding
4483 * doesn't grow big.
4484 */
4485 if (likely(pa->pa_free)) {
4486 spin_lock(pa->pa_obj_lock);
4487 list_del_rcu(&pa->pa_inode_list);
4488 spin_unlock(pa->pa_obj_lock);
4489 ext4_mb_add_n_trim(ac);
4490 }
4491 } 4487 }
4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4493 } 4488 }
4494 if (ac->alloc_semp) 4489 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp); 4490 up_read(ac->alloc_semp);
4491 if (pa) {
4492 /*
4493 * We want to add the pa to the right bucket.
4494 * Remove it from the list and while adding
4495 * make sure the list to which we are adding
4496 * doesn't grow big. We need to release
4497 * alloc_semp before calling ext4_mb_add_n_trim()
4498 */
4499 if (pa->pa_linear && likely(pa->pa_free)) {
4500 spin_lock(pa->pa_obj_lock);
4501 list_del_rcu(&pa->pa_inode_list);
4502 spin_unlock(pa->pa_obj_lock);
4503 ext4_mb_add_n_trim(ac);
4504 }
4505 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4506 }
4496 if (ac->ac_bitmap_page) 4507 if (ac->ac_bitmap_page)
4497 page_cache_release(ac->ac_bitmap_page); 4508 page_cache_release(ac->ac_bitmap_page);
4498 if (ac->ac_buddy_page) 4509 if (ac->ac_buddy_page)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e35..fe64d9f79852 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
481 + 1); 481 + 1);
482 if (IS_ERR(handle)) { 482 if (IS_ERR(handle)) {
483 retval = PTR_ERR(handle); 483 retval = PTR_ERR(handle);
484 goto err_out; 484 return retval;
485 } 485 }
486 tmp_inode = ext4_new_inode(handle, 486 tmp_inode = ext4_new_inode(handle,
487 inode->i_sb->s_root->d_inode, 487 inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
489 if (IS_ERR(tmp_inode)) { 489 if (IS_ERR(tmp_inode)) {
490 retval = -ENOMEM; 490 retval = -ENOMEM;
491 ext4_journal_stop(handle); 491 ext4_journal_stop(handle);
492 tmp_inode = NULL; 492 return retval;
493 goto err_out;
494 } 493 }
495 i_size_write(tmp_inode, i_size_read(inode)); 494 i_size_write(tmp_inode, i_size_read(inode));
496 /* 495 /*
@@ -618,8 +617,7 @@ err_out:
618 617
619 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
620 619
621 if (tmp_inode) 620 iput(tmp_inode);
622 iput(tmp_inode);
623 621
624 return retval; 622 return retval;
625} 623}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fec0b4c2f5f1..ba702bd7910d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1368,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1368 struct fake_dirent *fde; 1368 struct fake_dirent *fde;
1369 1369
1370 blocksize = dir->i_sb->s_blocksize; 1370 blocksize = dir->i_sb->s_blocksize;
1371 dxtrace(printk(KERN_DEBUG "Creating index\n")); 1371 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1372 retval = ext4_journal_get_write_access(handle, bh); 1372 retval = ext4_journal_get_write_access(handle, bh);
1373 if (retval) { 1373 if (retval) {
1374 ext4_std_error(dir->i_sb, retval); 1374 ext4_std_error(dir->i_sb, retval);
@@ -1377,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 } 1377 }
1378 root = (struct dx_root *) bh->b_data; 1378 root = (struct dx_root *) bh->b_data;
1379 1379
1380 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len));
1384 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu",
1387 dir->i_ino);
1388 brelse(bh);
1389 return -EIO;
1390 }
1391 len = ((char *) root) + blocksize - (char *) de;
1392
1393 /* Allocate new block for the 0th block's dirents */
1380 bh2 = ext4_append(handle, dir, &block, &retval); 1394 bh2 = ext4_append(handle, dir, &block, &retval);
1381 if (!(bh2)) { 1395 if (!(bh2)) {
1382 brelse(bh); 1396 brelse(bh);
@@ -1385,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1385 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1399 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1386 data1 = bh2->b_data; 1400 data1 = bh2->b_data;
1387 1401
1388 /* The 0th block becomes the root, move the dirents out */
1389 fde = &root->dotdot;
1390 de = (struct ext4_dir_entry_2 *)((char *)fde +
1391 ext4_rec_len_from_disk(fde->rec_len));
1392 len = ((char *) root) + blocksize - (char *) de;
1393 memcpy (data1, de, len); 1402 memcpy (data1, de, len);
1394 de = (struct ext4_dir_entry_2 *) data1; 1403 de = (struct ext4_dir_entry_2 *) data1;
1395 top = data1 + len; 1404 top = data1 + len;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c328be5d6885..c06886abd658 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -861,12 +861,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
861 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
862 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
863 863
864 memset(gdp, 0, EXT4_DESC_SIZE(sb));
864 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 865 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
865 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 866 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
866 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 867 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
867 ext4_free_blks_set(sb, gdp, input->free_blocks_count); 868 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
868 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 869 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
869 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); 870 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
870 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 871 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
871 872
872 /* 873 /*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045e..39d1993cfa13 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
3046static int ext4_sync_fs(struct super_block *sb, int wait) 3046static int ext4_sync_fs(struct super_block *sb, int wait)
3047{ 3047{
3048 int ret = 0; 3048 int ret = 0;
3049 tid_t target;
3049 3050
3050 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3051 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
3051 sb->s_dirt = 0; 3052 sb->s_dirt = 0;
3052 if (EXT4_SB(sb)->s_journal) { 3053 if (EXT4_SB(sb)->s_journal) {
3053 if (wait) 3054 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
3054 ret = ext4_force_commit(sb); 3055 &target)) {
3055 else 3056 if (wait)
3056 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3057 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
3058 target);
3059 }
3057 } else { 3060 } else {
3058 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); 3061 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3059 } 3062 }
@@ -3088,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb)
3088 3091
3089 /* Journal blocked and flushed, clear needs_recovery flag. */ 3092 /* Journal blocked and flushed, clear needs_recovery flag. */
3090 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3093 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3091 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3092 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3094 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3093 if (error) 3095 if (error)
3094 goto out; 3096 goto out;
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
new file mode 100644
index 000000000000..d0a69ff25375
--- /dev/null
+++ b/fs/fat/Kconfig
@@ -0,0 +1,97 @@
1config FAT_FS
2 tristate
3 select NLS
4 help
5 If you want to use one of the FAT-based file systems (the MS-DOS and
6 VFAT (Windows 95) file systems), then you must say Y or M here
7 to include FAT support. You will then be able to mount partitions or
8 diskettes with FAT-based file systems and transparently access the
9 files on them, i.e. MSDOS files will look and behave just like all
10 other Unix files.
11
12 This FAT support is not a file system in itself, it only provides
13 the foundation for the other file systems. You will have to say Y or
14 M to at least one of "MSDOS fs support" or "VFAT fs support" in
15 order to make use of it.
16
17 Another way to read and write MSDOS floppies and hard drive
18 partitions from within Linux (but not transparently) is with the
19 mtools ("man mtools") program suite. You don't need to say Y here in
20 order to do that.
21
22 If you need to move large files on floppies between a DOS and a
23 Linux box, say Y here, mount the floppy under Linux with an MSDOS
24 file system and use GNU tar's M option. GNU tar is a program
25 available for Unix and DOS ("man tar" or "info tar").
26
27 The FAT support will enlarge your kernel by about 37 KB. If unsure,
28 say Y.
29
30 To compile this as a module, choose M here: the module will be called
31 fat. Note that if you compile the FAT support as a module, you
32 cannot compile any of the FAT-based file systems into the kernel
33 -- they will have to be modules as well.
34
35config MSDOS_FS
36 tristate "MSDOS fs support"
37 select FAT_FS
38 help
39 This allows you to mount MSDOS partitions of your hard drive (unless
40 they are compressed; to access compressed MSDOS partitions under
41 Linux, you can either use the DOS emulator DOSEMU, described in the
42 DOSEMU-HOWTO, available from
43 <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
44 <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
45 intend to use dosemu with a non-compressed MSDOS partition, say Y
46 here) and MSDOS floppies. This means that file access becomes
47 transparent, i.e. the MSDOS files look and behave just like all
48 other Unix files.
49
50 If you have Windows 95 or Windows NT installed on your MSDOS
51 partitions, you should use the VFAT file system (say Y to "VFAT fs
52 support" below), or you will not be able to see the long filenames
53 generated by Windows 95 / Windows NT.
54
55 This option will enlarge your kernel by about 7 KB. If unsure,
56 answer Y. This will only work if you said Y to "DOS FAT fs support"
57 as well. To compile this as a module, choose M here: the module will
58 be called msdos.
59
60config VFAT_FS
61 tristate "VFAT (Windows-95) fs support"
62 select FAT_FS
63 help
64 This option provides support for normal Windows file systems with
65 long filenames. That includes non-compressed FAT-based file systems
66 used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
67 programs from the mtools package.
68
69 The VFAT support enlarges your kernel by about 10 KB and it only
70 works if you said Y to the "DOS FAT fs support" above. Please read
71 the file <file:Documentation/filesystems/vfat.txt> for details. If
72 unsure, say Y.
73
74 To compile this as a module, choose M here: the module will be called
75 vfat.
76
77config FAT_DEFAULT_CODEPAGE
78 int "Default codepage for FAT"
79 depends on MSDOS_FS || VFAT_FS
80 default 437
81 help
82 This option should be set to the codepage of your FAT filesystems.
83 It can be overridden with the "codepage" mount option.
84 See <file:Documentation/filesystems/vfat.txt> for more information.
85
86config FAT_DEFAULT_IOCHARSET
87 string "Default iocharset for FAT"
88 depends on VFAT_FS
89 default "iso8859-1"
90 help
91 Set this to the default input/output character set you'd
92 like FAT to use. It should probably match the character set
93 that most of your FAT filesystems use, and can be overridden
94 with the "iocharset" mount option for FAT filesystems.
95 Note that "utf8" is not recommended for FAT filesystems.
96 If unsure, you shouldn't set "utf8" here.
97 See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6b74d09adbe5..de0004fe6e00 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
202 sector_t blocknr; 202 sector_t blocknr;
203 203
204 /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ 204 /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
205 mutex_lock(&mapping->host->i_mutex); 205 down_read(&mapping->host->i_alloc_sem);
206 blocknr = generic_block_bmap(mapping, block, fat_get_block); 206 blocknr = generic_block_bmap(mapping, block, fat_get_block);
207 mutex_unlock(&mapping->host->i_mutex); 207 up_read(&mapping->host->i_alloc_sem);
208 208
209 return blocknr; 209 return blocknr;
210} 210}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bd215cc791da..d865ca66ccba 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -141,7 +141,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
141 return ret; 141 return ret;
142} 142}
143 143
144#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME) 144#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
145 145
146static int setfl(int fd, struct file * filp, unsigned long arg) 146static int setfl(int fd, struct file * filp, unsigned long arg)
147{ 147{
@@ -177,21 +177,21 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
177 return error; 177 return error;
178 178
179 /* 179 /*
180 * We still need a lock here for now to keep multiple FASYNC calls 180 * ->fasync() is responsible for setting the FASYNC bit.
181 * from racing with each other.
182 */ 181 */
183 lock_kernel(); 182 if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
184 if ((arg ^ filp->f_flags) & FASYNC) { 183 filp->f_op->fasync) {
185 if (filp->f_op && filp->f_op->fasync) { 184 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
186 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); 185 if (error < 0)
187 if (error < 0) 186 goto out;
188 goto out; 187 if (error > 0)
189 } 188 error = 0;
190 } 189 }
191 190 spin_lock(&filp->f_lock);
192 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); 191 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
192 spin_unlock(&filp->f_lock);
193
193 out: 194 out:
194 unlock_kernel();
195 return error; 195 return error;
196} 196}
197 197
@@ -516,7 +516,7 @@ static DEFINE_RWLOCK(fasync_lock);
516static struct kmem_cache *fasync_cache __read_mostly; 516static struct kmem_cache *fasync_cache __read_mostly;
517 517
518/* 518/*
519 * fasync_helper() is used by some character device drivers (mainly mice) 519 * fasync_helper() is used by almost all character device drivers
520 * to set up the fasync queue. It returns negative on error, 0 if it did 520 * to set up the fasync queue. It returns negative on error, 0 if it did
521 * no changes and positive if it added/deleted the entry. 521 * no changes and positive if it added/deleted the entry.
522 */ 522 */
@@ -555,6 +555,13 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
555 result = 1; 555 result = 1;
556 } 556 }
557out: 557out:
558 /* Fix up FASYNC bit while still holding fasync_lock */
559 spin_lock(&filp->f_lock);
560 if (on)
561 filp->f_flags |= FASYNC;
562 else
563 filp->f_flags &= ~FASYNC;
564 spin_unlock(&filp->f_lock);
558 write_unlock_irq(&fasync_lock); 565 write_unlock_irq(&fasync_lock);
559 return result; 566 return result;
560} 567}
diff --git a/fs/file_table.c b/fs/file_table.c
index bbeeac6efa1a..b74a8e1da913 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/ima.h>
16#include <linux/eventpoll.h> 17#include <linux/eventpoll.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
@@ -127,6 +128,7 @@ struct file *get_empty_filp(void)
127 atomic_long_set(&f->f_count, 1); 128 atomic_long_set(&f->f_count, 1);
128 rwlock_init(&f->f_owner.lock); 129 rwlock_init(&f->f_owner.lock);
129 f->f_cred = get_cred(cred); 130 f->f_cred = get_cred(cred);
131 spin_lock_init(&f->f_lock);
130 eventpoll_init_file(f); 132 eventpoll_init_file(f);
131 /* f->f_version: 0 */ 133 /* f->f_version: 0 */
132 return f; 134 return f;
@@ -279,6 +281,7 @@ void __fput(struct file *file)
279 if (file->f_op && file->f_op->release) 281 if (file->f_op && file->f_op->release)
280 file->f_op->release(inode, file); 282 file->f_op->release(inode, file);
281 security_file_free(file); 283 security_file_free(file);
284 ima_file_free(file);
282 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 285 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
283 cdev_put(inode->i_cdev); 286 cdev_put(inode->i_cdev);
284 fops_put(file->f_op); 287 fops_put(file->f_op);
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
new file mode 100644
index 000000000000..8dc1cd5c1efe
--- /dev/null
+++ b/fs/freevxfs/Kconfig
@@ -0,0 +1,16 @@
1config VXFS_FS
2 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
3 depends on BLOCK
4 help
5 FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
6 file system format. VERITAS VxFS(TM) is the standard file system
7 of SCO UnixWare (and possibly others) and optionally available
8 for Sunsoft Solaris, HP-UX and many other operating systems.
9 Currently only readonly access is supported.
10
11 NOTE: the file system type as used by mount(1), mount(2) and
12 fstab(5) is 'vxfs' as it describes the file system format, not
13 the actual driver.
14
15 To compile this as a module, choose M here: the module will be
16 called freevxfs. If unsure, say N.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17f..e3fe9918faaf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
274 int ret; 274 int ret;
275 275
276 BUG_ON(inode->i_state & I_SYNC); 276 BUG_ON(inode->i_state & I_SYNC);
277 WARN_ON(inode->i_state & I_NEW);
277 278
278 /* Set I_SYNC, reset I_DIRTY */ 279 /* Set I_SYNC, reset I_DIRTY */
279 dirty = inode->i_state & I_DIRTY; 280 dirty = inode->i_state & I_DIRTY;
@@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
298 } 299 }
299 300
300 spin_lock(&inode_lock); 301 spin_lock(&inode_lock);
302 WARN_ON(inode->i_state & I_NEW);
301 inode->i_state &= ~I_SYNC; 303 inode->i_state &= ~I_SYNC;
302 if (!(inode->i_state & I_FREEING)) { 304 if (!(inode->i_state & I_FREEING)) {
303 if (!(inode->i_state & I_DIRTY) && 305 if (!(inode->i_state & I_DIRTY) &&
@@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
470 break; 472 break;
471 } 473 }
472 474
475 if (inode->i_state & I_NEW) {
476 requeue_io(inode);
477 continue;
478 }
479
473 if (wbc->nonblocking && bdi_write_congested(bdi)) { 480 if (wbc->nonblocking && bdi_write_congested(bdi)) {
474 wbc->encountered_congestion = 1; 481 wbc->encountered_congestion = 1;
475 if (!sb_is_blkdev_sb(sb)) 482 if (!sb_is_blkdev_sb(sb))
@@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping; 539 struct address_space *mapping;
533 540
534 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
535 continue; 542 continue;
536 mapping = inode->i_mapping; 543 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0) 544 if (mapping->nrpages == 0)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 000000000000..0cf160a94eda
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,15 @@
1config FUSE_FS
2 tristate "FUSE (Filesystem in Userspace) support"
3 help
4 With FUSE it is possible to implement a fully functional filesystem
5 in a userspace program.
6
7 There's also companion library: libfuse. This library along with
8 utilities is available from the FUSE homepage:
9 <http://fuse.sourceforge.net/>
10
11 See <file:Documentation/filesystems/fuse.txt> for more information.
12 See <file:Documentation/Changes> for needed library/utility version.
13
14 If you want to develop a userspace FS, or if you want to use
15 a filesystem based on FUSE, answer Y or M.
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e0c7ada08a1f..ba76b68c52ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -281,7 +281,8 @@ __releases(&fc->lock)
281 fc->blocked = 0; 281 fc->blocked = 0;
282 wake_up_all(&fc->blocked_waitq); 282 wake_up_all(&fc->blocked_waitq);
283 } 283 }
284 if (fc->num_background == FUSE_CONGESTION_THRESHOLD) { 284 if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
285 fc->connected) {
285 clear_bdi_congested(&fc->bdi, READ); 286 clear_bdi_congested(&fc->bdi, READ);
286 clear_bdi_congested(&fc->bdi, WRITE); 287 clear_bdi_congested(&fc->bdi, WRITE);
287 } 288 }
@@ -825,16 +826,21 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
825 struct fuse_copy_state *cs) 826 struct fuse_copy_state *cs)
826{ 827{
827 struct fuse_notify_poll_wakeup_out outarg; 828 struct fuse_notify_poll_wakeup_out outarg;
828 int err; 829 int err = -EINVAL;
829 830
830 if (size != sizeof(outarg)) 831 if (size != sizeof(outarg))
831 return -EINVAL; 832 goto err;
832 833
833 err = fuse_copy_one(cs, &outarg, sizeof(outarg)); 834 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
834 if (err) 835 if (err)
835 return err; 836 goto err;
836 837
838 fuse_copy_finish(cs);
837 return fuse_notify_poll_wakeup(fc, &outarg); 839 return fuse_notify_poll_wakeup(fc, &outarg);
840
841err:
842 fuse_copy_finish(cs);
843 return err;
838} 844}
839 845
840static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, 846static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
@@ -845,6 +851,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
845 return fuse_notify_poll(fc, size, cs); 851 return fuse_notify_poll(fc, size, cs);
846 852
847 default: 853 default:
854 fuse_copy_finish(cs);
848 return -EINVAL; 855 return -EINVAL;
849 } 856 }
850} 857}
@@ -923,7 +930,6 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
923 */ 930 */
924 if (!oh.unique) { 931 if (!oh.unique) {
925 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 932 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
926 fuse_copy_finish(&cs);
927 return err ? err : nbytes; 933 return err ? err : nbytes;
928 } 934 }
929 935
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e8162646a9b5..d9fdb7cec538 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -54,7 +54,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
54 ff->reserved_req = fuse_request_alloc(); 54 ff->reserved_req = fuse_request_alloc();
55 if (!ff->reserved_req) { 55 if (!ff->reserved_req) {
56 kfree(ff); 56 kfree(ff);
57 ff = NULL; 57 return NULL;
58 } else { 58 } else {
59 INIT_LIST_HEAD(&ff->write_entry); 59 INIT_LIST_HEAD(&ff->write_entry);
60 atomic_set(&ff->count, 0); 60 atomic_set(&ff->count, 0);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 47c96fdca1ac..459b73dd45e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -292,6 +292,7 @@ static void fuse_put_super(struct super_block *sb)
292 list_del(&fc->entry); 292 list_del(&fc->entry);
293 fuse_ctl_remove_conn(fc); 293 fuse_ctl_remove_conn(fc);
294 mutex_unlock(&fuse_mutex); 294 mutex_unlock(&fuse_mutex);
295 bdi_destroy(&fc->bdi);
295 fuse_conn_put(fc); 296 fuse_conn_put(fc);
296} 297}
297 298
@@ -532,7 +533,6 @@ void fuse_conn_put(struct fuse_conn *fc)
532 if (fc->destroy_req) 533 if (fc->destroy_req)
533 fuse_request_free(fc->destroy_req); 534 fuse_request_free(fc->destroy_req);
534 mutex_destroy(&fc->inst_mutex); 535 mutex_destroy(&fc->inst_mutex);
535 bdi_destroy(&fc->bdi);
536 fc->release(fc); 536 fc->release(fc);
537 } 537 }
538} 538}
@@ -805,16 +805,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
805 int err; 805 int err;
806 int is_bdev = sb->s_bdev != NULL; 806 int is_bdev = sb->s_bdev != NULL;
807 807
808 err = -EINVAL;
808 if (sb->s_flags & MS_MANDLOCK) 809 if (sb->s_flags & MS_MANDLOCK)
809 return -EINVAL; 810 goto err;
810 811
811 if (!parse_fuse_opt((char *) data, &d, is_bdev)) 812 if (!parse_fuse_opt((char *) data, &d, is_bdev))
812 return -EINVAL; 813 goto err;
813 814
814 if (is_bdev) { 815 if (is_bdev) {
815#ifdef CONFIG_BLOCK 816#ifdef CONFIG_BLOCK
817 err = -EINVAL;
816 if (!sb_set_blocksize(sb, d.blksize)) 818 if (!sb_set_blocksize(sb, d.blksize))
817 return -EINVAL; 819 goto err;
818#endif 820#endif
819 } else { 821 } else {
820 sb->s_blocksize = PAGE_CACHE_SIZE; 822 sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -826,20 +828,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
826 sb->s_export_op = &fuse_export_operations; 828 sb->s_export_op = &fuse_export_operations;
827 829
828 file = fget(d.fd); 830 file = fget(d.fd);
831 err = -EINVAL;
829 if (!file) 832 if (!file)
830 return -EINVAL; 833 goto err;
831 834
832 if (file->f_op != &fuse_dev_operations) 835 if (file->f_op != &fuse_dev_operations)
833 return -EINVAL; 836 goto err_fput;
834 837
835 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 838 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
839 err = -ENOMEM;
836 if (!fc) 840 if (!fc)
837 return -ENOMEM; 841 goto err_fput;
838 842
839 err = fuse_conn_init(fc, sb); 843 err = fuse_conn_init(fc, sb);
840 if (err) { 844 if (err) {
841 kfree(fc); 845 kfree(fc);
842 return err; 846 goto err_fput;
843 } 847 }
844 848
845 fc->release = fuse_free_conn; 849 fc->release = fuse_free_conn;
@@ -854,12 +858,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
854 err = -ENOMEM; 858 err = -ENOMEM;
855 root = fuse_get_root_inode(sb, d.rootmode); 859 root = fuse_get_root_inode(sb, d.rootmode);
856 if (!root) 860 if (!root)
857 goto err; 861 goto err_put_conn;
858 862
859 root_dentry = d_alloc_root(root); 863 root_dentry = d_alloc_root(root);
860 if (!root_dentry) { 864 if (!root_dentry) {
861 iput(root); 865 iput(root);
862 goto err; 866 goto err_put_conn;
863 } 867 }
864 868
865 init_req = fuse_request_alloc(); 869 init_req = fuse_request_alloc();
@@ -903,9 +907,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
903 fuse_request_free(init_req); 907 fuse_request_free(init_req);
904 err_put_root: 908 err_put_root:
905 dput(root_dentry); 909 dput(root_dentry);
906 err: 910 err_put_conn:
907 fput(file);
908 fuse_conn_put(fc); 911 fuse_conn_put(fc);
912 err_fput:
913 fput(file);
914 err:
909 return err; 915 return err;
910} 916}
911 917
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index e563a6449811..3a981b7f64ca 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,10 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBD) 3 depends on EXPERIMENTAL && (64BIT || LBD)
4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM
7 select IP_SCTP if DLM_SCTP
4 select FS_POSIX_ACL 8 select FS_POSIX_ACL
5 select CRC32 9 select CRC32
6 help 10 help
@@ -18,17 +22,16 @@ config GFS2_FS
18 the locking module below. Documentation and utilities for GFS2 can 22 the locking module below. Documentation and utilities for GFS2 can
19 be found here: http://sources.redhat.com/cluster 23 be found here: http://sources.redhat.com/cluster
20 24
21 The "nolock" lock module is now built in to GFS2 by default. 25 The "nolock" lock module is now built in to GFS2 by default. If
26 you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
27 networking.
22 28
23config GFS2_FS_LOCKING_DLM 29config GFS2_FS_LOCKING_DLM
24 tristate "GFS2 DLM locking module" 30 bool "GFS2 DLM locking"
25 depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n) 31 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG
26 select IP_SCTP if DLM_SCTP
27 select CONFIGFS_FS
28 select DLM
29 help 32 help
30 Multiple node locking module for GFS2 33 Multiple node locking module for GFS2
31 34
32 Most users of GFS2 will require this module. It provides the locking 35 Most users of GFS2 will require this. It provides the locking
33 interface between GFS2 and the DLM, which is required to use GFS2 36 interface between GFS2 and the DLM, which is required to use GFS2
34 in a cluster environment. 37 in a cluster environment.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index c1b4ec6a9650..a851ea4bdf70 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o log.o lops.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
7 7
8obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ 8gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
9 9
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index e335dceb6a4f..43764f4fa763 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -15,7 +15,6 @@
15#include <linux/posix_acl.h> 15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h> 16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h> 17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 11ffc56f1f81..3a5d3f883e10 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b7c8e5c70791..aef4d0c06748 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -60,7 +60,6 @@
60#include <linux/gfs2_ondisk.h> 60#include <linux/gfs2_ondisk.h>
61#include <linux/crc32.h> 61#include <linux/crc32.h>
62#include <linux/vmalloc.h> 62#include <linux/vmalloc.h>
63#include <linux/lm_interface.h>
64 63
65#include "gfs2.h" 64#include "gfs2.h"
66#include "incore.h" 65#include "incore.h"
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index f114ba2b3557..dee9b03e5b37 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -14,7 +14,6 @@
14#include <linux/capability.h> 14#include <linux/capability.h>
15#include <linux/xattr.h> 15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h> 17#include <asm/uaccess.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 0d1c76d906ae..899763aed217 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/xattr.h> 14#include <linux/xattr.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17#include <asm/uaccess.h> 16#include <asm/uaccess.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6b983aef785d..3984e47d1d33 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -10,7 +10,6 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
15#include <linux/delay.h> 14#include <linux/delay.h>
16#include <linux/sort.h> 15#include <linux/sort.h>
@@ -18,7 +17,6 @@
18#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
19#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
20#include <linux/list.h> 19#include <linux/list.h>
21#include <linux/lm_interface.h>
22#include <linux/wait.h> 20#include <linux/wait.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/rwsem.h> 22#include <linux/rwsem.h>
@@ -155,13 +153,10 @@ static void glock_free(struct gfs2_glock *gl)
155 struct gfs2_sbd *sdp = gl->gl_sbd; 153 struct gfs2_sbd *sdp = gl->gl_sbd;
156 struct inode *aspace = gl->gl_aspace; 154 struct inode *aspace = gl->gl_aspace;
157 155
158 if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
159 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
160
161 if (aspace) 156 if (aspace)
162 gfs2_aspace_put(aspace); 157 gfs2_aspace_put(aspace);
163 158
164 kmem_cache_free(gfs2_glock_cachep, gl); 159 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
165} 160}
166 161
167/** 162/**
@@ -172,6 +167,7 @@ static void glock_free(struct gfs2_glock *gl)
172 167
173static void gfs2_glock_hold(struct gfs2_glock *gl) 168static void gfs2_glock_hold(struct gfs2_glock *gl)
174{ 169{
170 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
175 atomic_inc(&gl->gl_ref); 171 atomic_inc(&gl->gl_ref);
176} 172}
177 173
@@ -211,17 +207,15 @@ int gfs2_glock_put(struct gfs2_glock *gl)
211 atomic_dec(&lru_count); 207 atomic_dec(&lru_count);
212 } 208 }
213 spin_unlock(&lru_lock); 209 spin_unlock(&lru_lock);
214 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
215 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
216 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 210 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
217 glock_free(gl); 211 glock_free(gl);
218 rv = 1; 212 rv = 1;
219 goto out; 213 goto out;
220 } 214 }
221 write_unlock(gl_lock_addr(gl->gl_hash));
222 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ 215 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
223 if (atomic_read(&gl->gl_ref) == 2) 216 if (atomic_read(&gl->gl_ref) == 2)
224 gfs2_glock_schedule_for_reclaim(gl); 217 gfs2_glock_schedule_for_reclaim(gl);
218 write_unlock(gl_lock_addr(gl->gl_hash));
225out: 219out:
226 return rv; 220 return rv;
227} 221}
@@ -256,27 +250,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
256} 250}
257 251
258/** 252/**
259 * gfs2_glock_find() - Find glock by lock number
260 * @sdp: The GFS2 superblock
261 * @name: The lock name
262 *
263 * Returns: NULL, or the struct gfs2_glock with the requested number
264 */
265
266static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
267 const struct lm_lockname *name)
268{
269 unsigned int hash = gl_hash(sdp, name);
270 struct gfs2_glock *gl;
271
272 read_lock(gl_lock_addr(hash));
273 gl = search_bucket(hash, sdp, name);
274 read_unlock(gl_lock_addr(hash));
275
276 return gl;
277}
278
279/**
280 * may_grant - check if its ok to grant a new lock 253 * may_grant - check if its ok to grant a new lock
281 * @gl: The glock 254 * @gl: The glock
282 * @gh: The lock request which we wish to grant 255 * @gh: The lock request which we wish to grant
@@ -523,7 +496,7 @@ out_locked:
523} 496}
524 497
525static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, 498static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
526 unsigned int cur_state, unsigned int req_state, 499 unsigned int req_state,
527 unsigned int flags) 500 unsigned int flags)
528{ 501{
529 int ret = LM_OUT_ERROR; 502 int ret = LM_OUT_ERROR;
@@ -532,7 +505,7 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
532 return req_state == LM_ST_UNLOCKED ? 0 : req_state; 505 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
533 506
534 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 507 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
535 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, 508 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
536 req_state, flags); 509 req_state, flags);
537 return ret; 510 return ret;
538} 511}
@@ -575,7 +548,7 @@ __acquires(&gl->gl_spin)
575 gl->gl_state == LM_ST_DEFERRED) && 548 gl->gl_state == LM_ST_DEFERRED) &&
576 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 549 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
577 lck_flags |= LM_FLAG_TRY_1CB; 550 lck_flags |= LM_FLAG_TRY_1CB;
578 ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags); 551 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
579 552
580 if (!(ret & LM_OUT_ASYNC)) { 553 if (!(ret & LM_OUT_ASYNC)) {
581 finish_xmote(gl, ret); 554 finish_xmote(gl, ret);
@@ -624,10 +597,11 @@ __acquires(&gl->gl_spin)
624 597
625 GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); 598 GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
626 599
600 down_read(&gfs2_umount_flush_sem);
627 if (test_bit(GLF_DEMOTE, &gl->gl_flags) && 601 if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
628 gl->gl_demote_state != gl->gl_state) { 602 gl->gl_demote_state != gl->gl_state) {
629 if (find_first_holder(gl)) 603 if (find_first_holder(gl))
630 goto out; 604 goto out_unlock;
631 if (nonblock) 605 if (nonblock)
632 goto out_sched; 606 goto out_sched;
633 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); 607 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
@@ -638,23 +612,26 @@ __acquires(&gl->gl_spin)
638 gfs2_demote_wake(gl); 612 gfs2_demote_wake(gl);
639 ret = do_promote(gl); 613 ret = do_promote(gl);
640 if (ret == 0) 614 if (ret == 0)
641 goto out; 615 goto out_unlock;
642 if (ret == 2) 616 if (ret == 2)
643 return; 617 goto out_sem;
644 gh = find_first_waiter(gl); 618 gh = find_first_waiter(gl);
645 gl->gl_target = gh->gh_state; 619 gl->gl_target = gh->gh_state;
646 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 620 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
647 do_error(gl, 0); /* Fail queued try locks */ 621 do_error(gl, 0); /* Fail queued try locks */
648 } 622 }
649 do_xmote(gl, gh, gl->gl_target); 623 do_xmote(gl, gh, gl->gl_target);
624out_sem:
625 up_read(&gfs2_umount_flush_sem);
650 return; 626 return;
651 627
652out_sched: 628out_sched:
653 gfs2_glock_hold(gl); 629 gfs2_glock_hold(gl);
654 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 630 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
655 gfs2_glock_put(gl); 631 gfs2_glock_put(gl);
656out: 632out_unlock:
657 clear_bit(GLF_LOCK, &gl->gl_flags); 633 clear_bit(GLF_LOCK, &gl->gl_flags);
634 goto out_sem;
658} 635}
659 636
660static void glock_work_func(struct work_struct *work) 637static void glock_work_func(struct work_struct *work)
@@ -681,18 +658,6 @@ static void glock_work_func(struct work_struct *work)
681 gfs2_glock_put(gl); 658 gfs2_glock_put(gl);
682} 659}
683 660
684static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
685 void **lockp)
686{
687 int error = -EIO;
688 if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
689 return 0;
690 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
691 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
692 sdp->sd_lockstruct.ls_lockspace, name, lockp);
693 return error;
694}
695
696/** 661/**
697 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist 662 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
698 * @sdp: The GFS2 superblock 663 * @sdp: The GFS2 superblock
@@ -719,10 +684,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
719 gl = search_bucket(hash, sdp, &name); 684 gl = search_bucket(hash, sdp, &name);
720 read_unlock(gl_lock_addr(hash)); 685 read_unlock(gl_lock_addr(hash));
721 686
722 if (gl || !create) { 687 *glp = gl;
723 *glp = gl; 688 if (gl)
724 return 0; 689 return 0;
725 } 690 if (!create)
691 return -ENOENT;
726 692
727 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 693 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
728 if (!gl) 694 if (!gl)
@@ -736,7 +702,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
736 gl->gl_demote_state = LM_ST_EXCLUSIVE; 702 gl->gl_demote_state = LM_ST_EXCLUSIVE;
737 gl->gl_hash = hash; 703 gl->gl_hash = hash;
738 gl->gl_ops = glops; 704 gl->gl_ops = glops;
739 gl->gl_stamp = jiffies; 705 snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number);
706 memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
707 gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
740 gl->gl_tchange = jiffies; 708 gl->gl_tchange = jiffies;
741 gl->gl_object = NULL; 709 gl->gl_object = NULL;
742 gl->gl_sbd = sdp; 710 gl->gl_sbd = sdp;
@@ -753,10 +721,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
753 } 721 }
754 } 722 }
755 723
756 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
757 if (error)
758 goto fail_aspace;
759
760 write_lock(gl_lock_addr(hash)); 724 write_lock(gl_lock_addr(hash));
761 tmp = search_bucket(hash, sdp, &name); 725 tmp = search_bucket(hash, sdp, &name);
762 if (tmp) { 726 if (tmp) {
@@ -772,9 +736,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
772 736
773 return 0; 737 return 0;
774 738
775fail_aspace:
776 if (gl->gl_aspace)
777 gfs2_aspace_put(gl->gl_aspace);
778fail: 739fail:
779 kmem_cache_free(gfs2_glock_cachep, gl); 740 kmem_cache_free(gfs2_glock_cachep, gl);
780 return error; 741 return error;
@@ -966,7 +927,7 @@ do_cancel:
966 if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { 927 if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
967 spin_unlock(&gl->gl_spin); 928 spin_unlock(&gl->gl_spin);
968 if (sdp->sd_lockstruct.ls_ops->lm_cancel) 929 if (sdp->sd_lockstruct.ls_ops->lm_cancel)
969 sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); 930 sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
970 spin_lock(&gl->gl_spin); 931 spin_lock(&gl->gl_spin);
971 } 932 }
972 return; 933 return;
@@ -1051,7 +1012,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1051 spin_lock(&gl->gl_spin); 1012 spin_lock(&gl->gl_spin);
1052 clear_bit(GLF_LOCK, &gl->gl_flags); 1013 clear_bit(GLF_LOCK, &gl->gl_flags);
1053 } 1014 }
1054 gl->gl_stamp = jiffies;
1055 if (list_empty(&gl->gl_holders) && 1015 if (list_empty(&gl->gl_holders) &&
1056 !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 1016 !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1057 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1017 !test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1240,70 +1200,13 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1240 gfs2_glock_dq_uninit(&ghs[x]); 1200 gfs2_glock_dq_uninit(&ghs[x]);
1241} 1201}
1242 1202
1243static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) 1203void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1244{
1245 int error = -EIO;
1246 if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
1247 return 0;
1248 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1249 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
1250 return error;
1251}
1252
1253/**
1254 * gfs2_lvb_hold - attach a LVB from a glock
1255 * @gl: The glock in question
1256 *
1257 */
1258
1259int gfs2_lvb_hold(struct gfs2_glock *gl)
1260{
1261 int error;
1262
1263 if (!atomic_read(&gl->gl_lvb_count)) {
1264 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1265 if (error)
1266 return error;
1267 gfs2_glock_hold(gl);
1268 }
1269 atomic_inc(&gl->gl_lvb_count);
1270
1271 return 0;
1272}
1273
1274/**
1275 * gfs2_lvb_unhold - detach a LVB from a glock
1276 * @gl: The glock in question
1277 *
1278 */
1279
1280void gfs2_lvb_unhold(struct gfs2_glock *gl)
1281{
1282 struct gfs2_sbd *sdp = gl->gl_sbd;
1283
1284 gfs2_glock_hold(gl);
1285 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1286 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1287 if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
1288 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
1289 gl->gl_lvb = NULL;
1290 gfs2_glock_put(gl);
1291 }
1292 gfs2_glock_put(gl);
1293}
1294
1295static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1296 unsigned int state)
1297{ 1204{
1298 struct gfs2_glock *gl;
1299 unsigned long delay = 0; 1205 unsigned long delay = 0;
1300 unsigned long holdtime; 1206 unsigned long holdtime;
1301 unsigned long now = jiffies; 1207 unsigned long now = jiffies;
1302 1208
1303 gl = gfs2_glock_find(sdp, name); 1209 gfs2_glock_hold(gl);
1304 if (!gl)
1305 return;
1306
1307 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1210 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1308 if (time_before(now, holdtime)) 1211 if (time_before(now, holdtime))
1309 delay = holdtime - now; 1212 delay = holdtime - now;
@@ -1317,74 +1220,33 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1317 gfs2_glock_put(gl); 1220 gfs2_glock_put(gl);
1318} 1221}
1319 1222
1320static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
1321{
1322 struct gfs2_jdesc *jd;
1323
1324 spin_lock(&sdp->sd_jindex_spin);
1325 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
1326 if (jd->jd_jid != jid)
1327 continue;
1328 jd->jd_dirty = 1;
1329 break;
1330 }
1331 spin_unlock(&sdp->sd_jindex_spin);
1332}
1333
1334/** 1223/**
1335 * gfs2_glock_cb - Callback used by locking module 1224 * gfs2_glock_complete - Callback used by locking
1336 * @sdp: Pointer to the superblock 1225 * @gl: Pointer to the glock
1337 * @type: Type of callback 1226 * @ret: The return value from the dlm
1338 * @data: Type dependent data pointer
1339 * 1227 *
1340 * Called by the locking module when it wants to tell us something.
1341 * Either we need to drop a lock, one of our ASYNC requests completed, or
1342 * a journal from another client needs to be recovered.
1343 */ 1228 */
1344 1229
1345void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) 1230void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1346{ 1231{
1347 struct gfs2_sbd *sdp = cb_data; 1232 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1348 1233 gl->gl_reply = ret;
1349 switch (type) { 1234 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1350 case LM_CB_NEED_E: 1235 struct gfs2_holder *gh;
1351 blocking_cb(sdp, data, LM_ST_UNLOCKED); 1236 spin_lock(&gl->gl_spin);
1352 return; 1237 gh = find_first_waiter(gl);
1353 1238 if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
1354 case LM_CB_NEED_D: 1239 (gl->gl_target != LM_ST_UNLOCKED)) ||
1355 blocking_cb(sdp, data, LM_ST_DEFERRED); 1240 ((ret & ~LM_OUT_ST_MASK) != 0))
1356 return; 1241 set_bit(GLF_FROZEN, &gl->gl_flags);
1357 1242 spin_unlock(&gl->gl_spin);
1358 case LM_CB_NEED_S: 1243 if (test_bit(GLF_FROZEN, &gl->gl_flags))
1359 blocking_cb(sdp, data, LM_ST_SHARED);
1360 return;
1361
1362 case LM_CB_ASYNC: {
1363 struct lm_async_cb *async = data;
1364 struct gfs2_glock *gl;
1365
1366 down_read(&gfs2_umount_flush_sem);
1367 gl = gfs2_glock_find(sdp, &async->lc_name);
1368 if (gfs2_assert_warn(sdp, gl))
1369 return; 1244 return;
1370 gl->gl_reply = async->lc_ret;
1371 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1372 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1373 gfs2_glock_put(gl);
1374 up_read(&gfs2_umount_flush_sem);
1375 return;
1376 }
1377
1378 case LM_CB_NEED_RECOVERY:
1379 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1380 if (sdp->sd_recoverd_process)
1381 wake_up_process(sdp->sd_recoverd_process);
1382 return;
1383
1384 default:
1385 gfs2_assert_warn(sdp, 0);
1386 return;
1387 } 1245 }
1246 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1247 gfs2_glock_hold(gl);
1248 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1249 gfs2_glock_put(gl);
1388} 1250}
1389 1251
1390/** 1252/**
@@ -1515,6 +1377,25 @@ out:
1515 return has_entries; 1377 return has_entries;
1516} 1378}
1517 1379
1380
1381/**
1382 * thaw_glock - thaw out a glock which has an unprocessed reply waiting
1383 * @gl: The glock to thaw
1384 *
1385 * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
1386 * so this has to result in the ref count being dropped by one.
1387 */
1388
1389static void thaw_glock(struct gfs2_glock *gl)
1390{
1391 if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
1392 return;
1393 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1394 gfs2_glock_hold(gl);
1395 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1396 gfs2_glock_put(gl);
1397}
1398
1518/** 1399/**
1519 * clear_glock - look at a glock and see if we can free it from glock cache 1400 * clear_glock - look at a glock and see if we can free it from glock cache
1520 * @gl: the glock to look at 1401 * @gl: the glock to look at
@@ -1540,6 +1421,20 @@ static void clear_glock(struct gfs2_glock *gl)
1540} 1421}
1541 1422
1542/** 1423/**
1424 * gfs2_glock_thaw - Thaw any frozen glocks
1425 * @sdp: The super block
1426 *
1427 */
1428
1429void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(thaw_glock, sdp, x);
1435}
1436
1437/**
1543 * gfs2_gl_hash_clear - Empty out the glock hash table 1438 * gfs2_gl_hash_clear - Empty out the glock hash table
1544 * @sdp: the filesystem 1439 * @sdp: the filesystem
1545 * @wait: wait until it's all gone 1440 * @wait: wait until it's all gone
@@ -1619,7 +1514,7 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1619 if (flags & LM_FLAG_NOEXP) 1514 if (flags & LM_FLAG_NOEXP)
1620 *p++ = 'e'; 1515 *p++ = 'e';
1621 if (flags & LM_FLAG_ANY) 1516 if (flags & LM_FLAG_ANY)
1622 *p++ = 'a'; 1517 *p++ = 'A';
1623 if (flags & LM_FLAG_PRIORITY) 1518 if (flags & LM_FLAG_PRIORITY)
1624 *p++ = 'p'; 1519 *p++ = 'p';
1625 if (flags & GL_ASYNC) 1520 if (flags & GL_ASYNC)
@@ -1683,6 +1578,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1683 *p++ = 'i'; 1578 *p++ = 'i';
1684 if (test_bit(GLF_REPLY_PENDING, gflags)) 1579 if (test_bit(GLF_REPLY_PENDING, gflags))
1685 *p++ = 'r'; 1580 *p++ = 'r';
1581 if (test_bit(GLF_INITIAL, gflags))
1582 *p++ = 'I';
1583 if (test_bit(GLF_FROZEN, gflags))
1584 *p++ = 'F';
1686 *p = 0; 1585 *p = 0;
1687 return buf; 1586 return buf;
1688} 1587}
@@ -1717,14 +1616,13 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1717 dtime *= 1000000/HZ; /* demote time in uSec */ 1616 dtime *= 1000000/HZ; /* demote time in uSec */
1718 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1617 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1719 dtime = 0; 1618 dtime = 0;
1720 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", 1619 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
1721 state2str(gl->gl_state), 1620 state2str(gl->gl_state),
1722 gl->gl_name.ln_type, 1621 gl->gl_name.ln_type,
1723 (unsigned long long)gl->gl_name.ln_number, 1622 (unsigned long long)gl->gl_name.ln_number,
1724 gflags2str(gflags_buf, &gl->gl_flags), 1623 gflags2str(gflags_buf, &gl->gl_flags),
1725 state2str(gl->gl_target), 1624 state2str(gl->gl_target),
1726 state2str(gl->gl_demote_state), dtime, 1625 state2str(gl->gl_demote_state), dtime,
1727 atomic_read(&gl->gl_lvb_count),
1728 atomic_read(&gl->gl_ail_count), 1626 atomic_read(&gl->gl_ail_count),
1729 atomic_read(&gl->gl_ref)); 1627 atomic_read(&gl->gl_ref));
1730 1628
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 543ec7ecfbda..a602a28f6f08 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -11,15 +11,130 @@
11#define __GLOCK_DOT_H__ 11#define __GLOCK_DOT_H__
12 12
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/parser.h>
14#include "incore.h" 15#include "incore.h"
15 16
16/* Flags for lock requests; used in gfs2_holder gh_flag field. 17/* Options for hostdata parser */
17 From lm_interface.h: 18
19enum {
20 Opt_jid,
21 Opt_id,
22 Opt_first,
23 Opt_nodir,
24 Opt_err,
25};
26
27/*
28 * lm_lockname types
29 */
30
31#define LM_TYPE_RESERVED 0x00
32#define LM_TYPE_NONDISK 0x01
33#define LM_TYPE_INODE 0x02
34#define LM_TYPE_RGRP 0x03
35#define LM_TYPE_META 0x04
36#define LM_TYPE_IOPEN 0x05
37#define LM_TYPE_FLOCK 0x06
38#define LM_TYPE_PLOCK 0x07
39#define LM_TYPE_QUOTA 0x08
40#define LM_TYPE_JOURNAL 0x09
41
42/*
43 * lm_lock() states
44 *
45 * SHARED is compatible with SHARED, not with DEFERRED or EX.
46 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
47 */
48
49#define LM_ST_UNLOCKED 0
50#define LM_ST_EXCLUSIVE 1
51#define LM_ST_DEFERRED 2
52#define LM_ST_SHARED 3
53
54/*
55 * lm_lock() flags
56 *
57 * LM_FLAG_TRY
58 * Don't wait to acquire the lock if it can't be granted immediately.
59 *
60 * LM_FLAG_TRY_1CB
61 * Send one blocking callback if TRY is set and the lock is not granted.
62 *
63 * LM_FLAG_NOEXP
64 * GFS sets this flag on lock requests it makes while doing journal recovery.
65 * These special requests should not be blocked due to the recovery like
66 * ordinary locks would be.
67 *
68 * LM_FLAG_ANY
69 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
70 * also be granted in SHARED. The preferred state is whichever is compatible
71 * with other granted locks, or the specified state if no other locks exist.
72 *
73 * LM_FLAG_PRIORITY
74 * Override fairness considerations. Suppose a lock is held in a shared state
75 * and there is a pending request for the deferred state. A shared lock
76 * request with the priority flag would be allowed to bypass the deferred
77 * request and directly join the other shared lock. A shared lock request
78 * without the priority flag might be forced to wait until the deferred
79 * requested had acquired and released the lock.
80 */
81
18#define LM_FLAG_TRY 0x00000001 82#define LM_FLAG_TRY 0x00000001
19#define LM_FLAG_TRY_1CB 0x00000002 83#define LM_FLAG_TRY_1CB 0x00000002
20#define LM_FLAG_NOEXP 0x00000004 84#define LM_FLAG_NOEXP 0x00000004
21#define LM_FLAG_ANY 0x00000008 85#define LM_FLAG_ANY 0x00000008
22#define LM_FLAG_PRIORITY 0x00000010 */ 86#define LM_FLAG_PRIORITY 0x00000010
87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400
92
93/*
94 * lm_lock() and lm_async_cb return flags
95 *
96 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value.
98 *
99 * LM_OUT_CANCELED
100 * The lock request was canceled.
101 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */
106
107#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080
110#define LM_OUT_ERROR 0x00000100
111
112/*
113 * lm_recovery_done() messages
114 */
115
116#define LM_RD_GAVEUP 308
117#define LM_RD_SUCCESS 309
118
119#define GLR_TRYFAILED 13
120
121struct lm_lockops {
122 const char *lm_proto_name;
123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
124 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, void *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl,
128 unsigned int req_state, unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens;
131};
132
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
23 138
24#define GL_ASYNC 0x00000040 139#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 140#define GL_EXACT 0x00000080
@@ -128,10 +243,12 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
128int gfs2_lvb_hold(struct gfs2_glock *gl); 243int gfs2_lvb_hold(struct gfs2_glock *gl);
129void gfs2_lvb_unhold(struct gfs2_glock *gl); 244void gfs2_lvb_unhold(struct gfs2_glock *gl);
130 245
131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 246void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
247void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
132void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 248void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
133void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 249void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
134void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 250void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
251void gfs2_glock_thaw(struct gfs2_sbd *sdp);
135 252
136int __init gfs2_glock_init(void); 253int __init gfs2_glock_init(void);
137void gfs2_glock_exit(void); 254void gfs2_glock_exit(void);
@@ -141,4 +258,6 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
141int gfs2_register_debugfs(void); 258int gfs2_register_debugfs(void);
142void gfs2_unregister_debugfs(void); 259void gfs2_unregister_debugfs(void);
143 260
261extern const struct lm_lockops gfs2_dlm_ops;
262
144#endif /* __GLOCK_DOT_H__ */ 263#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 8522d3aa64fc..bf23a62aa925 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h>
16#include <linux/bio.h> 15#include <linux/bio.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
@@ -38,20 +37,25 @@
38static void gfs2_ail_empty_gl(struct gfs2_glock *gl) 37static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
39{ 38{
40 struct gfs2_sbd *sdp = gl->gl_sbd; 39 struct gfs2_sbd *sdp = gl->gl_sbd;
41 unsigned int blocks;
42 struct list_head *head = &gl->gl_ail_list; 40 struct list_head *head = &gl->gl_ail_list;
43 struct gfs2_bufdata *bd; 41 struct gfs2_bufdata *bd;
44 struct buffer_head *bh; 42 struct buffer_head *bh;
45 int error; 43 struct gfs2_trans tr;
46 44
47 blocks = atomic_read(&gl->gl_ail_count); 45 memset(&tr, 0, sizeof(tr));
48 if (!blocks) 46 tr.tr_revokes = atomic_read(&gl->gl_ail_count);
49 return;
50 47
51 error = gfs2_trans_begin(sdp, 0, blocks); 48 if (!tr.tr_revokes)
52 if (gfs2_assert_withdraw(sdp, !error))
53 return; 49 return;
54 50
51 /* A shortened, inline version of gfs2_trans_begin() */
52 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
53 tr.tr_ip = (unsigned long)__builtin_return_address(0);
54 INIT_LIST_HEAD(&tr.tr_list_buf);
55 gfs2_log_reserve(sdp, tr.tr_reserved);
56 BUG_ON(current->journal_info);
57 current->journal_info = &tr;
58
55 gfs2_log_lock(sdp); 59 gfs2_log_lock(sdp);
56 while (!list_empty(head)) { 60 while (!list_empty(head)) {
57 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
@@ -72,29 +76,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
72} 76}
73 77
74/** 78/**
75 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock 79 * rgrp_go_sync - sync out the metadata for this glock
76 * @gl: the glock
77 *
78 */
79
80static void gfs2_pte_inval(struct gfs2_glock *gl)
81{
82 struct gfs2_inode *ip;
83 struct inode *inode;
84
85 ip = gl->gl_object;
86 inode = &ip->i_inode;
87 if (!ip || !S_ISREG(inode->i_mode))
88 return;
89
90 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
91 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
92 set_bit(GLF_DIRTY, &gl->gl_flags);
93
94}
95
96/**
97 * meta_go_sync - sync out the metadata for this glock
98 * @gl: the glock 80 * @gl: the glock
99 * 81 *
100 * Called when demoting or unlocking an EX glock. We must flush 82 * Called when demoting or unlocking an EX glock. We must flush
@@ -102,36 +84,42 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
102 * not return to caller to demote/unlock the glock until I/O is complete. 84 * not return to caller to demote/unlock the glock until I/O is complete.
103 */ 85 */
104 86
105static void meta_go_sync(struct gfs2_glock *gl) 87static void rgrp_go_sync(struct gfs2_glock *gl)
106{ 88{
107 if (gl->gl_state != LM_ST_EXCLUSIVE) 89 struct address_space *metamapping = gl->gl_aspace->i_mapping;
90 int error;
91
92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
108 return; 93 return;
94 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
109 95
110 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { 96 gfs2_log_flush(gl->gl_sbd, gl);
111 gfs2_log_flush(gl->gl_sbd, gl); 97 filemap_fdatawrite(metamapping);
112 gfs2_meta_sync(gl); 98 error = filemap_fdatawait(metamapping);
113 gfs2_ail_empty_gl(gl); 99 mapping_set_error(metamapping, error);
114 } 100 gfs2_ail_empty_gl(gl);
115} 101}
116 102
117/** 103/**
118 * meta_go_inval - invalidate the metadata for this glock 104 * rgrp_go_inval - invalidate the metadata for this glock
119 * @gl: the glock 105 * @gl: the glock
120 * @flags: 106 * @flags:
121 * 107 *
108 * We never used LM_ST_DEFERRED with resource groups, so that we
109 * should always see the metadata flag set here.
110 *
122 */ 111 */
123 112
124static void meta_go_inval(struct gfs2_glock *gl, int flags) 113static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
125{ 114{
126 if (!(flags & DIO_METADATA)) 115 struct address_space *mapping = gl->gl_aspace->i_mapping;
127 return;
128 116
129 gfs2_meta_inval(gl); 117 BUG_ON(!(flags & DIO_METADATA));
130 if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex)) 118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
131 gl->gl_sbd->sd_rindex_uptodate = 0; 119 truncate_inode_pages(mapping, 0);
132 else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
133 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
134 120
121 if (gl->gl_object) {
122 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
135 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 123 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
136 } 124 }
137} 125}
@@ -148,48 +136,54 @@ static void inode_go_sync(struct gfs2_glock *gl)
148 struct address_space *metamapping = gl->gl_aspace->i_mapping; 136 struct address_space *metamapping = gl->gl_aspace->i_mapping;
149 int error; 137 int error;
150 138
151 if (gl->gl_state != LM_ST_UNLOCKED)
152 gfs2_pte_inval(gl);
153 if (gl->gl_state != LM_ST_EXCLUSIVE)
154 return;
155
156 if (ip && !S_ISREG(ip->i_inode.i_mode)) 139 if (ip && !S_ISREG(ip->i_inode.i_mode))
157 ip = NULL; 140 ip = NULL;
141 if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
142 unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
143 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
144 return;
158 145
159 if (test_bit(GLF_DIRTY, &gl->gl_flags)) { 146 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
160 gfs2_log_flush(gl->gl_sbd, gl); 147
161 filemap_fdatawrite(metamapping); 148 gfs2_log_flush(gl->gl_sbd, gl);
162 if (ip) { 149 filemap_fdatawrite(metamapping);
163 struct address_space *mapping = ip->i_inode.i_mapping; 150 if (ip) {
164 filemap_fdatawrite(mapping); 151 struct address_space *mapping = ip->i_inode.i_mapping;
165 error = filemap_fdatawait(mapping); 152 filemap_fdatawrite(mapping);
166 mapping_set_error(mapping, error); 153 error = filemap_fdatawait(mapping);
167 } 154 mapping_set_error(mapping, error);
168 error = filemap_fdatawait(metamapping);
169 mapping_set_error(metamapping, error);
170 clear_bit(GLF_DIRTY, &gl->gl_flags);
171 gfs2_ail_empty_gl(gl);
172 } 155 }
156 error = filemap_fdatawait(metamapping);
157 mapping_set_error(metamapping, error);
158 gfs2_ail_empty_gl(gl);
173} 159}
174 160
175/** 161/**
176 * inode_go_inval - prepare a inode glock to be released 162 * inode_go_inval - prepare a inode glock to be released
177 * @gl: the glock 163 * @gl: the glock
178 * @flags: 164 * @flags:
165 *
166 * Normally we invlidate everything, but if we are moving into
167 * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
168 * can keep hold of the metadata, since it won't have changed.
179 * 169 *
180 */ 170 */
181 171
182static void inode_go_inval(struct gfs2_glock *gl, int flags) 172static void inode_go_inval(struct gfs2_glock *gl, int flags)
183{ 173{
184 struct gfs2_inode *ip = gl->gl_object; 174 struct gfs2_inode *ip = gl->gl_object;
185 int meta = (flags & DIO_METADATA);
186 175
187 if (meta) { 176 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
188 gfs2_meta_inval(gl); 177
178 if (flags & DIO_METADATA) {
179 struct address_space *mapping = gl->gl_aspace->i_mapping;
180 truncate_inode_pages(mapping, 0);
189 if (ip) 181 if (ip)
190 set_bit(GIF_INVALID, &ip->i_flags); 182 set_bit(GIF_INVALID, &ip->i_flags);
191 } 183 }
192 184
185 if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
186 gl->gl_sbd->sd_rindex_uptodate = 0;
193 if (ip && S_ISREG(ip->i_inode.i_mode)) 187 if (ip && S_ISREG(ip->i_inode.i_mode))
194 truncate_inode_pages(ip->i_inode.i_mapping, 0); 188 truncate_inode_pages(ip->i_inode.i_mapping, 0);
195} 189}
@@ -390,20 +384,7 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
390 return 0; 384 return 0;
391} 385}
392 386
393/**
394 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
395 * @gl: the glock
396 *
397 * Returns: 1 if it's ok
398 */
399
400static int quota_go_demote_ok(const struct gfs2_glock *gl)
401{
402 return !atomic_read(&gl->gl_lvb_count);
403}
404
405const struct gfs2_glock_operations gfs2_meta_glops = { 387const struct gfs2_glock_operations gfs2_meta_glops = {
406 .go_xmote_th = meta_go_sync,
407 .go_type = LM_TYPE_META, 388 .go_type = LM_TYPE_META,
408}; 389};
409 390
@@ -418,8 +399,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
418}; 399};
419 400
420const struct gfs2_glock_operations gfs2_rgrp_glops = { 401const struct gfs2_glock_operations gfs2_rgrp_glops = {
421 .go_xmote_th = meta_go_sync, 402 .go_xmote_th = rgrp_go_sync,
422 .go_inval = meta_go_inval, 403 .go_inval = rgrp_go_inval,
423 .go_demote_ok = rgrp_go_demote_ok, 404 .go_demote_ok = rgrp_go_demote_ok,
424 .go_lock = rgrp_go_lock, 405 .go_lock = rgrp_go_lock,
425 .go_unlock = rgrp_go_unlock, 406 .go_unlock = rgrp_go_unlock,
@@ -448,7 +429,6 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
448}; 429};
449 430
450const struct gfs2_glock_operations gfs2_quota_glops = { 431const struct gfs2_glock_operations gfs2_quota_glops = {
451 .go_demote_ok = quota_go_demote_ok,
452 .go_type = LM_TYPE_QUOTA, 432 .go_type = LM_TYPE_QUOTA,
453}; 433};
454 434
@@ -456,3 +436,15 @@ const struct gfs2_glock_operations gfs2_journal_glops = {
456 .go_type = LM_TYPE_JOURNAL, 436 .go_type = LM_TYPE_JOURNAL,
457}; 437};
458 438
439const struct gfs2_glock_operations *gfs2_glops_list[] = {
440 [LM_TYPE_META] = &gfs2_meta_glops,
441 [LM_TYPE_INODE] = &gfs2_inode_glops,
442 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
443 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
444 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
445 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
446 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
447 [LM_TYPE_QUOTA] = &gfs2_quota_glops,
448 [LM_TYPE_JOURNAL] = &gfs2_journal_glops,
449};
450
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index a1d9b5b024e6..b3aa2e3210fd 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -21,5 +21,6 @@ extern const struct gfs2_glock_operations gfs2_flock_glops;
21extern const struct gfs2_glock_operations gfs2_nondisk_glops; 21extern const struct gfs2_glock_operations gfs2_nondisk_glops;
22extern const struct gfs2_glock_operations gfs2_quota_glops; 22extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops; 23extern const struct gfs2_glock_operations gfs2_journal_glops;
24extern const struct gfs2_glock_operations *gfs2_glops_list[];
24 25
25#endif /* __GLOPS_DOT_H__ */ 26#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 608849d00021..399d1b978049 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,8 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/dlm.h>
16#include <linux/buffer_head.h>
15 17
16#define DIO_WAIT 0x00000010 18#define DIO_WAIT 0x00000010
17#define DIO_METADATA 0x00000020 19#define DIO_METADATA 0x00000020
@@ -26,6 +28,7 @@ struct gfs2_trans;
26struct gfs2_ail; 28struct gfs2_ail;
27struct gfs2_jdesc; 29struct gfs2_jdesc;
28struct gfs2_sbd; 30struct gfs2_sbd;
31struct lm_lockops;
29 32
30typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); 33typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
31 34
@@ -121,6 +124,28 @@ struct gfs2_bufdata {
121 struct list_head bd_ail_gl_list; 124 struct list_head bd_ail_gl_list;
122}; 125};
123 126
127/*
128 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
129 * prefix of lock_dlm_ gets awkward.
130 */
131
132#define GDLM_STRNAME_BYTES 25
133#define GDLM_LVB_SIZE 32
134
135enum {
136 DFL_BLOCK_LOCKS = 0,
137};
138
139struct lm_lockname {
140 u64 ln_number;
141 unsigned int ln_type;
142};
143
144#define lm_name_equal(name1, name2) \
145 (((name1)->ln_number == (name2)->ln_number) && \
146 ((name1)->ln_type == (name2)->ln_type))
147
148
124struct gfs2_glock_operations { 149struct gfs2_glock_operations {
125 void (*go_xmote_th) (struct gfs2_glock *gl); 150 void (*go_xmote_th) (struct gfs2_glock *gl);
126 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 151 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
@@ -162,6 +187,8 @@ enum {
162 GLF_LFLUSH = 7, 187 GLF_LFLUSH = 7,
163 GLF_INVALIDATE_IN_PROGRESS = 8, 188 GLF_INVALIDATE_IN_PROGRESS = 8,
164 GLF_REPLY_PENDING = 9, 189 GLF_REPLY_PENDING = 9,
190 GLF_INITIAL = 10,
191 GLF_FROZEN = 11,
165}; 192};
166 193
167struct gfs2_glock { 194struct gfs2_glock {
@@ -176,16 +203,15 @@ struct gfs2_glock {
176 unsigned int gl_target; 203 unsigned int gl_target;
177 unsigned int gl_reply; 204 unsigned int gl_reply;
178 unsigned int gl_hash; 205 unsigned int gl_hash;
206 unsigned int gl_req;
179 unsigned int gl_demote_state; /* state requested by remote node */ 207 unsigned int gl_demote_state; /* state requested by remote node */
180 unsigned long gl_demote_time; /* time of first demote request */ 208 unsigned long gl_demote_time; /* time of first demote request */
181 struct list_head gl_holders; 209 struct list_head gl_holders;
182 210
183 const struct gfs2_glock_operations *gl_ops; 211 const struct gfs2_glock_operations *gl_ops;
184 void *gl_lock; 212 char gl_strname[GDLM_STRNAME_BYTES];
185 char *gl_lvb; 213 struct dlm_lksb gl_lksb;
186 atomic_t gl_lvb_count; 214 char gl_lvb[32];
187
188 unsigned long gl_stamp;
189 unsigned long gl_tchange; 215 unsigned long gl_tchange;
190 void *gl_object; 216 void *gl_object;
191 217
@@ -283,7 +309,9 @@ enum {
283 309
284struct gfs2_quota_data { 310struct gfs2_quota_data {
285 struct list_head qd_list; 311 struct list_head qd_list;
286 unsigned int qd_count; 312 struct list_head qd_reclaim;
313
314 atomic_t qd_count;
287 315
288 u32 qd_id; 316 u32 qd_id;
289 unsigned long qd_flags; /* QDF_... */ 317 unsigned long qd_flags; /* QDF_... */
@@ -303,7 +331,6 @@ struct gfs2_quota_data {
303 331
304 u64 qd_sync_gen; 332 u64 qd_sync_gen;
305 unsigned long qd_last_warn; 333 unsigned long qd_last_warn;
306 unsigned long qd_last_touched;
307}; 334};
308 335
309struct gfs2_trans { 336struct gfs2_trans {
@@ -390,7 +417,7 @@ struct gfs2_args {
390 unsigned int ar_suiddir:1; /* suiddir support */ 417 unsigned int ar_suiddir:1; /* suiddir support */
391 unsigned int ar_data:2; /* ordered/writeback */ 418 unsigned int ar_data:2; /* ordered/writeback */
392 unsigned int ar_meta:1; /* mount metafs */ 419 unsigned int ar_meta:1; /* mount metafs */
393 unsigned int ar_num_glockd; /* Number of glockd threads */ 420 unsigned int ar_discard:1; /* discard requests */
394}; 421};
395 422
396struct gfs2_tune { 423struct gfs2_tune {
@@ -406,7 +433,6 @@ struct gfs2_tune {
406 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ 433 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
407 unsigned int gt_quota_scale_num; /* Numerator */ 434 unsigned int gt_quota_scale_num; /* Numerator */
408 unsigned int gt_quota_scale_den; /* Denominator */ 435 unsigned int gt_quota_scale_den; /* Denominator */
409 unsigned int gt_quota_cache_secs;
410 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 436 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
411 unsigned int gt_new_files_jdata; 437 unsigned int gt_new_files_jdata;
412 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 438 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
@@ -445,6 +471,31 @@ struct gfs2_sb_host {
445 471
446 char sb_lockproto[GFS2_LOCKNAME_LEN]; 472 char sb_lockproto[GFS2_LOCKNAME_LEN];
447 char sb_locktable[GFS2_LOCKNAME_LEN]; 473 char sb_locktable[GFS2_LOCKNAME_LEN];
474 u8 sb_uuid[16];
475};
476
477/*
478 * lm_mount() return values
479 *
480 * ls_jid - the journal ID this node should use
481 * ls_first - this node is the first to mount the file system
482 * ls_lockspace - lock module's context for this file system
483 * ls_ops - lock module's functions
484 */
485
486struct lm_lockstruct {
487 u32 ls_id;
488 unsigned int ls_jid;
489 unsigned int ls_first;
490 unsigned int ls_first_done;
491 unsigned int ls_nodir;
492 const struct lm_lockops *ls_ops;
493 unsigned long ls_flags;
494 dlm_lockspace_t *ls_dlm;
495
496 int ls_recover_jid;
497 int ls_recover_jid_done;
498 int ls_recover_jid_status;
448}; 499};
449 500
450struct gfs2_sbd { 501struct gfs2_sbd {
@@ -520,7 +571,6 @@ struct gfs2_sbd {
520 spinlock_t sd_jindex_spin; 571 spinlock_t sd_jindex_spin;
521 struct mutex sd_jindex_mutex; 572 struct mutex sd_jindex_mutex;
522 unsigned int sd_journals; 573 unsigned int sd_journals;
523 unsigned long sd_jindex_refresh_time;
524 574
525 struct gfs2_jdesc *sd_jdesc; 575 struct gfs2_jdesc *sd_jdesc;
526 struct gfs2_holder sd_journal_gh; 576 struct gfs2_holder sd_journal_gh;
@@ -540,7 +590,6 @@ struct gfs2_sbd {
540 590
541 struct list_head sd_quota_list; 591 struct list_head sd_quota_list;
542 atomic_t sd_quota_count; 592 atomic_t sd_quota_count;
543 spinlock_t sd_quota_spin;
544 struct mutex sd_quota_mutex; 593 struct mutex sd_quota_mutex;
545 wait_queue_head_t sd_quota_wait; 594 wait_queue_head_t sd_quota_wait;
546 struct list_head sd_trunc_list; 595 struct list_head sd_trunc_list;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3b87c188da41..7b277d449155 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -16,7 +16,6 @@
16#include <linux/sort.h> 16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h> 17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/lm_interface.h>
20#include <linux/security.h> 19#include <linux/security.h>
21#include <linux/time.h> 20#include <linux/time.h>
22 21
@@ -137,16 +136,16 @@ void gfs2_set_iop(struct inode *inode)
137 136
138 if (S_ISREG(mode)) { 137 if (S_ISREG(mode)) {
139 inode->i_op = &gfs2_file_iops; 138 inode->i_op = &gfs2_file_iops;
140 if (sdp->sd_args.ar_localflocks) 139 if (gfs2_localflocks(sdp))
141 inode->i_fop = &gfs2_file_fops_nolock; 140 inode->i_fop = gfs2_file_fops_nolock;
142 else 141 else
143 inode->i_fop = &gfs2_file_fops; 142 inode->i_fop = gfs2_file_fops;
144 } else if (S_ISDIR(mode)) { 143 } else if (S_ISDIR(mode)) {
145 inode->i_op = &gfs2_dir_iops; 144 inode->i_op = &gfs2_dir_iops;
146 if (sdp->sd_args.ar_localflocks) 145 if (gfs2_localflocks(sdp))
147 inode->i_fop = &gfs2_dir_fops_nolock; 146 inode->i_fop = gfs2_dir_fops_nolock;
148 else 147 else
149 inode->i_fop = &gfs2_dir_fops; 148 inode->i_fop = gfs2_dir_fops;
150 } else if (S_ISLNK(mode)) { 149 } else if (S_ISLNK(mode)) {
151 inode->i_op = &gfs2_symlink_iops; 150 inode->i_op = &gfs2_symlink_iops;
152 } else { 151 } else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d5329364cdff..dca4fee3078b 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,12 +101,26 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
101extern const struct inode_operations gfs2_file_iops; 101extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops; 102extern const struct inode_operations gfs2_dir_iops;
103extern const struct inode_operations gfs2_symlink_iops; 103extern const struct inode_operations gfs2_symlink_iops;
104extern const struct file_operations gfs2_file_fops; 104extern const struct file_operations *gfs2_file_fops_nolock;
105extern const struct file_operations gfs2_dir_fops; 105extern const struct file_operations *gfs2_dir_fops_nolock;
106extern const struct file_operations gfs2_file_fops_nolock;
107extern const struct file_operations gfs2_dir_fops_nolock;
108 106
109extern void gfs2_set_inode_flags(struct inode *inode); 107extern void gfs2_set_inode_flags(struct inode *inode);
108
109#ifdef CONFIG_GFS2_FS_LOCKING_DLM
110extern const struct file_operations *gfs2_file_fops;
111extern const struct file_operations *gfs2_dir_fops;
112static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
113{
114 return sdp->sd_args.ar_localflocks;
115}
116#else /* Single node only */
117#define gfs2_file_fops NULL
118#define gfs2_dir_fops NULL
119static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
120{
121 return 1;
122}
123#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
110 124
111#endif /* __INODE_DOT_H__ */ 125#endif /* __INODE_DOT_H__ */
112 126
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
new file mode 100644
index 000000000000..46df988323bc
--- /dev/null
+++ b/fs/gfs2/lock_dlm.c
@@ -0,0 +1,241 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/fs.h>
11#include <linux/dlm.h>
12#include <linux/types.h>
13#include <linux/gfs2_ondisk.h>
14
15#include "incore.h"
16#include "glock.h"
17#include "util.h"
18
19
20static void gdlm_ast(void *arg)
21{
22 struct gfs2_glock *gl = arg;
23 unsigned ret = gl->gl_state;
24
25 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
26
27 if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID)
28 memset(gl->gl_lvb, 0, GDLM_LVB_SIZE);
29
30 switch (gl->gl_lksb.sb_status) {
31 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
32 kmem_cache_free(gfs2_glock_cachep, gl);
33 return;
34 case -DLM_ECANCEL: /* Cancel while getting lock */
35 ret |= LM_OUT_CANCELED;
36 goto out;
37 case -EAGAIN: /* Try lock fails */
38 goto out;
39 case -EINVAL: /* Invalid */
40 case -ENOMEM: /* Out of memory */
41 ret |= LM_OUT_ERROR;
42 goto out;
43 case 0: /* Success */
44 break;
45 default: /* Something unexpected */
46 BUG();
47 }
48
49 ret = gl->gl_req;
50 if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
51 if (gl->gl_req == LM_ST_SHARED)
52 ret = LM_ST_DEFERRED;
53 else if (gl->gl_req == LM_ST_DEFERRED)
54 ret = LM_ST_SHARED;
55 else
56 BUG();
57 }
58
59 set_bit(GLF_INITIAL, &gl->gl_flags);
60 gfs2_glock_complete(gl, ret);
61 return;
62out:
63 if (!test_bit(GLF_INITIAL, &gl->gl_flags))
64 gl->gl_lksb.sb_lkid = 0;
65 gfs2_glock_complete(gl, ret);
66}
67
68static void gdlm_bast(void *arg, int mode)
69{
70 struct gfs2_glock *gl = arg;
71
72 switch (mode) {
73 case DLM_LOCK_EX:
74 gfs2_glock_cb(gl, LM_ST_UNLOCKED);
75 break;
76 case DLM_LOCK_CW:
77 gfs2_glock_cb(gl, LM_ST_DEFERRED);
78 break;
79 case DLM_LOCK_PR:
80 gfs2_glock_cb(gl, LM_ST_SHARED);
81 break;
82 default:
83 printk(KERN_ERR "unknown bast mode %d", mode);
84 BUG();
85 }
86}
87
88/* convert gfs lock-state to dlm lock-mode */
89
90static int make_mode(const unsigned int lmstate)
91{
92 switch (lmstate) {
93 case LM_ST_UNLOCKED:
94 return DLM_LOCK_NL;
95 case LM_ST_EXCLUSIVE:
96 return DLM_LOCK_EX;
97 case LM_ST_DEFERRED:
98 return DLM_LOCK_CW;
99 case LM_ST_SHARED:
100 return DLM_LOCK_PR;
101 }
102 printk(KERN_ERR "unknown LM state %d", lmstate);
103 BUG();
104 return -1;
105}
106
107static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
108 const int req)
109{
110 u32 lkf = 0;
111
112 if (gfs_flags & LM_FLAG_TRY)
113 lkf |= DLM_LKF_NOQUEUE;
114
115 if (gfs_flags & LM_FLAG_TRY_1CB) {
116 lkf |= DLM_LKF_NOQUEUE;
117 lkf |= DLM_LKF_NOQUEUEBAST;
118 }
119
120 if (gfs_flags & LM_FLAG_PRIORITY) {
121 lkf |= DLM_LKF_NOORDER;
122 lkf |= DLM_LKF_HEADQUE;
123 }
124
125 if (gfs_flags & LM_FLAG_ANY) {
126 if (req == DLM_LOCK_PR)
127 lkf |= DLM_LKF_ALTCW;
128 else if (req == DLM_LOCK_CW)
129 lkf |= DLM_LKF_ALTPR;
130 else
131 BUG();
132 }
133
134 if (lkid != 0)
135 lkf |= DLM_LKF_CONVERT;
136
137 lkf |= DLM_LKF_VALBLK;
138
139 return lkf;
140}
141
142static unsigned int gdlm_lock(struct gfs2_glock *gl,
143 unsigned int req_state, unsigned int flags)
144{
145 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
146 int error;
147 int req;
148 u32 lkf;
149
150 gl->gl_req = req_state;
151 req = make_mode(req_state);
152 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
153
154 /*
155 * Submit the actual lock request.
156 */
157
158 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
159 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
160 if (error == -EAGAIN)
161 return 0;
162 if (error)
163 return LM_OUT_ERROR;
164 return LM_OUT_ASYNC;
165}
166
167static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
168{
169 struct gfs2_glock *gl = ptr;
170 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
171 int error;
172
173 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl);
175 return;
176 }
177
178 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
179 NULL, gl);
180 if (error) {
181 printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n",
182 gl->gl_name.ln_type,
183 (unsigned long long)gl->gl_name.ln_number, error);
184 return;
185 }
186}
187
188static void gdlm_cancel(struct gfs2_glock *gl)
189{
190 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
191 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
192}
193
194static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
195{
196 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
197 int error;
198
199 if (fsname == NULL) {
200 fs_info(sdp, "no fsname found\n");
201 return -EINVAL;
202 }
203
204 error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
205 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
206 (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
207 GDLM_LVB_SIZE);
208 if (error)
209 printk(KERN_ERR "dlm_new_lockspace error %d", error);
210
211 return error;
212}
213
214static void gdlm_unmount(struct gfs2_sbd *sdp)
215{
216 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
217
218 if (ls->ls_dlm) {
219 dlm_release_lockspace(ls->ls_dlm, 2);
220 ls->ls_dlm = NULL;
221 }
222}
223
224static const match_table_t dlm_tokens = {
225 { Opt_jid, "jid=%d"},
226 { Opt_id, "id=%d"},
227 { Opt_first, "first=%d"},
228 { Opt_nodir, "nodir=%d"},
229 { Opt_err, NULL },
230};
231
232const struct lm_lockops gfs2_dlm_ops = {
233 .lm_proto_name = "lock_dlm",
234 .lm_mount = gdlm_mount,
235 .lm_unmount = gdlm_unmount,
236 .lm_put_lock = gdlm_put_lock,
237 .lm_lock = gdlm_lock,
238 .lm_cancel = gdlm_cancel,
239 .lm_tokens = &dlm_tokens,
240};
241
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
deleted file mode 100644
index 523243a13a21..000000000000
--- a/fs/gfs2/locking.c
+++ /dev/null
@@ -1,232 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19#include <linux/lm_interface.h>
20
/* List node binding one registered lock protocol into lmh_list. */
struct lmh_wrapper {
	struct list_head lw_list;
	const struct lm_lockops *lw_ops;
};
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, void *cb_data,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj);
31
32/* List of registered low-level locking protocols. A file system selects one
33 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
34
/* Built-in "lock_nolock" protocol: single-node mounts, no cluster locking. */
static const struct lm_lockops nolock_ops = {
	.lm_proto_name = "lock_nolock",
	.lm_mount = nolock_mount,
};

/* Statically initialized wrapper so lock_nolock needs no registration call;
   it is spliced onto lmh_list lazily in gfs2_mount_lockproto(). */
static struct lmh_wrapper nolock_proto = {
	.lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
	.lw_ops = &nolock_ops,
};
44
45static LIST_HEAD(lmh_list);
46static DEFINE_MUTEX(lmh_lock);
47
48static int nolock_mount(char *table_name, char *host_data,
49 lm_callback_t cb, void *cb_data,
50 unsigned int min_lvb_size, int flags,
51 struct lm_lockstruct *lockstruct,
52 struct kobject *fskobj)
53{
54 char *c;
55 unsigned int jid;
56
57 c = strstr(host_data, "jid=");
58 if (!c)
59 jid = 0;
60 else {
61 c += 4;
62 sscanf(c, "%u", &jid);
63 }
64
65 lockstruct->ls_jid = jid;
66 lockstruct->ls_first = 1;
67 lockstruct->ls_lvb_size = min_lvb_size;
68 lockstruct->ls_ops = &nolock_ops;
69 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
70
71 return 0;
72}
73
74/**
75 * gfs2_register_lockproto - Register a low-level locking protocol
76 * @proto: the protocol definition
77 *
78 * Returns: 0 on success, -EXXX on failure
79 */
80
/**
 * gfs2_register_lockproto - Register a low-level locking protocol
 * @proto: the protocol definition
 *
 * Returns: 0 on success, -EEXIST if a protocol of the same name is
 * already registered, -ENOMEM on allocation failure.
 */

int gfs2_register_lockproto(const struct lm_lockops *proto)
{
	struct lmh_wrapper *lw;

	mutex_lock(&lmh_lock);

	/* Reject duplicate registrations by protocol name. */
	list_for_each_entry(lw, &lmh_list, lw_list) {
		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
			mutex_unlock(&lmh_lock);
			printk(KERN_INFO "GFS2: protocol %s already exists\n",
			       proto->lm_proto_name);
			return -EEXIST;
		}
	}

	lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
	if (!lw) {
		mutex_unlock(&lmh_lock);
		return -ENOMEM;
	}

	lw->lw_ops = proto;
	list_add(&lw->lw_list, &lmh_list);

	mutex_unlock(&lmh_lock);

	return 0;
}
109
110/**
111 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
112 * @proto: the protocol definition
113 *
114 */
115
/**
 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
 * @proto: the protocol definition
 *
 * Removes and frees the matching wrapper; logs a warning if the
 * protocol was never registered.
 */

void gfs2_unregister_lockproto(const struct lm_lockops *proto)
{
	struct lmh_wrapper *lw;

	mutex_lock(&lmh_lock);

	list_for_each_entry(lw, &lmh_list, lw_list) {
		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
			list_del(&lw->lw_list);
			mutex_unlock(&lmh_lock);
			kfree(lw);
			return;
		}
	}

	mutex_unlock(&lmh_lock);

	printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
	       proto->lm_proto_name);
}
136
137/**
138 * gfs2_mount_lockproto - Mount a lock protocol
139 * @proto_name - the name of the protocol
140 * @table_name - the name of the lock space
141 * @host_data - data specific to this host
142 * @cb - the callback to the code using the lock module
 * @cb_data - private data passed back through the callback
144 * @min_lvb_size - the mininum LVB size that the caller can deal with
145 * @flags - LM_MFLAG_*
146 * @lockstruct - a structure returned describing the mount
147 *
148 * Returns: 0 on success, -EXXX on failure
149 */
150
int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
			 lm_callback_t cb, void *cb_data,
			 unsigned int min_lvb_size, int flags,
			 struct lm_lockstruct *lockstruct,
			 struct kobject *fskobj)
{
	struct lmh_wrapper *lw = NULL;
	int try = 0;	/* set once request_module() has been attempted */
	int error, found;

retry:
	mutex_lock(&lmh_lock);

	/* Lazily splice the built-in lock_nolock protocol onto the list. */
	if (list_empty(&nolock_proto.lw_list))
		list_add(&nolock_proto.lw_list, &lmh_list);

	found = 0;
	list_for_each_entry(lw, &lmh_list, lw_list) {
		if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
			found = 1;
			break;
		}
	}

	if (!found) {
		/* One attempt to auto-load the protocol module, then give up. */
		if (!try && capable(CAP_SYS_MODULE)) {
			try = 1;
			mutex_unlock(&lmh_lock);
			request_module(proto_name);
			goto retry;
		}
		printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
		error = -ENOENT;
		goto out;
	}

	/*
	 * Pin the protocol module.  If it is mid-unload, drop the mutex,
	 * wait, and retry from scratch (try is reset so the module may be
	 * re-requested).  NOTE(review): there is no bound on this retry
	 * loop — confirm a permanently-unloading owner cannot stall mount.
	 */
	if (lw->lw_ops->lm_owner &&
	    !try_module_get(lw->lw_ops->lm_owner)) {
		try = 0;
		mutex_unlock(&lmh_lock);
		msleep(1000);
		goto retry;
	}

	error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
				     min_lvb_size, flags, lockstruct, fskobj);
	if (error)
		module_put(lw->lw_ops->lm_owner);
out:
	mutex_unlock(&lmh_lock);
	return error;
}
204
/*
 * gfs2_unmount_lockproto - normal unmount of a lock protocol.
 *
 * Calls the protocol's optional lm_unmount hook and drops the module
 * reference taken in gfs2_mount_lockproto().
 */
void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
{
	mutex_lock(&lmh_lock);
	if (lockstruct->ls_ops->lm_unmount)
		lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
	if (lockstruct->ls_ops->lm_owner)
		module_put(lockstruct->ls_ops->lm_owner);
	mutex_unlock(&lmh_lock);
}
214
215/**
216 * gfs2_withdraw_lockproto - abnormally unmount a lock module
217 * @lockstruct: the lockstruct passed into mount
218 *
219 */
220
/**
 * gfs2_withdraw_lockproto - abnormally unmount a lock module
 * @lockstruct: the lockstruct passed into mount
 *
 * Unlike gfs2_unmount_lockproto(), lm_withdraw is called
 * unconditionally here (no NULL check on the hook).
 */

void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
{
	mutex_lock(&lmh_lock);
	lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
	if (lockstruct->ls_ops->lm_owner)
		module_put(lockstruct->ls_ops->lm_owner);
	mutex_unlock(&lmh_lock);
}
229
230EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
231EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
232
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
deleted file mode 100644
index 2609bb6cd013..000000000000
--- a/fs/gfs2/locking/dlm/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
deleted file mode 100644
index 2482c9047505..000000000000
--- a/fs/gfs2/locking/dlm/lock.c
+++ /dev/null
@@ -1,708 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14
15/* convert dlm lock-mode to gfs lock-state */
16
17static s16 gdlm_make_lmstate(s16 dlmmode)
18{
19 switch (dlmmode) {
20 case DLM_LOCK_IV:
21 case DLM_LOCK_NL:
22 return LM_ST_UNLOCKED;
23 case DLM_LOCK_EX:
24 return LM_ST_EXCLUSIVE;
25 case DLM_LOCK_CW:
26 return LM_ST_DEFERRED;
27 case DLM_LOCK_PR:
28 return LM_ST_SHARED;
29 }
30 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
31 return -1;
32}
33
34/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
35 thread gets to it. */
36
/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
   thread gets to it. */

static void queue_submit(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	list_add_tail(&lp->delay_list, &ls->submit);
	spin_unlock(&ls->async_lock);
	wake_up(&ls->thread_wait);	/* kick the lock_dlm thread */
}
46
/*
 * wake_up_ast - release a waiter blocked on LFL_AST_WAIT.
 * The barrier orders the flag clear before the wake-up, pairing with
 * the wait_on_bit() in hold_null_lock().
 */
static void wake_up_ast(struct gdlm_lock *lp)
{
	clear_bit(LFL_AST_WAIT, &lp->flags);
	smp_mb__after_clear_bit();
	wake_up_bit(&lp->flags, LFL_AST_WAIT);
}
53
/*
 * gdlm_delete_lp - remove a lock from any pending queue and free it.
 * Also decrements the lockspace's total lock count.
 */
static void gdlm_delete_lp(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	if (!list_empty(&lp->delay_list))
		list_del_init(&lp->delay_list);
	ls->all_locks_count--;
	spin_unlock(&ls->async_lock);

	kfree(lp);
}
66
/*
 * gdlm_queue_delayed - park a lock on the delayed list.
 * Used while DFL_BLOCK_LOCKS is set; gdlm_submit_delayed() later moves
 * these onto the submit list.  No thread wake-up here, by design.
 */
static void gdlm_queue_delayed(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	list_add_tail(&lp->delay_list, &ls->delayed);
	spin_unlock(&ls->async_lock);
}
75
/*
 * process_complete - handle a dlm completion AST for one lock.
 *
 * Inspects lksb status/flags and the lock's LFL_ bits to decide whether
 * to finish an unlock, report a cancel, re-request a lock demoted during
 * recovery, or deliver a normal completion to GFS via ls->fscb.
 * Paths that "return" deliver no callback; paths that reach "out" do
 * (unless the lock is an internal one, which just wakes its waiter).
 */
static void process_complete(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;
	struct lm_async_cb acb;

	memset(&acb, 0, sizeof(acb));

	/* dlm-level cancel completed: restore the previous mode. */
	if (lp->lksb.sb_status == -DLM_ECANCEL) {
		log_info("complete dlm cancel %x,%llx flags %lx",
			 lp->lockname.ln_type,
			 (unsigned long long)lp->lockname.ln_number,
			 lp->flags);

		lp->req = lp->cur;
		acb.lc_ret |= LM_OUT_CANCELED;
		if (lp->cur == DLM_LOCK_IV)
			lp->lksb.sb_lkid = 0;
		goto out;
	}

	/* Completion of an unlock we submitted (gdlm_do_unlock). */
	if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
		if (lp->lksb.sb_status != -DLM_EUNLOCK) {
			log_info("unlock sb_status %d %x,%llx flags %lx",
				 lp->lksb.sb_status, lp->lockname.ln_type,
				 (unsigned long long)lp->lockname.ln_number,
				 lp->flags);
			return;
		}

		lp->cur = DLM_LOCK_IV;
		lp->req = DLM_LOCK_IV;
		lp->lksb.sb_lkid = 0;

		/* unhold_null_lock() asked for deletion after the unlock. */
		if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
			gdlm_delete_lp(lp);
			return;
		}
		goto out;
	}

	if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
		memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);

	/* dlm granted the alternate mode requested via ALTPR/ALTCW. */
	if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
		if (lp->req == DLM_LOCK_PR)
			lp->req = DLM_LOCK_CW;
		else if (lp->req == DLM_LOCK_CW)
			lp->req = DLM_LOCK_PR;
	}

	/*
	 * A canceled lock request.  The lock was just taken off the delayed
	 * list and was never even submitted to dlm.
	 */

	if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
		log_info("complete internal cancel %x,%llx",
			 lp->lockname.ln_type,
			 (unsigned long long)lp->lockname.ln_number);
		lp->req = lp->cur;
		acb.lc_ret |= LM_OUT_CANCELED;
		goto out;
	}

	/*
	 * An error occurred.
	 */

	if (lp->lksb.sb_status) {
		/* a "normal" error: NOQUEUE request that could not be
		   granted immediately */
		if ((lp->lksb.sb_status == -EAGAIN) &&
		    (lp->lkf & DLM_LKF_NOQUEUE)) {
			lp->req = lp->cur;
			if (lp->cur == DLM_LOCK_IV)
				lp->lksb.sb_lkid = 0;
			goto out;
		}

		/* this could only happen with cancels I think */
		log_info("ast sb_status %d %x,%llx flags %lx",
			 lp->lksb.sb_status, lp->lockname.ln_type,
			 (unsigned long long)lp->lockname.ln_number,
			 lp->flags);
		return;
	}

	/*
	 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
	 */

	if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
		wake_up_ast(lp);
		return;
	}

	/*
	 * A lock has been demoted to NL because it initially completed during
	 * BLOCK_LOCKS.  Now it must be requested in the originally requested
	 * mode.
	 */

	if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
		gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
			    lp->lockname.ln_type,
			    (unsigned long long)lp->lockname.ln_number);
		gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
			    lp->lockname.ln_type,
			    (unsigned long long)lp->lockname.ln_number);

		lp->cur = DLM_LOCK_NL;
		lp->req = lp->prev_req;
		lp->prev_req = DLM_LOCK_IV;
		lp->lkf &= ~DLM_LKF_CONVDEADLK;

		set_bit(LFL_NOCACHE, &lp->flags);

		if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
		    !test_bit(LFL_NOBLOCK, &lp->flags))
			gdlm_queue_delayed(lp);
		else
			queue_submit(lp);
		return;
	}

	/*
	 * A request is granted during dlm recovery.  It may be granted
	 * because the locks of a failed node were cleared.  In that case,
	 * there may be inconsistent data beneath this lock and we must wait
	 * for recovery to complete to use it.  When gfs recovery is done this
	 * granted lock will be converted to NL and then reacquired in this
	 * granted state.
	 */

	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
	    !test_bit(LFL_NOBLOCK, &lp->flags) &&
	    lp->req != DLM_LOCK_NL) {

		lp->cur = lp->req;
		lp->prev_req = lp->req;
		lp->req = DLM_LOCK_NL;
		lp->lkf |= DLM_LKF_CONVERT;
		lp->lkf &= ~DLM_LKF_CONVDEADLK;

		log_debug("rereq %x,%llx id %x %d,%d",
			  lp->lockname.ln_type,
			  (unsigned long long)lp->lockname.ln_number,
			  lp->lksb.sb_lkid, lp->cur, lp->req);

		set_bit(LFL_REREQUEST, &lp->flags);
		queue_submit(lp);
		return;
	}

	/*
	 * DLM demoted the lock to NL before it was granted so GFS must be
	 * told it cannot cache data for this lock.
	 */

	if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
		set_bit(LFL_NOCACHE, &lp->flags);

out:
	/*
	 * This is an internal lock_dlm lock
	 */

	if (test_bit(LFL_INLOCK, &lp->flags)) {
		clear_bit(LFL_NOBLOCK, &lp->flags);
		lp->cur = lp->req;
		wake_up_ast(lp);
		return;
	}

	/*
	 * Normal completion of a lock request.  Tell GFS it now has the lock.
	 */

	clear_bit(LFL_NOBLOCK, &lp->flags);
	lp->cur = lp->req;

	acb.lc_name = lp->lockname;
	acb.lc_ret |= gdlm_make_lmstate(lp->cur);

	ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
}
261
/* dlm completion AST entry point; runs process_complete() directly. */
static void gdlm_ast(void *astarg)
{
	struct gdlm_lock *lp = astarg;
	clear_bit(LFL_ACTIVE, &lp->flags);
	process_complete(lp);
}
268
/*
 * process_blocking - translate a dlm blocking AST into a GFS callback.
 * @bast_mode: dlm mode the blocked request is asking for
 *
 * GFS is told which state another node needs (LM_CB_NEED_*) so it can
 * release or demote its lock.
 */
static void process_blocking(struct gdlm_lock *lp, int bast_mode)
{
	struct gdlm_ls *ls = lp->ls;
	unsigned int cb = 0;

	switch (gdlm_make_lmstate(bast_mode)) {
	case LM_ST_EXCLUSIVE:
		cb = LM_CB_NEED_E;
		break;
	case LM_ST_DEFERRED:
		cb = LM_CB_NEED_D;
		break;
	case LM_ST_SHARED:
		cb = LM_CB_NEED_S;
		break;
	default:
		gdlm_assert(0, "unknown bast mode %u", bast_mode);
	}

	ls->fscb(ls->sdp, cb, &lp->lockname);
}
290
291
/* dlm blocking AST entry point; mode 0 is logged and ignored. */
static void gdlm_bast(void *astarg, int mode)
{
	struct gdlm_lock *lp = astarg;

	if (!mode) {
		printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
		       lp->lockname.ln_type,
		       (unsigned long long)lp->lockname.ln_number);
		return;
	}

	process_blocking(lp, mode);
}
305
306/* convert gfs lock-state to dlm lock-mode */
307
308static s16 make_mode(s16 lmstate)
309{
310 switch (lmstate) {
311 case LM_ST_UNLOCKED:
312 return DLM_LOCK_NL;
313 case LM_ST_EXCLUSIVE:
314 return DLM_LOCK_EX;
315 case LM_ST_DEFERRED:
316 return DLM_LOCK_CW;
317 case LM_ST_SHARED:
318 return DLM_LOCK_PR;
319 }
320 gdlm_assert(0, "unknown LM state %d", lmstate);
321 return -1;
322}
323
324
325/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
326 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
327
/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */

static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
{
	s16 cur = make_mode(cur_state);
	/* IV (never locked) is exempt: GFS has no distinct state for it. */
	if (lp->cur != DLM_LOCK_IV)
		gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
}
334
335static inline unsigned int make_flags(struct gdlm_lock *lp,
336 unsigned int gfs_flags,
337 s16 cur, s16 req)
338{
339 unsigned int lkf = 0;
340
341 if (gfs_flags & LM_FLAG_TRY)
342 lkf |= DLM_LKF_NOQUEUE;
343
344 if (gfs_flags & LM_FLAG_TRY_1CB) {
345 lkf |= DLM_LKF_NOQUEUE;
346 lkf |= DLM_LKF_NOQUEUEBAST;
347 }
348
349 if (gfs_flags & LM_FLAG_PRIORITY) {
350 lkf |= DLM_LKF_NOORDER;
351 lkf |= DLM_LKF_HEADQUE;
352 }
353
354 if (gfs_flags & LM_FLAG_ANY) {
355 if (req == DLM_LOCK_PR)
356 lkf |= DLM_LKF_ALTCW;
357 else if (req == DLM_LOCK_CW)
358 lkf |= DLM_LKF_ALTPR;
359 }
360
361 if (lp->lksb.sb_lkid != 0) {
362 lkf |= DLM_LKF_CONVERT;
363 }
364
365 if (lp->lvb)
366 lkf |= DLM_LKF_VALBLK;
367
368 return lkf;
369}
370
371/* make_strname - convert GFS lock numbers to a string */
372
/* make_strname - convert GFS lock numbers to a string */

static inline void make_strname(const struct lm_lockname *lockname,
				struct gdlm_strname *str)
{
	/*
	 * NOTE(review): "%8x%16llx" renders exactly 24 characters, and
	 * sprintf() writes a 25th byte for the NUL terminator while
	 * str->name is only GDLM_STRNAME_BYTES (24) bytes.  The extra
	 * byte appears to land in str->namelen, which is overwritten
	 * immediately below — confirm, and consider sizing the buffer
	 * at GDLM_STRNAME_BYTES + 1.
	 */
	sprintf(str->name, "%8x%16llx", lockname->ln_type,
		(unsigned long long)lockname->ln_number);
	str->namelen = GDLM_STRNAME_BYTES;
}
380
/*
 * gdlm_create_lp - allocate and initialize a gdlm_lock.
 * @lpp: set to the new lock on success; untouched on failure.
 *
 * Returns 0 or -ENOMEM.  The lock starts unlocked (DLM_LOCK_IV) and is
 * counted in ls->all_locks_count until gdlm_delete_lp().
 */
static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
			  struct gdlm_lock **lpp)
{
	struct gdlm_lock *lp;

	lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
	if (!lp)
		return -ENOMEM;

	lp->lockname = *name;
	make_strname(name, &lp->strname);
	lp->ls = ls;
	lp->cur = DLM_LOCK_IV;
	INIT_LIST_HEAD(&lp->delay_list);

	spin_lock(&ls->async_lock);
	ls->all_locks_count++;
	spin_unlock(&ls->async_lock);

	*lpp = lp;
	return 0;
}
403
404int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
405 void **lockp)
406{
407 struct gdlm_lock *lp;
408 int error;
409
410 error = gdlm_create_lp(lockspace, name, &lp);
411
412 *lockp = lp;
413 return error;
414}
415
/* lm_put_lock hook: free a lock that holds nothing in the dlm. */
void gdlm_put_lock(void *lock)
{
	gdlm_delete_lp(lock);
}
420
/*
 * gdlm_do_lock - submit lp->req/lp->lkf to the dlm.
 *
 * Returns LM_OUT_ASYNC when the request was queued (completion arrives
 * via gdlm_ast) or LM_OUT_ERROR on a submission failure.
 */
unsigned int gdlm_do_lock(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;
	int error, bast = 1;

	/*
	 * When recovery is in progress, hold back lock requests; they are
	 * submitted once recovery is done.  Requests for recovery (NOEXP)
	 * and unlocks can pass.
	 */

	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
	    !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
		gdlm_queue_delayed(lp);
		return LM_OUT_ASYNC;
	}

	/*
	 * Submit the actual lock request.
	 */

	if (test_bit(LFL_NOBAST, &lp->flags))
		bast = 0;

	set_bit(LFL_ACTIVE, &lp->flags);

	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
		  (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
		  lp->cur, lp->req, lp->lkf);

	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
			 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
			 lp, bast ? gdlm_bast : NULL);

	/* Synchronous -EAGAIN on a NOQUEUE request: fake the completion. */
	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
		lp->lksb.sb_status = -EAGAIN;
		gdlm_ast(lp);
		error = 0;
	}

	if (error) {
		log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
			  (unsigned long long)lp->lockname.ln_number, error,
			  lp->cur, lp->req, lp->lkf, lp->flags);
		return LM_OUT_ERROR;
	}
	return LM_OUT_ASYNC;
}
470
/*
 * gdlm_do_unlock - submit an unlock for lp to the dlm.
 *
 * LFL_DLM_UNLOCK marks the lock so process_complete() recognizes the
 * resulting -DLM_EUNLOCK completion.  Returns LM_OUT_ASYNC or
 * LM_OUT_ERROR.
 */
static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;
	unsigned int lkf = 0;
	int error;

	set_bit(LFL_DLM_UNLOCK, &lp->flags);
	set_bit(LFL_ACTIVE, &lp->flags);

	/* keep the lock value block through the unlock if one exists */
	if (lp->lvb)
		lkf = DLM_LKF_VALBLK;

	log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
		  (unsigned long long)lp->lockname.ln_number,
		  lp->lksb.sb_lkid, lp->cur, lkf);

	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);

	if (error) {
		log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
			  (unsigned long long)lp->lockname.ln_number, error,
			  lp->cur, lp->req, lp->lkf, lp->flags);
		return LM_OUT_ERROR;
	}
	return LM_OUT_ASYNC;
}
498
499unsigned int gdlm_lock(void *lock, unsigned int cur_state,
500 unsigned int req_state, unsigned int flags)
501{
502 struct gdlm_lock *lp = lock;
503
504 if (req_state == LM_ST_UNLOCKED)
505 return gdlm_unlock(lock, cur_state);
506
507 if (req_state == LM_ST_UNLOCKED)
508 return gdlm_unlock(lock, cur_state);
509
510 clear_bit(LFL_DLM_CANCEL, &lp->flags);
511 if (flags & LM_FLAG_NOEXP)
512 set_bit(LFL_NOBLOCK, &lp->flags);
513
514 check_cur_state(lp, cur_state);
515 lp->req = make_mode(req_state);
516 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
517
518 return gdlm_do_lock(lp);
519}
520
521unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
522{
523 struct gdlm_lock *lp = lock;
524
525 clear_bit(LFL_DLM_CANCEL, &lp->flags);
526 if (lp->cur == DLM_LOCK_IV)
527 return 0;
528 return gdlm_do_unlock(lp);
529}
530
/*
 * gdlm_cancel - lm_cancel hook: try to cancel a pending request.
 *
 * Three cases: (1) the lock was only on the delayed list — cancel it
 * internally without involving the dlm; (2) there is nothing active to
 * cancel — skip; (3) the request is blocked inside the dlm — issue a
 * DLM_LKF_CANCEL unlock.
 */
void gdlm_cancel(void *lock)
{
	struct gdlm_lock *lp = lock;
	struct gdlm_ls *ls = lp->ls;
	int error, delay_list = 0;

	/* a dlm cancel is already in flight */
	if (test_bit(LFL_DLM_CANCEL, &lp->flags))
		return;

	log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
		 (unsigned long long)lp->lockname.ln_number, lp->flags);

	spin_lock(&ls->async_lock);
	if (!list_empty(&lp->delay_list)) {
		list_del_init(&lp->delay_list);
		delay_list = 1;
	}
	spin_unlock(&ls->async_lock);

	/* never reached the dlm: complete the cancel ourselves */
	if (delay_list) {
		set_bit(LFL_CANCEL, &lp->flags);
		set_bit(LFL_ACTIVE, &lp->flags);
		gdlm_ast(lp);
		return;
	}

	if (!test_bit(LFL_ACTIVE, &lp->flags) ||
	    test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
		log_info("gdlm_cancel skip %x,%llx flags %lx",
			 lp->lockname.ln_type,
			 (unsigned long long)lp->lockname.ln_number, lp->flags);
		return;
	}

	/* the lock is blocked in the dlm */

	set_bit(LFL_DLM_CANCEL, &lp->flags);
	set_bit(LFL_ACTIVE, &lp->flags);

	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
			   NULL, lp);

	log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
		 lp->lockname.ln_type,
		 (unsigned long long)lp->lockname.ln_number, lp->flags);

	/* -EBUSY: nothing cancelable; drop the marker so a retry can occur */
	if (error == -EBUSY)
		clear_bit(LFL_DLM_CANCEL, &lp->flags);
}
580
581static int gdlm_add_lvb(struct gdlm_lock *lp)
582{
583 char *lvb;
584
585 lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
586 if (!lvb)
587 return -ENOMEM;
588
589 lp->lksb.sb_lvbptr = lvb;
590 lp->lvb = lvb;
591 return 0;
592}
593
/* Free lp's lock value block and clear both references to it. */
static void gdlm_del_lvb(struct gdlm_lock *lp)
{
	kfree(lp->lvb);
	lp->lvb = NULL;
	lp->lksb.sb_lvbptr = NULL;
}
600
/* wait_on_bit() action: just sleep until woken; never abort the wait. */
static int gdlm_ast_wait(void *word)
{
	schedule();
	return 0;
}
606
607/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
608 the completion) because gfs won't call hold_lvb() during a callback (from
609 the context of a lock_dlm thread). */
610
/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
   the completion) because gfs won't call hold_lvb() during a callback (from
   the context of a lock_dlm thread). */

static int hold_null_lock(struct gdlm_lock *lp)
{
	struct gdlm_lock *lpn = NULL;
	int error;

	if (lp->hold_null) {
		printk(KERN_INFO "lock_dlm: lvb already held\n");
		return 0;
	}

	/* a separate NL lock on the same resource pins the value block */
	error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
	if (error)
		goto out;

	/* we never read this lvb; point at the shared scratch buffer */
	lpn->lksb.sb_lvbptr = junk_lvb;
	lpn->lvb = junk_lvb;

	lpn->req = DLM_LOCK_NL;
	lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
	set_bit(LFL_NOBAST, &lpn->flags);
	set_bit(LFL_INLOCK, &lpn->flags);	/* internal lock: wake, no fscb */
	set_bit(LFL_AST_WAIT, &lpn->flags);

	gdlm_do_lock(lpn);
	wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE);
	error = lpn->lksb.sb_status;
	if (error) {
		printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
		       error);
		gdlm_delete_lp(lpn);
		lpn = NULL;
	}
out:
	lp->hold_null = lpn;
	return error;
}
647
648/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
649 the completion) because gfs may call unhold_lvb() during a callback (from
650 the context of a lock_dlm thread) which could cause a deadlock since the
651 other lock_dlm thread could be engaged in recovery. */
652
/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
   the completion) because gfs may call unhold_lvb() during a callback (from
   the context of a lock_dlm thread) which could cause a deadlock since the
   other lock_dlm thread could be engaged in recovery. */

static void unhold_null_lock(struct gdlm_lock *lp)
{
	struct gdlm_lock *lpn = lp->hold_null;

	gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
		    (unsigned long long)lp->lockname.ln_number);
	lpn->lksb.sb_lvbptr = NULL;
	lpn->lvb = NULL;
	/* the completion path frees lpn once the unlock finishes */
	set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
	gdlm_do_unlock(lpn);
	lp->hold_null = NULL;
}
665
666/* Acquire a NL lock because gfs requires the value block to remain
667 intact on the resource while the lvb is "held" even if it's holding no locks
668 on the resource. */
669
/* Acquire a NL lock because gfs requires the value block to remain
   intact on the resource while the lvb is "held" even if it's holding no locks
   on the resource. */

int gdlm_hold_lvb(void *lock, char **lvbp)
{
	struct gdlm_lock *lp = lock;
	int error;

	error = gdlm_add_lvb(lp);
	if (error)
		return error;

	*lvbp = lp->lvb;

	/* undo the lvb allocation if the pinning NL lock can't be taken */
	error = hold_null_lock(lp);
	if (error)
		gdlm_del_lvb(lp);

	return error;
}
687
/* Release the pinning NL lock and free the lvb.  The @lvb argument is
   unused; the buffer freed is lp->lvb. */
void gdlm_unhold_lvb(void *lock, char *lvb)
{
	struct gdlm_lock *lp = lock;

	unhold_null_lock(lp);
	gdlm_del_lvb(lp);
}
695
/*
 * gdlm_submit_delayed - move every delayed lock onto the submit list
 * and wake the lock_dlm thread to resubmit them (called when the
 * hold-back condition, e.g. recovery, ends).
 */
void gdlm_submit_delayed(struct gdlm_ls *ls)
{
	struct gdlm_lock *lp, *safe;

	spin_lock(&ls->async_lock);
	list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
		list_del_init(&lp->delay_list);
		list_add_tail(&lp->delay_list, &ls->submit);
	}
	spin_unlock(&ls->async_lock);
	wake_up(&ls->thread_wait);
}
708
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
deleted file mode 100644
index 3c98e7c6f93b..000000000000
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ /dev/null
@@ -1,166 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/types.h>
17#include <linux/string.h>
18#include <linux/list.h>
19#include <linux/socket.h>
20#include <linux/delay.h>
21#include <linux/kthread.h>
22#include <linux/kobject.h>
23#include <linux/fcntl.h>
24#include <linux/wait.h>
25#include <net/sock.h>
26
27#include <linux/dlm.h>
28#include <linux/dlm_plock.h>
29#include <linux/lm_interface.h>
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 0
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler.) */
46
/* Hex-string form of a lock name, built by make_strname() with
   "%8x%16llx"; namelen is always set to GDLM_STRNAME_BYTES. */
struct gdlm_strname {
	unsigned char name[GDLM_STRNAME_BYTES];
	unsigned short namelen;
};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
/* Per-mount state for the lock_dlm module (one per lockspace). */
struct gdlm_ls {
	u32 id;
	int jid;			/* this node's journal id */
	int first;
	int first_done;
	unsigned long flags;		/* DFL_ bits */
	struct kobject kobj;
	char clustername[GDLM_NAME_LEN];
	char fsname[GDLM_NAME_LEN];
	int fsflags;
	dlm_lockspace_t *dlm_lockspace;
	lm_callback_t fscb;		/* callback into GFS */
	struct gfs2_sbd *sdp;
	int recover_jid;
	int recover_jid_done;
	int recover_jid_status;
	spinlock_t async_lock;		/* protects delayed/submit lists */
	struct list_head delayed;	/* held back while DFL_BLOCK_LOCKS */
	struct list_head submit;	/* awaiting (re)submission to dlm */
	u32 all_locks_count;
	wait_queue_head_t wait_control;
	struct task_struct *thread;
	wait_queue_head_t thread_wait;	/* wakes the lock_dlm thread */
};
82
83enum {
84 LFL_NOBLOCK = 0,
85 LFL_NOCACHE = 1,
86 LFL_DLM_UNLOCK = 2,
87 LFL_DLM_CANCEL = 3,
88 LFL_SYNC_LVB = 4,
89 LFL_FORCE_PROMOTE = 5,
90 LFL_REREQUEST = 6,
91 LFL_ACTIVE = 7,
92 LFL_INLOCK = 8,
93 LFL_CANCEL = 9,
94 LFL_NOBAST = 10,
95 LFL_HEADQUE = 11,
96 LFL_UNLOCK_DELETE = 12,
97 LFL_AST_WAIT = 13,
98};
99
/* One GFS lock as tracked by lock_dlm; wraps a dlm lksb. */
struct gdlm_lock {
	struct gdlm_ls *ls;
	struct lm_lockname lockname;
	struct gdlm_strname strname;	/* dlm resource name (hex string) */
	char *lvb;			/* lock value block, if held */
	struct dlm_lksb lksb;

	s16 cur;			/* current dlm mode */
	s16 req;			/* requested dlm mode */
	s16 prev_req;			/* mode saved across a recovery demote */
	u32 lkf;		/* dlm flags DLM_LKF_ */
	unsigned long flags;	/* lock_dlm flags LFL_ */

	struct list_head delay_list;	/* delayed */
	struct gdlm_lock *hold_null;	/* NL lock for hold_lvb */
};
116
/*
 * gdlm_assert - BUG() with a formatted message when an invariant fails.
 * Multi-statement macro wrapped in do/while(0); the format arguments
 * are only evaluated on failure.
 */
#define gdlm_assert(assertion, fmt, args...) \
do { \
	if (unlikely(!(assertion))) { \
		printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
				  "lock_dlm: " fmt "\n", \
		       #assertion, ##args); \
		BUG(); \
	} \
} while (0)
126
127#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
128#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
129#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
130#ifdef LOCK_DLM_LOG_DEBUG
131#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
132#else
133#define log_debug(fmt, arg...)
134#endif
135
136/* sysfs.c */
137
138int gdlm_sysfs_init(void);
139void gdlm_sysfs_exit(void);
140int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
141void gdlm_kobject_release(struct gdlm_ls *);
142
143/* thread.c */
144
145int gdlm_init_threads(struct gdlm_ls *);
146void gdlm_release_threads(struct gdlm_ls *);
147
148/* lock.c */
149
150void gdlm_submit_delayed(struct gdlm_ls *);
151unsigned int gdlm_do_lock(struct gdlm_lock *);
152
153int gdlm_get_lock(void *, struct lm_lockname *, void **);
154void gdlm_put_lock(void *);
155unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
156unsigned int gdlm_unlock(void *, unsigned int);
157void gdlm_cancel(void *);
158int gdlm_hold_lvb(void *, char **);
159void gdlm_unhold_lvb(void *, char *);
160
161/* mount.c */
162
163extern const struct lm_lockops gdlm_ops;
164
165#endif
166
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
deleted file mode 100644
index b9a03a7ff801..000000000000
--- a/fs/gfs2/locking/dlm/main.c
+++ /dev/null
@@ -1,48 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
/*
 * init_lock_dlm - module init: register the "lock_dlm" protocol with
 * GFS2 and set up its sysfs interface; unwind registration on failure.
 *
 * NOTE(review): __DATE__/__TIME__ make the build non-reproducible —
 * consider dropping them from the banner.
 */
static int __init init_lock_dlm(void)
{
	int error;

	error = gfs2_register_lockproto(&gdlm_ops);
	if (error) {
		printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
		       error);
		return error;
	}

	error = gdlm_sysfs_init();
	if (error) {
		gfs2_unregister_lockproto(&gdlm_ops);
		return error;
	}

	printk(KERN_INFO
	       "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
	return 0;
}
35
/* Module exit: tear down in reverse order of init_lock_dlm(). */
static void __exit exit_lock_dlm(void)
{
	gdlm_sysfs_exit();
	gfs2_unregister_lockproto(&gdlm_ops);
}
41
42module_init(init_lock_dlm);
43module_exit(exit_lock_dlm);
44
45MODULE_DESCRIPTION("GFS DLM Locking Module");
46MODULE_AUTHOR("Red Hat, Inc.");
47MODULE_LICENSE("GPL");
48
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
deleted file mode 100644
index 1aa7eb6a0226..000000000000
--- a/fs/gfs2/locking/dlm/mount.c
+++ /dev/null
@@ -1,276 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12const struct lm_lockops gdlm_ops;
13
14
15static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
16 int flags, char *table_name)
17{
18 struct gdlm_ls *ls;
19 char buf[256], *p;
20
21 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
22 if (!ls)
23 return NULL;
24
25 ls->fscb = cb;
26 ls->sdp = sdp;
27 ls->fsflags = flags;
28 spin_lock_init(&ls->async_lock);
29 INIT_LIST_HEAD(&ls->delayed);
30 INIT_LIST_HEAD(&ls->submit);
31 init_waitqueue_head(&ls->thread_wait);
32 init_waitqueue_head(&ls->wait_control);
33 ls->jid = -1;
34
35 strncpy(buf, table_name, 256);
36 buf[255] = '\0';
37
38 p = strchr(buf, ':');
39 if (!p) {
40 log_info("invalid table_name \"%s\"", table_name);
41 kfree(ls);
42 return NULL;
43 }
44 *p = '\0';
45 p++;
46
47 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
48 strncpy(ls->fsname, p, GDLM_NAME_LEN);
49
50 return ls;
51}
52
53static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
54{
55 char data[256];
56 char *options, *x, *y;
57 int error = 0;
58
59 memset(data, 0, 256);
60 strncpy(data, data_arg, 255);
61
62 if (!strlen(data)) {
63 log_error("no mount options, (u)mount helpers not installed");
64 return -EINVAL;
65 }
66
67 for (options = data; (x = strsep(&options, ":")); ) {
68 if (!*x)
69 continue;
70
71 y = strchr(x, '=');
72 if (y)
73 *y++ = 0;
74
75 if (!strcmp(x, "jid")) {
76 if (!y) {
77 log_error("need argument to jid");
78 error = -EINVAL;
79 break;
80 }
81 sscanf(y, "%u", &ls->jid);
82
83 } else if (!strcmp(x, "first")) {
84 if (!y) {
85 log_error("need argument to first");
86 error = -EINVAL;
87 break;
88 }
89 sscanf(y, "%u", &ls->first);
90
91 } else if (!strcmp(x, "id")) {
92 if (!y) {
93 log_error("need argument to id");
94 error = -EINVAL;
95 break;
96 }
97 sscanf(y, "%u", &ls->id);
98
99 } else if (!strcmp(x, "nodir")) {
100 if (!y) {
101 log_error("need argument to nodir");
102 error = -EINVAL;
103 break;
104 }
105 sscanf(y, "%u", nodir);
106
107 } else {
108 log_error("unkonwn option: %s", x);
109 error = -EINVAL;
110 break;
111 }
112 }
113
114 return error;
115}
116
117static int gdlm_mount(char *table_name, char *host_data,
118 lm_callback_t cb, void *cb_data,
119 unsigned int min_lvb_size, int flags,
120 struct lm_lockstruct *lockstruct,
121 struct kobject *fskobj)
122{
123 struct gdlm_ls *ls;
124 int error = -ENOMEM, nodir = 0;
125
126 if (min_lvb_size > GDLM_LVB_SIZE)
127 goto out;
128
129 ls = init_gdlm(cb, cb_data, flags, table_name);
130 if (!ls)
131 goto out;
132
133 error = make_args(ls, host_data, &nodir);
134 if (error)
135 goto out;
136
137 error = gdlm_init_threads(ls);
138 if (error)
139 goto out_free;
140
141 error = gdlm_kobject_setup(ls, fskobj);
142 if (error)
143 goto out_thread;
144
145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
146 &ls->dlm_lockspace,
147 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
148 (nodir ? DLM_LSFL_NODIR : 0),
149 GDLM_LVB_SIZE);
150 if (error) {
151 log_error("dlm_new_lockspace error %d", error);
152 goto out_kobj;
153 }
154
155 lockstruct->ls_jid = ls->jid;
156 lockstruct->ls_first = ls->first;
157 lockstruct->ls_lockspace = ls;
158 lockstruct->ls_ops = &gdlm_ops;
159 lockstruct->ls_flags = 0;
160 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
161 return 0;
162
163out_kobj:
164 gdlm_kobject_release(ls);
165out_thread:
166 gdlm_release_threads(ls);
167out_free:
168 kfree(ls);
169out:
170 return error;
171}
172
173static void gdlm_unmount(void *lockspace)
174{
175 struct gdlm_ls *ls = lockspace;
176
177 log_debug("unmount flags %lx", ls->flags);
178
179 /* FIXME: serialize unmount and withdraw in case they
180 happen at once. Also, if unmount follows withdraw,
181 wait for withdraw to finish. */
182
183 if (test_bit(DFL_WITHDRAW, &ls->flags))
184 goto out;
185
186 gdlm_kobject_release(ls);
187 dlm_release_lockspace(ls->dlm_lockspace, 2);
188 gdlm_release_threads(ls);
189 BUG_ON(ls->all_locks_count);
190out:
191 kfree(ls);
192}
193
194static void gdlm_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message)
196{
197 char env_jid[20];
198 char env_status[20];
199 char *envp[] = { env_jid, env_status, NULL };
200 struct gdlm_ls *ls = lockspace;
201 ls->recover_jid_done = jid;
202 ls->recover_jid_status = message;
203 sprintf(env_jid, "JID=%d", jid);
204 sprintf(env_status, "RECOVERY=%s",
205 message == LM_RD_SUCCESS ? "Done" : "Failed");
206 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
207}
208
209static void gdlm_others_may_mount(void *lockspace)
210{
211 char *message = "FIRSTMOUNT=Done";
212 char *envp[] = { message, NULL };
213 struct gdlm_ls *ls = lockspace;
214 ls->first_done = 1;
215 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
216}
217
218/* Userspace gets the offline uevent, blocks new gfs locks on
219 other mounters, and lets us know (sets WITHDRAW flag). Then,
220 userspace leaves the mount group while we leave the lockspace. */
221
222static void gdlm_withdraw(void *lockspace)
223{
224 struct gdlm_ls *ls = lockspace;
225
226 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
227
228 wait_event_interruptible(ls->wait_control,
229 test_bit(DFL_WITHDRAW, &ls->flags));
230
231 dlm_release_lockspace(ls->dlm_lockspace, 2);
232 gdlm_release_threads(ls);
233 gdlm_kobject_release(ls);
234}
235
236static int gdlm_plock(void *lockspace, struct lm_lockname *name,
237 struct file *file, int cmd, struct file_lock *fl)
238{
239 struct gdlm_ls *ls = lockspace;
240 return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
241}
242
243static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
244 struct file *file, struct file_lock *fl)
245{
246 struct gdlm_ls *ls = lockspace;
247 return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
248}
249
250static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
251 struct file *file, struct file_lock *fl)
252{
253 struct gdlm_ls *ls = lockspace;
254 return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
255}
256
257const struct lm_lockops gdlm_ops = {
258 .lm_proto_name = "lock_dlm",
259 .lm_mount = gdlm_mount,
260 .lm_others_may_mount = gdlm_others_may_mount,
261 .lm_unmount = gdlm_unmount,
262 .lm_withdraw = gdlm_withdraw,
263 .lm_get_lock = gdlm_get_lock,
264 .lm_put_lock = gdlm_put_lock,
265 .lm_lock = gdlm_lock,
266 .lm_unlock = gdlm_unlock,
267 .lm_plock = gdlm_plock,
268 .lm_punlock = gdlm_punlock,
269 .lm_plock_get = gdlm_plock_get,
270 .lm_cancel = gdlm_cancel,
271 .lm_hold_lvb = gdlm_hold_lvb,
272 .lm_unhold_lvb = gdlm_unhold_lvb,
273 .lm_recovery_done = gdlm_recovery_done,
274 .lm_owner = THIS_MODULE,
275};
276
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
deleted file mode 100644
index 9b7edcf7bd49..000000000000
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ /dev/null
@@ -1,226 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
16{
17 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
18}
19
20static ssize_t block_show(struct gdlm_ls *ls, char *buf)
21{
22 ssize_t ret;
23 int val = 0;
24
25 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
26 val = 1;
27 ret = sprintf(buf, "%d\n", val);
28 return ret;
29}
30
31static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
32{
33 ssize_t ret = len;
34 int val;
35
36 val = simple_strtol(buf, NULL, 0);
37
38 if (val == 1)
39 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
40 else if (val == 0) {
41 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 gdlm_submit_delayed(ls);
43 } else {
44 ret = -EINVAL;
45 }
46 return ret;
47}
48
49static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
50{
51 ssize_t ret;
52 int val = 0;
53
54 if (test_bit(DFL_WITHDRAW, &ls->flags))
55 val = 1;
56 ret = sprintf(buf, "%d\n", val);
57 return ret;
58}
59
60static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
61{
62 ssize_t ret = len;
63 int val;
64
65 val = simple_strtol(buf, NULL, 0);
66
67 if (val == 1)
68 set_bit(DFL_WITHDRAW, &ls->flags);
69 else
70 ret = -EINVAL;
71 wake_up(&ls->wait_control);
72 return ret;
73}
74
75static ssize_t id_show(struct gdlm_ls *ls, char *buf)
76{
77 return sprintf(buf, "%u\n", ls->id);
78}
79
80static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
81{
82 return sprintf(buf, "%d\n", ls->jid);
83}
84
85static ssize_t first_show(struct gdlm_ls *ls, char *buf)
86{
87 return sprintf(buf, "%d\n", ls->first);
88}
89
90static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
91{
92 return sprintf(buf, "%d\n", ls->first_done);
93}
94
95static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
96{
97 return sprintf(buf, "%d\n", ls->recover_jid);
98}
99
100static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
101{
102 ls->recover_jid = simple_strtol(buf, NULL, 0);
103 ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
104 return len;
105}
106
107static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
108{
109 return sprintf(buf, "%d\n", ls->recover_jid_done);
110}
111
112static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
113{
114 return sprintf(buf, "%d\n", ls->recover_jid_status);
115}
116
117struct gdlm_attr {
118 struct attribute attr;
119 ssize_t (*show)(struct gdlm_ls *, char *);
120 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
121};
122
123#define GDLM_ATTR(_name,_mode,_show,_store) \
124static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
125
126GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
127GDLM_ATTR(block, 0644, block_show, block_store);
128GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
129GDLM_ATTR(id, 0444, id_show, NULL);
130GDLM_ATTR(jid, 0444, jid_show, NULL);
131GDLM_ATTR(first, 0444, first_show, NULL);
132GDLM_ATTR(first_done, 0444, first_done_show, NULL);
133GDLM_ATTR(recover, 0644, recover_show, recover_store);
134GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
135GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
136
137static struct attribute *gdlm_attrs[] = {
138 &gdlm_attr_proto_name.attr,
139 &gdlm_attr_block.attr,
140 &gdlm_attr_withdraw.attr,
141 &gdlm_attr_id.attr,
142 &gdlm_attr_jid.attr,
143 &gdlm_attr_first.attr,
144 &gdlm_attr_first_done.attr,
145 &gdlm_attr_recover.attr,
146 &gdlm_attr_recover_done.attr,
147 &gdlm_attr_recover_status.attr,
148 NULL,
149};
150
151static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
152 char *buf)
153{
154 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
155 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
156 return a->show ? a->show(ls, buf) : 0;
157}
158
159static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
160 const char *buf, size_t len)
161{
162 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
163 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
164 return a->store ? a->store(ls, buf, len) : len;
165}
166
167static struct sysfs_ops gdlm_attr_ops = {
168 .show = gdlm_attr_show,
169 .store = gdlm_attr_store,
170};
171
172static struct kobj_type gdlm_ktype = {
173 .default_attrs = gdlm_attrs,
174 .sysfs_ops = &gdlm_attr_ops,
175};
176
177static struct kset *gdlm_kset;
178
179int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
180{
181 int error;
182
183 ls->kobj.kset = gdlm_kset;
184 error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
185 "lock_module");
186 if (error)
187 log_error("can't register kobj %d", error);
188 kobject_uevent(&ls->kobj, KOBJ_ADD);
189
190 return error;
191}
192
193void gdlm_kobject_release(struct gdlm_ls *ls)
194{
195 kobject_put(&ls->kobj);
196}
197
198static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
199 struct kobj_uevent_env *env)
200{
201 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
202 add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
203 add_uevent_var(env, "LOCKPROTO=lock_dlm");
204 return 0;
205}
206
207static struct kset_uevent_ops gdlm_uevent_ops = {
208 .uevent = gdlm_uevent,
209};
210
211
212int gdlm_sysfs_init(void)
213{
214 gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
215 if (!gdlm_kset) {
216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
217 return -ENOMEM;
218 }
219 return 0;
220}
221
222void gdlm_sysfs_exit(void)
223{
224 kset_unregister(gdlm_kset);
225}
226
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
deleted file mode 100644
index 38823efd698c..000000000000
--- a/fs/gfs2/locking/dlm/thread.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static inline int no_work(struct gdlm_ls *ls)
13{
14 int ret;
15
16 spin_lock(&ls->async_lock);
17 ret = list_empty(&ls->submit);
18 spin_unlock(&ls->async_lock);
19
20 return ret;
21}
22
23static int gdlm_thread(void *data)
24{
25 struct gdlm_ls *ls = (struct gdlm_ls *) data;
26 struct gdlm_lock *lp = NULL;
27
28 while (!kthread_should_stop()) {
29 wait_event_interruptible(ls->thread_wait,
30 !no_work(ls) || kthread_should_stop());
31
32 spin_lock(&ls->async_lock);
33
34 if (!list_empty(&ls->submit)) {
35 lp = list_entry(ls->submit.next, struct gdlm_lock,
36 delay_list);
37 list_del_init(&lp->delay_list);
38 spin_unlock(&ls->async_lock);
39 gdlm_do_lock(lp);
40 spin_lock(&ls->async_lock);
41 }
42 spin_unlock(&ls->async_lock);
43 }
44
45 return 0;
46}
47
48int gdlm_init_threads(struct gdlm_ls *ls)
49{
50 struct task_struct *p;
51 int error;
52
53 p = kthread_run(gdlm_thread, ls, "lock_dlm");
54 error = IS_ERR(p);
55 if (error) {
56 log_error("can't start lock_dlm thread %d", error);
57 return error;
58 }
59 ls->thread = p;
60
61 return 0;
62}
63
64void gdlm_release_threads(struct gdlm_ls *ls)
65{
66 kthread_stop(ls->thread);
67}
68
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ad305854bdc6..98918a756410 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -14,7 +14,6 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/kthread.h> 18#include <linux/kthread.h>
20#include <linux/freezer.h> 19#include <linux/freezer.h>
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4390f6f4047d..80e4f5f898bb 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,7 +13,6 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7cacfde32194..a6892ed0840a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/atomic.h> 17#include <asm/atomic.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
@@ -23,6 +22,12 @@
23#include "sys.h" 22#include "sys.h"
24#include "util.h" 23#include "util.h"
25#include "glock.h" 24#include "glock.h"
25#include "quota.h"
26
27static struct shrinker qd_shrinker = {
28 .shrink = gfs2_shrink_qd_memory,
29 .seeks = DEFAULT_SEEKS,
30};
26 31
27static void gfs2_init_inode_once(void *foo) 32static void gfs2_init_inode_once(void *foo)
28{ 33{
@@ -41,8 +46,6 @@ static void gfs2_init_glock_once(void *foo)
41 INIT_HLIST_NODE(&gl->gl_list); 46 INIT_HLIST_NODE(&gl->gl_list);
42 spin_lock_init(&gl->gl_spin); 47 spin_lock_init(&gl->gl_spin);
43 INIT_LIST_HEAD(&gl->gl_holders); 48 INIT_LIST_HEAD(&gl->gl_holders);
44 gl->gl_lvb = NULL;
45 atomic_set(&gl->gl_lvb_count, 0);
46 INIT_LIST_HEAD(&gl->gl_lru); 49 INIT_LIST_HEAD(&gl->gl_lru);
47 INIT_LIST_HEAD(&gl->gl_ail_list); 50 INIT_LIST_HEAD(&gl->gl_ail_list);
48 atomic_set(&gl->gl_ail_count, 0); 51 atomic_set(&gl->gl_ail_count, 0);
@@ -100,6 +103,8 @@ static int __init init_gfs2_fs(void)
100 if (!gfs2_quotad_cachep) 103 if (!gfs2_quotad_cachep)
101 goto fail; 104 goto fail;
102 105
106 register_shrinker(&qd_shrinker);
107
103 error = register_filesystem(&gfs2_fs_type); 108 error = register_filesystem(&gfs2_fs_type);
104 if (error) 109 if (error)
105 goto fail; 110 goto fail;
@@ -117,6 +122,7 @@ static int __init init_gfs2_fs(void)
117fail_unregister: 122fail_unregister:
118 unregister_filesystem(&gfs2_fs_type); 123 unregister_filesystem(&gfs2_fs_type);
119fail: 124fail:
125 unregister_shrinker(&qd_shrinker);
120 gfs2_glock_exit(); 126 gfs2_glock_exit();
121 127
122 if (gfs2_quotad_cachep) 128 if (gfs2_quotad_cachep)
@@ -145,6 +151,7 @@ fail:
145 151
146static void __exit exit_gfs2_fs(void) 152static void __exit exit_gfs2_fs(void)
147{ 153{
154 unregister_shrinker(&qd_shrinker);
148 gfs2_glock_exit(); 155 gfs2_glock_exit();
149 gfs2_unregister_debugfs(); 156 gfs2_unregister_debugfs();
150 unregister_filesystem(&gfs2_fs_type); 157 unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 09853620c951..8d6f13256b26 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -19,7 +19,6 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23 22
24#include "gfs2.h" 23#include "gfs2.h"
25#include "incore.h" 24#include "incore.h"
@@ -90,27 +89,6 @@ void gfs2_aspace_put(struct inode *aspace)
90} 89}
91 90
92/** 91/**
93 * gfs2_meta_inval - Invalidate all buffers associated with a glock
94 * @gl: the glock
95 *
96 */
97
98void gfs2_meta_inval(struct gfs2_glock *gl)
99{
100 struct gfs2_sbd *sdp = gl->gl_sbd;
101 struct inode *aspace = gl->gl_aspace;
102 struct address_space *mapping = gl->gl_aspace->i_mapping;
103
104 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
105
106 atomic_inc(&aspace->i_writecount);
107 truncate_inode_pages(mapping, 0);
108 atomic_dec(&aspace->i_writecount);
109
110 gfs2_assert_withdraw(sdp, !mapping->nrpages);
111}
112
113/**
114 * gfs2_meta_sync - Sync all buffers associated with a glock 92 * gfs2_meta_sync - Sync all buffers associated with a glock
115 * @gl: The glock 93 * @gl: The glock
116 * 94 *
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b1a5f3674d43..de270c2f9b63 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,7 +40,6 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
41void gfs2_aspace_put(struct inode *aspace); 41void gfs2_aspace_put(struct inode *aspace);
42 42
43void gfs2_meta_inval(struct gfs2_glock *gl);
44void gfs2_meta_sync(struct gfs2_glock *gl); 43void gfs2_meta_sync(struct gfs2_glock *gl);
45 44
46struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); 45struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 3cb0a44ba023..f7e8527a21e0 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -12,12 +12,11 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h>
16#include <linux/parser.h> 15#include <linux/parser.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
20#include "mount.h" 19#include "super.h"
21#include "sys.h" 20#include "sys.h"
22#include "util.h" 21#include "util.h"
23 22
@@ -37,11 +36,15 @@ enum {
37 Opt_quota_off, 36 Opt_quota_off,
38 Opt_quota_account, 37 Opt_quota_account,
39 Opt_quota_on, 38 Opt_quota_on,
39 Opt_quota,
40 Opt_noquota,
40 Opt_suiddir, 41 Opt_suiddir,
41 Opt_nosuiddir, 42 Opt_nosuiddir,
42 Opt_data_writeback, 43 Opt_data_writeback,
43 Opt_data_ordered, 44 Opt_data_ordered,
44 Opt_meta, 45 Opt_meta,
46 Opt_discard,
47 Opt_nodiscard,
45 Opt_err, 48 Opt_err,
46}; 49};
47 50
@@ -61,11 +64,15 @@ static const match_table_t tokens = {
61 {Opt_quota_off, "quota=off"}, 64 {Opt_quota_off, "quota=off"},
62 {Opt_quota_account, "quota=account"}, 65 {Opt_quota_account, "quota=account"},
63 {Opt_quota_on, "quota=on"}, 66 {Opt_quota_on, "quota=on"},
67 {Opt_quota, "quota"},
68 {Opt_noquota, "noquota"},
64 {Opt_suiddir, "suiddir"}, 69 {Opt_suiddir, "suiddir"},
65 {Opt_nosuiddir, "nosuiddir"}, 70 {Opt_nosuiddir, "nosuiddir"},
66 {Opt_data_writeback, "data=writeback"}, 71 {Opt_data_writeback, "data=writeback"},
67 {Opt_data_ordered, "data=ordered"}, 72 {Opt_data_ordered, "data=ordered"},
68 {Opt_meta, "meta"}, 73 {Opt_meta, "meta"},
74 {Opt_discard, "discard"},
75 {Opt_nodiscard, "nodiscard"},
69 {Opt_err, NULL} 76 {Opt_err, NULL}
70}; 77};
71 78
@@ -77,101 +84,46 @@ static const match_table_t tokens = {
77 * Return: errno 84 * Return: errno
78 */ 85 */
79 86
80int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) 87int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
81{ 88{
82 struct gfs2_args *args = &sdp->sd_args; 89 char *o;
83 char *data = data_arg; 90 int token;
84 char *options, *o, *v; 91 substring_t tmp[MAX_OPT_ARGS];
85 int error = 0;
86
87 if (!remount) {
88 /* Set some defaults */
89 args->ar_quota = GFS2_QUOTA_DEFAULT;
90 args->ar_data = GFS2_DATA_DEFAULT;
91 }
92 92
93 /* Split the options into tokens with the "," character and 93 /* Split the options into tokens with the "," character and
94 process them */ 94 process them */
95 95
96 for (options = data; (o = strsep(&options, ",")); ) { 96 while (1) {
97 int token; 97 o = strsep(&options, ",");
98 substring_t tmp[MAX_OPT_ARGS]; 98 if (o == NULL)
99 99 break;
100 if (!*o) 100 if (*o == '\0')
101 continue; 101 continue;
102 102
103 token = match_token(o, tokens, tmp); 103 token = match_token(o, tokens, tmp);
104 switch (token) { 104 switch (token) {
105 case Opt_lockproto: 105 case Opt_lockproto:
106 v = match_strdup(&tmp[0]); 106 match_strlcpy(args->ar_lockproto, &tmp[0],
107 if (!v) { 107 GFS2_LOCKNAME_LEN);
108 fs_info(sdp, "no memory for lockproto\n");
109 error = -ENOMEM;
110 goto out_error;
111 }
112
113 if (remount && strcmp(v, args->ar_lockproto)) {
114 kfree(v);
115 goto cant_remount;
116 }
117
118 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
119 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
120 kfree(v);
121 break; 108 break;
122 case Opt_locktable: 109 case Opt_locktable:
123 v = match_strdup(&tmp[0]); 110 match_strlcpy(args->ar_locktable, &tmp[0],
124 if (!v) { 111 GFS2_LOCKNAME_LEN);
125 fs_info(sdp, "no memory for locktable\n");
126 error = -ENOMEM;
127 goto out_error;
128 }
129
130 if (remount && strcmp(v, args->ar_locktable)) {
131 kfree(v);
132 goto cant_remount;
133 }
134
135 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
136 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
137 kfree(v);
138 break; 112 break;
139 case Opt_hostdata: 113 case Opt_hostdata:
140 v = match_strdup(&tmp[0]); 114 match_strlcpy(args->ar_hostdata, &tmp[0],
141 if (!v) { 115 GFS2_LOCKNAME_LEN);
142 fs_info(sdp, "no memory for hostdata\n");
143 error = -ENOMEM;
144 goto out_error;
145 }
146
147 if (remount && strcmp(v, args->ar_hostdata)) {
148 kfree(v);
149 goto cant_remount;
150 }
151
152 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
153 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
154 kfree(v);
155 break; 116 break;
156 case Opt_spectator: 117 case Opt_spectator:
157 if (remount && !args->ar_spectator)
158 goto cant_remount;
159 args->ar_spectator = 1; 118 args->ar_spectator = 1;
160 sdp->sd_vfs->s_flags |= MS_RDONLY;
161 break; 119 break;
162 case Opt_ignore_local_fs: 120 case Opt_ignore_local_fs:
163 if (remount && !args->ar_ignore_local_fs)
164 goto cant_remount;
165 args->ar_ignore_local_fs = 1; 121 args->ar_ignore_local_fs = 1;
166 break; 122 break;
167 case Opt_localflocks: 123 case Opt_localflocks:
168 if (remount && !args->ar_localflocks)
169 goto cant_remount;
170 args->ar_localflocks = 1; 124 args->ar_localflocks = 1;
171 break; 125 break;
172 case Opt_localcaching: 126 case Opt_localcaching:
173 if (remount && !args->ar_localcaching)
174 goto cant_remount;
175 args->ar_localcaching = 1; 127 args->ar_localcaching = 1;
176 break; 128 break;
177 case Opt_debug: 129 case Opt_debug:
@@ -181,25 +133,23 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
181 args->ar_debug = 0; 133 args->ar_debug = 0;
182 break; 134 break;
183 case Opt_upgrade: 135 case Opt_upgrade:
184 if (remount && !args->ar_upgrade)
185 goto cant_remount;
186 args->ar_upgrade = 1; 136 args->ar_upgrade = 1;
187 break; 137 break;
188 case Opt_acl: 138 case Opt_acl:
189 args->ar_posix_acl = 1; 139 args->ar_posix_acl = 1;
190 sdp->sd_vfs->s_flags |= MS_POSIXACL;
191 break; 140 break;
192 case Opt_noacl: 141 case Opt_noacl:
193 args->ar_posix_acl = 0; 142 args->ar_posix_acl = 0;
194 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
195 break; 143 break;
196 case Opt_quota_off: 144 case Opt_quota_off:
145 case Opt_noquota:
197 args->ar_quota = GFS2_QUOTA_OFF; 146 args->ar_quota = GFS2_QUOTA_OFF;
198 break; 147 break;
199 case Opt_quota_account: 148 case Opt_quota_account:
200 args->ar_quota = GFS2_QUOTA_ACCOUNT; 149 args->ar_quota = GFS2_QUOTA_ACCOUNT;
201 break; 150 break;
202 case Opt_quota_on: 151 case Opt_quota_on:
152 case Opt_quota:
203 args->ar_quota = GFS2_QUOTA_ON; 153 args->ar_quota = GFS2_QUOTA_ON;
204 break; 154 break;
205 case Opt_suiddir: 155 case Opt_suiddir:
@@ -215,29 +165,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
215 args->ar_data = GFS2_DATA_ORDERED; 165 args->ar_data = GFS2_DATA_ORDERED;
216 break; 166 break;
217 case Opt_meta: 167 case Opt_meta:
218 if (remount && args->ar_meta != 1)
219 goto cant_remount;
220 args->ar_meta = 1; 168 args->ar_meta = 1;
221 break; 169 break;
170 case Opt_discard:
171 args->ar_discard = 1;
172 break;
173 case Opt_nodiscard:
174 args->ar_discard = 0;
175 break;
222 case Opt_err: 176 case Opt_err:
223 default: 177 default:
224 fs_info(sdp, "unknown option: %s\n", o); 178 fs_info(sdp, "invalid mount option: %s\n", o);
225 error = -EINVAL; 179 return -EINVAL;
226 goto out_error;
227 } 180 }
228 } 181 }
229 182
230out_error: 183 return 0;
231 if (error)
232 fs_info(sdp, "invalid mount option(s)\n");
233
234 if (data != data_arg)
235 kfree(data);
236
237 return error;
238
239cant_remount:
240 fs_info(sdp, "can't remount with option %s\n", o);
241 return -EINVAL;
242} 184}
243 185
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
deleted file mode 100644
index 401288acfdf3..000000000000
--- a/fs/gfs2/mount.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13struct gfs2_sbd;
14
15int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
16
17#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4ddab67867eb..a6dde1751e17 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -19,7 +19,6 @@
19#include <linux/writeback.h> 19#include <linux/writeback.h>
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
24 23
25#include "gfs2.h" 24#include "gfs2.h"
@@ -442,6 +441,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
442 */ 441 */
443 if (unlikely(page->index)) { 442 if (unlikely(page->index)) {
444 zero_user(page, 0, PAGE_CACHE_SIZE); 443 zero_user(page, 0, PAGE_CACHE_SIZE);
444 SetPageUptodate(page);
445 return 0; 445 return 0;
446 } 446 }
447 447
@@ -1096,6 +1096,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
1096 .releasepage = gfs2_releasepage, 1096 .releasepage = gfs2_releasepage,
1097 .direct_IO = gfs2_direct_IO, 1097 .direct_IO = gfs2_direct_IO,
1098 .migratepage = buffer_migrate_page, 1098 .migratepage = buffer_migrate_page,
1099 .is_partially_uptodate = block_is_partially_uptodate,
1099}; 1100};
1100 1101
1101static const struct address_space_operations gfs2_ordered_aops = { 1102static const struct address_space_operations gfs2_ordered_aops = {
@@ -1111,6 +1112,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
1111 .releasepage = gfs2_releasepage, 1112 .releasepage = gfs2_releasepage,
1112 .direct_IO = gfs2_direct_IO, 1113 .direct_IO = gfs2_direct_IO,
1113 .migratepage = buffer_migrate_page, 1114 .migratepage = buffer_migrate_page,
1115 .is_partially_uptodate = block_is_partially_uptodate,
1114}; 1116};
1115 1117
1116static const struct address_space_operations gfs2_jdata_aops = { 1118static const struct address_space_operations gfs2_jdata_aops = {
@@ -1125,6 +1127,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
1125 .bmap = gfs2_bmap, 1127 .bmap = gfs2_bmap,
1126 .invalidatepage = gfs2_invalidatepage, 1128 .invalidatepage = gfs2_invalidatepage,
1127 .releasepage = gfs2_releasepage, 1129 .releasepage = gfs2_releasepage,
1130 .is_partially_uptodate = block_is_partially_uptodate,
1128}; 1131};
1129 1132
1130void gfs2_set_aops(struct inode *inode) 1133void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index c2ad36330ca3..5eb57b044382 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 7fdeb14ddd1a..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -14,7 +14,6 @@
14#include <linux/exportfs.h> 14#include <linux/exportfs.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 93fe41b67f97..3b9e8de3500b 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -20,9 +20,10 @@
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/lm_interface.h>
24#include <linux/writeback.h> 23#include <linux/writeback.h>
25#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25#include <linux/dlm.h>
26#include <linux/dlm_plock.h>
26 27
27#include "gfs2.h" 28#include "gfs2.h"
28#include "incore.h" 29#include "incore.h"
@@ -354,7 +355,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
354 if (ret) 355 if (ret)
355 goto out; 356 goto out;
356 357
358 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
357 set_bit(GIF_SW_PAGED, &ip->i_flags); 359 set_bit(GIF_SW_PAGED, &ip->i_flags);
360
358 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 361 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
359 if (ret || !alloc_required) 362 if (ret || !alloc_required)
360 goto out_unlock; 363 goto out_unlock;
@@ -560,57 +563,24 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
560 return ret; 563 return ret;
561} 564}
562 565
566#ifdef CONFIG_GFS2_FS_LOCKING_DLM
567
563/** 568/**
564 * gfs2_setlease - acquire/release a file lease 569 * gfs2_setlease - acquire/release a file lease
565 * @file: the file pointer 570 * @file: the file pointer
566 * @arg: lease type 571 * @arg: lease type
567 * @fl: file lock 572 * @fl: file lock
568 * 573 *
574 * We don't currently have a way to enforce a lease across the whole
575 * cluster; until we do, disable leases (by just returning -EINVAL),
576 * unless the administrator has requested purely local locking.
577 *
569 * Returns: errno 578 * Returns: errno
570 */ 579 */
571 580
572static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) 581static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
573{ 582{
574 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 583 return -EINVAL;
575
576 /*
577 * We don't currently have a way to enforce a lease across the whole
578 * cluster; until we do, disable leases (by just returning -EINVAL),
579 * unless the administrator has requested purely local locking.
580 */
581 if (!sdp->sd_args.ar_localflocks)
582 return -EINVAL;
583 return generic_setlease(file, arg, fl);
584}
585
586static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
587 struct file *file, struct file_lock *fl)
588{
589 int error = -EIO;
590 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
591 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
592 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
593 return error;
594}
595
596static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
597 struct file *file, int cmd, struct file_lock *fl)
598{
599 int error = -EIO;
600 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
601 error = sdp->sd_lockstruct.ls_ops->lm_plock(
602 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
603 return error;
604}
605
606static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
607 struct file *file, struct file_lock *fl)
608{
609 int error = -EIO;
610 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
611 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
612 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
613 return error;
614} 584}
615 585
616/** 586/**
@@ -626,9 +596,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
626{ 596{
627 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 597 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
628 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 598 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
629 struct lm_lockname name = 599 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
630 { .ln_number = ip->i_no_addr,
631 .ln_type = LM_TYPE_PLOCK };
632 600
633 if (!(fl->fl_flags & FL_POSIX)) 601 if (!(fl->fl_flags & FL_POSIX))
634 return -ENOLCK; 602 return -ENOLCK;
@@ -640,12 +608,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
640 cmd = F_SETLK; 608 cmd = F_SETLK;
641 fl->fl_type = F_UNLCK; 609 fl->fl_type = F_UNLCK;
642 } 610 }
611 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
612 return -EIO;
643 if (IS_GETLK(cmd)) 613 if (IS_GETLK(cmd))
644 return gfs2_lm_plock_get(sdp, &name, file, fl); 614 return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
645 else if (fl->fl_type == F_UNLCK) 615 else if (fl->fl_type == F_UNLCK)
646 return gfs2_lm_punlock(sdp, &name, file, fl); 616 return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
647 else 617 else
648 return gfs2_lm_plock(sdp, &name, file, cmd, fl); 618 return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
649} 619}
650 620
651static int do_flock(struct file *file, int cmd, struct file_lock *fl) 621static int do_flock(struct file *file, int cmd, struct file_lock *fl)
@@ -732,7 +702,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
732 } 702 }
733} 703}
734 704
735const struct file_operations gfs2_file_fops = { 705const struct file_operations *gfs2_file_fops = &(const struct file_operations){
736 .llseek = gfs2_llseek, 706 .llseek = gfs2_llseek,
737 .read = do_sync_read, 707 .read = do_sync_read,
738 .aio_read = generic_file_aio_read, 708 .aio_read = generic_file_aio_read,
@@ -750,7 +720,7 @@ const struct file_operations gfs2_file_fops = {
750 .setlease = gfs2_setlease, 720 .setlease = gfs2_setlease,
751}; 721};
752 722
753const struct file_operations gfs2_dir_fops = { 723const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
754 .readdir = gfs2_readdir, 724 .readdir = gfs2_readdir,
755 .unlocked_ioctl = gfs2_ioctl, 725 .unlocked_ioctl = gfs2_ioctl,
756 .open = gfs2_open, 726 .open = gfs2_open,
@@ -760,7 +730,9 @@ const struct file_operations gfs2_dir_fops = {
760 .flock = gfs2_flock, 730 .flock = gfs2_flock,
761}; 731};
762 732
763const struct file_operations gfs2_file_fops_nolock = { 733#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
734
735const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){
764 .llseek = gfs2_llseek, 736 .llseek = gfs2_llseek,
765 .read = do_sync_read, 737 .read = do_sync_read,
766 .aio_read = generic_file_aio_read, 738 .aio_read = generic_file_aio_read,
@@ -773,10 +745,10 @@ const struct file_operations gfs2_file_fops_nolock = {
773 .fsync = gfs2_fsync, 745 .fsync = gfs2_fsync,
774 .splice_read = generic_file_splice_read, 746 .splice_read = generic_file_splice_read,
775 .splice_write = generic_file_splice_write, 747 .splice_write = generic_file_splice_write,
776 .setlease = gfs2_setlease, 748 .setlease = generic_setlease,
777}; 749};
778 750
779const struct file_operations gfs2_dir_fops_nolock = { 751const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){
780 .readdir = gfs2_readdir, 752 .readdir = gfs2_readdir,
781 .unlocked_ioctl = gfs2_ioctl, 753 .unlocked_ioctl = gfs2_ioctl,
782 .open = gfs2_open, 754 .open = gfs2_open,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index f91eebdde581..51883b3ad89c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21 20
22#include "gfs2.h" 21#include "gfs2.h"
23#include "incore.h" 22#include "incore.h"
@@ -25,7 +24,6 @@
25#include "glock.h" 24#include "glock.h"
26#include "glops.h" 25#include "glops.h"
27#include "inode.h" 26#include "inode.h"
28#include "mount.h"
29#include "recovery.h" 27#include "recovery.h"
30#include "rgrp.h" 28#include "rgrp.h"
31#include "super.h" 29#include "super.h"
@@ -64,7 +62,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
64 gt->gt_quota_warn_period = 10; 62 gt->gt_quota_warn_period = 10;
65 gt->gt_quota_scale_num = 1; 63 gt->gt_quota_scale_num = 1;
66 gt->gt_quota_scale_den = 1; 64 gt->gt_quota_scale_den = 1;
67 gt->gt_quota_cache_secs = 300;
68 gt->gt_quota_quantum = 60; 65 gt->gt_quota_quantum = 60;
69 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
70 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
@@ -100,7 +97,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
100 mutex_init(&sdp->sd_jindex_mutex); 97 mutex_init(&sdp->sd_jindex_mutex);
101 98
102 INIT_LIST_HEAD(&sdp->sd_quota_list); 99 INIT_LIST_HEAD(&sdp->sd_quota_list);
103 spin_lock_init(&sdp->sd_quota_spin);
104 mutex_init(&sdp->sd_quota_mutex); 100 mutex_init(&sdp->sd_quota_mutex);
105 init_waitqueue_head(&sdp->sd_quota_wait); 101 init_waitqueue_head(&sdp->sd_quota_wait);
106 INIT_LIST_HEAD(&sdp->sd_trunc_list); 102 INIT_LIST_HEAD(&sdp->sd_trunc_list);
@@ -238,6 +234,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
238 234
239 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); 235 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
240 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); 236 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
237 memcpy(sb->sb_uuid, str->sb_uuid, 16);
241} 238}
242 239
243/** 240/**
@@ -299,15 +296,15 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
299 __free_page(page); 296 __free_page(page);
300 return 0; 297 return 0;
301} 298}
299
302/** 300/**
303 * gfs2_read_sb - Read super block 301 * gfs2_read_sb - Read super block
304 * @sdp: The GFS2 superblock 302 * @sdp: The GFS2 superblock
305 * @gl: the glock for the superblock (assumed to be held)
306 * @silent: Don't print message if mount fails 303 * @silent: Don't print message if mount fails
307 * 304 *
308 */ 305 */
309 306
310static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) 307static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
311{ 308{
312 u32 hash_blocks, ind_blocks, leaf_blocks; 309 u32 hash_blocks, ind_blocks, leaf_blocks;
313 u32 tmp_blocks; 310 u32 tmp_blocks;
@@ -527,7 +524,7 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
527 return ret; 524 return ret;
528 } 525 }
529 526
530 ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); 527 ret = gfs2_read_sb(sdp, silent);
531 if (ret) { 528 if (ret) {
532 fs_err(sdp, "can't read superblock: %d\n", ret); 529 fs_err(sdp, "can't read superblock: %d\n", ret);
533 goto out; 530 goto out;
@@ -630,13 +627,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
630 return rc; 627 return rc;
631} 628}
632 629
633static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) 630static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
634{ 631{
635 if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) 632 char *message = "FIRSTMOUNT=Done";
636 return; 633 char *envp[] = { message, NULL };
637 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 634 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
638 sdp->sd_lockstruct.ls_ops->lm_others_may_mount( 635 ls->ls_first_done = 1;
639 sdp->sd_lockstruct.ls_lockspace); 636 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
640} 637}
641 638
642/** 639/**
@@ -796,7 +793,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
796 } 793 }
797 } 794 }
798 795
799 gfs2_lm_others_may_mount(sdp); 796 gfs2_others_may_mount(sdp);
800 } else if (!sdp->sd_args.ar_spectator) { 797 } else if (!sdp->sd_args.ar_spectator) {
801 error = gfs2_recover_journal(sdp->sd_jdesc); 798 error = gfs2_recover_journal(sdp->sd_jdesc);
802 if (error) { 799 if (error) {
@@ -1005,7 +1002,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
1005 goto fail_quotad; 1002 goto fail_quotad;
1006 1003
1007 sdp->sd_log_flush_time = jiffies; 1004 sdp->sd_log_flush_time = jiffies;
1008 sdp->sd_jindex_refresh_time = jiffies;
1009 1005
1010 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 1006 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
1011 error = IS_ERR(p); 1007 error = IS_ERR(p);
@@ -1033,6 +1029,17 @@ fail:
1033 return error; 1029 return error;
1034} 1030}
1035 1031
1032static const match_table_t nolock_tokens = {
1033 { Opt_jid, "jid=%d\n", },
1034 { Opt_err, NULL },
1035};
1036
1037static const struct lm_lockops nolock_ops = {
1038 .lm_proto_name = "lock_nolock",
1039 .lm_put_lock = kmem_cache_free,
1040 .lm_tokens = &nolock_tokens,
1041};
1042
1036/** 1043/**
1037 * gfs2_lm_mount - mount a locking protocol 1044 * gfs2_lm_mount - mount a locking protocol
1038 * @sdp: the filesystem 1045 * @sdp: the filesystem
@@ -1044,31 +1051,73 @@ fail:
1044 1051
1045static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) 1052static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1046{ 1053{
1047 char *proto = sdp->sd_proto_name; 1054 const struct lm_lockops *lm;
1048 char *table = sdp->sd_table_name; 1055 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1049 int flags = LM_MFLAG_CONV_NODROP; 1056 struct gfs2_args *args = &sdp->sd_args;
1050 int error; 1057 const char *proto = sdp->sd_proto_name;
1058 const char *table = sdp->sd_table_name;
1059 const char *fsname;
1060 char *o, *options;
1061 int ret;
1051 1062
1052 if (sdp->sd_args.ar_spectator) 1063 if (!strcmp("lock_nolock", proto)) {
1053 flags |= LM_MFLAG_SPECTATOR; 1064 lm = &nolock_ops;
1065 sdp->sd_args.ar_localflocks = 1;
1066 sdp->sd_args.ar_localcaching = 1;
1067#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1068 } else if (!strcmp("lock_dlm", proto)) {
1069 lm = &gfs2_dlm_ops;
1070#endif
1071 } else {
1072 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto);
1073 return -ENOENT;
1074 }
1054 1075
1055 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); 1076 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
1056 1077
1057 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, 1078 ls->ls_ops = lm;
1058 gfs2_glock_cb, sdp, 1079 ls->ls_first = 1;
1059 GFS2_MIN_LVB_SIZE, flags, 1080 ls->ls_id = 0;
1060 &sdp->sd_lockstruct, &sdp->sd_kobj);
1061 if (error) {
1062 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
1063 proto, table, sdp->sd_args.ar_hostdata);
1064 goto out;
1065 }
1066 1081
1067 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || 1082 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
1068 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= 1083 substring_t tmp[MAX_OPT_ARGS];
1069 GFS2_MIN_LVB_SIZE)) { 1084 int token, option;
1070 gfs2_unmount_lockproto(&sdp->sd_lockstruct); 1085
1071 goto out; 1086 if (!o || !*o)
1087 continue;
1088
1089 token = match_token(o, *lm->lm_tokens, tmp);
1090 switch (token) {
1091 case Opt_jid:
1092 ret = match_int(&tmp[0], &option);
1093 if (ret || option < 0)
1094 goto hostdata_error;
1095 ls->ls_jid = option;
1096 break;
1097 case Opt_id:
1098 ret = match_int(&tmp[0], &option);
1099 if (ret)
1100 goto hostdata_error;
1101 ls->ls_id = option;
1102 break;
1103 case Opt_first:
1104 ret = match_int(&tmp[0], &option);
1105 if (ret || (option != 0 && option != 1))
1106 goto hostdata_error;
1107 ls->ls_first = option;
1108 break;
1109 case Opt_nodir:
1110 ret = match_int(&tmp[0], &option);
1111 if (ret || (option != 0 && option != 1))
1112 goto hostdata_error;
1113 ls->ls_nodir = option;
1114 break;
1115 case Opt_err:
1116 default:
1117hostdata_error:
1118 fs_info(sdp, "unknown hostdata (%s)\n", o);
1119 return -EINVAL;
1120 }
1072 } 1121 }
1073 1122
1074 if (sdp->sd_args.ar_spectator) 1123 if (sdp->sd_args.ar_spectator)
@@ -1077,22 +1126,25 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1077 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, 1126 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
1078 sdp->sd_lockstruct.ls_jid); 1127 sdp->sd_lockstruct.ls_jid);
1079 1128
1080 fs_info(sdp, "Joined cluster. Now mounting FS...\n"); 1129 fsname = strchr(table, ':');
1081 1130 if (fsname)
1082 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && 1131 fsname++;
1083 !sdp->sd_args.ar_ignore_local_fs) { 1132 if (lm->lm_mount == NULL) {
1084 sdp->sd_args.ar_localflocks = 1; 1133 fs_info(sdp, "Now mounting FS...\n");
1085 sdp->sd_args.ar_localcaching = 1; 1134 return 0;
1086 } 1135 }
1087 1136 ret = lm->lm_mount(sdp, fsname);
1088out: 1137 if (ret == 0)
1089 return error; 1138 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
1139 return ret;
1090} 1140}
1091 1141
1092void gfs2_lm_unmount(struct gfs2_sbd *sdp) 1142void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1093{ 1143{
1094 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1144 const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
1095 gfs2_unmount_lockproto(&sdp->sd_lockstruct); 1145 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
1146 lm->lm_unmount)
1147 lm->lm_unmount(sdp);
1096} 1148}
1097 1149
1098/** 1150/**
@@ -1116,12 +1168,20 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1116 return -ENOMEM; 1168 return -ENOMEM;
1117 } 1169 }
1118 1170
1119 error = gfs2_mount_args(sdp, (char *)data, 0); 1171 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1172 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1173
1174 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1120 if (error) { 1175 if (error) {
1121 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1176 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1122 goto fail; 1177 goto fail;
1123 } 1178 }
1124 1179
1180 if (sdp->sd_args.ar_spectator)
1181 sb->s_flags |= MS_RDONLY;
1182 if (sdp->sd_args.ar_posix_acl)
1183 sb->s_flags |= MS_POSIXACL;
1184
1125 sb->s_magic = GFS2_MAGIC; 1185 sb->s_magic = GFS2_MAGIC;
1126 sb->s_op = &gfs2_super_ops; 1186 sb->s_op = &gfs2_super_ops;
1127 sb->s_export_op = &gfs2_export_ops; 1187 sb->s_export_op = &gfs2_export_ops;
@@ -1199,6 +1259,8 @@ fail_sb:
1199 dput(sdp->sd_root_dir); 1259 dput(sdp->sd_root_dir);
1200 if (sdp->sd_master_dir) 1260 if (sdp->sd_master_dir)
1201 dput(sdp->sd_master_dir); 1261 dput(sdp->sd_master_dir);
1262 if (sb->s_root)
1263 dput(sb->s_root);
1202 sb->s_root = NULL; 1264 sb->s_root = NULL;
1203fail_locking: 1265fail_locking:
1204 init_locking(sdp, &mount_gh, UNDO); 1266 init_locking(sdp, &mount_gh, UNDO);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 49877546beb9..abd5429ae285 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,7 +18,6 @@
18#include <linux/posix_acl.h> 18#include <linux/posix_acl.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/crc32.h> 20#include <linux/crc32.h>
21#include <linux/lm_interface.h>
22#include <linux/fiemap.h> 21#include <linux/fiemap.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24 23
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 320323d03479..458019569dcb 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -19,7 +19,6 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23#include <linux/time.h> 22#include <linux/time.h>
24 23
25#include "gfs2.h" 24#include "gfs2.h"
@@ -27,7 +26,6 @@
27#include "glock.h" 26#include "glock.h"
28#include "inode.h" 27#include "inode.h"
29#include "log.h" 28#include "log.h"
30#include "mount.h"
31#include "quota.h" 29#include "quota.h"
32#include "recovery.h" 30#include "recovery.h"
33#include "rgrp.h" 31#include "rgrp.h"
@@ -40,6 +38,8 @@
40#include "bmap.h" 38#include "bmap.h"
41#include "meta_io.h" 39#include "meta_io.h"
42 40
41#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
42
43/** 43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk 44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode 45 * @inode: The inode
@@ -435,25 +435,45 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) 435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
436{ 436{
437 struct gfs2_sbd *sdp = sb->s_fs_info; 437 struct gfs2_sbd *sdp = sb->s_fs_info;
438 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
438 int error; 439 int error;
439 440
440 error = gfs2_mount_args(sdp, data, 1); 441 error = gfs2_mount_args(sdp, &args, data);
441 if (error) 442 if (error)
442 return error; 443 return error;
443 444
445 /* Not allowed to change locking details */
446 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
447 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
448 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
449 return -EINVAL;
450
451 /* Some flags must not be changed */
452 if (args_neq(&args, &sdp->sd_args, spectator) ||
453 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
454 args_neq(&args, &sdp->sd_args, localflocks) ||
455 args_neq(&args, &sdp->sd_args, localcaching) ||
456 args_neq(&args, &sdp->sd_args, meta))
457 return -EINVAL;
458
444 if (sdp->sd_args.ar_spectator) 459 if (sdp->sd_args.ar_spectator)
445 *flags |= MS_RDONLY; 460 *flags |= MS_RDONLY;
446 else { 461
447 if (*flags & MS_RDONLY) { 462 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
448 if (!(sb->s_flags & MS_RDONLY)) 463 if (*flags & MS_RDONLY)
449 error = gfs2_make_fs_ro(sdp); 464 error = gfs2_make_fs_ro(sdp);
450 } else if (!(*flags & MS_RDONLY) && 465 else
451 (sb->s_flags & MS_RDONLY)) {
452 error = gfs2_make_fs_rw(sdp); 466 error = gfs2_make_fs_rw(sdp);
453 } 467 if (error)
468 return error;
454 } 469 }
455 470
456 return error; 471 sdp->sd_args = args;
472 if (sdp->sd_args.ar_posix_acl)
473 sb->s_flags |= MS_POSIXACL;
474 else
475 sb->s_flags &= ~MS_POSIXACL;
476 return 0;
457} 477}
458 478
459/** 479/**
@@ -588,6 +608,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
588 } 608 }
589 seq_printf(s, ",data=%s", state); 609 seq_printf(s, ",data=%s", state);
590 } 610 }
611 if (args->ar_discard)
612 seq_printf(s, ",discard");
591 613
592 return 0; 614 return 0;
593} 615}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index b08d09696b3e..8d53f66b5bcc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -45,7 +45,6 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/bio.h> 46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h>
49#include <linux/kthread.h> 48#include <linux/kthread.h>
50#include <linux/freezer.h> 49#include <linux/freezer.h>
51 50
@@ -80,6 +79,51 @@ struct gfs2_quota_change_host {
80 u32 qc_id; 79 u32 qc_id;
81}; 80};
82 81
82static LIST_HEAD(qd_lru_list);
83static atomic_t qd_lru_count = ATOMIC_INIT(0);
84static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
85
86int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
87{
88 struct gfs2_quota_data *qd;
89 struct gfs2_sbd *sdp;
90
91 if (nr == 0)
92 goto out;
93
94 if (!(gfp_mask & __GFP_FS))
95 return -1;
96
97 spin_lock(&qd_lru_lock);
98 while (nr && !list_empty(&qd_lru_list)) {
99 qd = list_entry(qd_lru_list.next,
100 struct gfs2_quota_data, qd_reclaim);
101 sdp = qd->qd_gl->gl_sbd;
102
103 /* Free from the filesystem-specific list */
104 list_del(&qd->qd_list);
105
106 gfs2_assert_warn(sdp, !qd->qd_change);
107 gfs2_assert_warn(sdp, !qd->qd_slot_count);
108 gfs2_assert_warn(sdp, !qd->qd_bh_count);
109
110 gfs2_glock_put(qd->qd_gl);
111 atomic_dec(&sdp->sd_quota_count);
112
113 /* Delete it from the common reclaim list */
114 list_del_init(&qd->qd_reclaim);
115 atomic_dec(&qd_lru_count);
116 spin_unlock(&qd_lru_lock);
117 kmem_cache_free(gfs2_quotad_cachep, qd);
118 spin_lock(&qd_lru_lock);
119 nr--;
120 }
121 spin_unlock(&qd_lru_lock);
122
123out:
124 return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
125}
126
83static u64 qd2offset(struct gfs2_quota_data *qd) 127static u64 qd2offset(struct gfs2_quota_data *qd)
84{ 128{
85 u64 offset; 129 u64 offset;
@@ -100,22 +144,18 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
100 if (!qd) 144 if (!qd)
101 return -ENOMEM; 145 return -ENOMEM;
102 146
103 qd->qd_count = 1; 147 atomic_set(&qd->qd_count, 1);
104 qd->qd_id = id; 148 qd->qd_id = id;
105 if (user) 149 if (user)
106 set_bit(QDF_USER, &qd->qd_flags); 150 set_bit(QDF_USER, &qd->qd_flags);
107 qd->qd_slot = -1; 151 qd->qd_slot = -1;
152 INIT_LIST_HEAD(&qd->qd_reclaim);
108 153
109 error = gfs2_glock_get(sdp, 2 * (u64)id + !user, 154 error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
110 &gfs2_quota_glops, CREATE, &qd->qd_gl); 155 &gfs2_quota_glops, CREATE, &qd->qd_gl);
111 if (error) 156 if (error)
112 goto fail; 157 goto fail;
113 158
114 error = gfs2_lvb_hold(qd->qd_gl);
115 gfs2_glock_put(qd->qd_gl);
116 if (error)
117 goto fail;
118
119 *qdp = qd; 159 *qdp = qd;
120 160
121 return 0; 161 return 0;
@@ -135,11 +175,17 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
135 175
136 for (;;) { 176 for (;;) {
137 found = 0; 177 found = 0;
138 spin_lock(&sdp->sd_quota_spin); 178 spin_lock(&qd_lru_lock);
139 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 179 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
140 if (qd->qd_id == id && 180 if (qd->qd_id == id &&
141 !test_bit(QDF_USER, &qd->qd_flags) == !user) { 181 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
142 qd->qd_count++; 182 if (!atomic_read(&qd->qd_count) &&
183 !list_empty(&qd->qd_reclaim)) {
184 /* Remove it from reclaim list */
185 list_del_init(&qd->qd_reclaim);
186 atomic_dec(&qd_lru_count);
187 }
188 atomic_inc(&qd->qd_count);
143 found = 1; 189 found = 1;
144 break; 190 break;
145 } 191 }
@@ -155,11 +201,11 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
155 new_qd = NULL; 201 new_qd = NULL;
156 } 202 }
157 203
158 spin_unlock(&sdp->sd_quota_spin); 204 spin_unlock(&qd_lru_lock);
159 205
160 if (qd || !create) { 206 if (qd || !create) {
161 if (new_qd) { 207 if (new_qd) {
162 gfs2_lvb_unhold(new_qd->qd_gl); 208 gfs2_glock_put(new_qd->qd_gl);
163 kmem_cache_free(gfs2_quotad_cachep, new_qd); 209 kmem_cache_free(gfs2_quotad_cachep, new_qd);
164 } 210 }
165 *qdp = qd; 211 *qdp = qd;
@@ -175,21 +221,18 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
175static void qd_hold(struct gfs2_quota_data *qd) 221static void qd_hold(struct gfs2_quota_data *qd)
176{ 222{
177 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 223 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
178 224 gfs2_assert(sdp, atomic_read(&qd->qd_count));
179 spin_lock(&sdp->sd_quota_spin); 225 atomic_inc(&qd->qd_count);
180 gfs2_assert(sdp, qd->qd_count);
181 qd->qd_count++;
182 spin_unlock(&sdp->sd_quota_spin);
183} 226}
184 227
185static void qd_put(struct gfs2_quota_data *qd) 228static void qd_put(struct gfs2_quota_data *qd)
186{ 229{
187 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 230 if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
188 spin_lock(&sdp->sd_quota_spin); 231 /* Add to the reclaim list */
189 gfs2_assert(sdp, qd->qd_count); 232 list_add_tail(&qd->qd_reclaim, &qd_lru_list);
190 if (!--qd->qd_count) 233 atomic_inc(&qd_lru_count);
191 qd->qd_last_touched = jiffies; 234 spin_unlock(&qd_lru_lock);
192 spin_unlock(&sdp->sd_quota_spin); 235 }
193} 236}
194 237
195static int slot_get(struct gfs2_quota_data *qd) 238static int slot_get(struct gfs2_quota_data *qd)
@@ -198,10 +241,10 @@ static int slot_get(struct gfs2_quota_data *qd)
198 unsigned int c, o = 0, b; 241 unsigned int c, o = 0, b;
199 unsigned char byte = 0; 242 unsigned char byte = 0;
200 243
201 spin_lock(&sdp->sd_quota_spin); 244 spin_lock(&qd_lru_lock);
202 245
203 if (qd->qd_slot_count++) { 246 if (qd->qd_slot_count++) {
204 spin_unlock(&sdp->sd_quota_spin); 247 spin_unlock(&qd_lru_lock);
205 return 0; 248 return 0;
206 } 249 }
207 250
@@ -225,13 +268,13 @@ found:
225 268
226 sdp->sd_quota_bitmap[c][o] |= 1 << b; 269 sdp->sd_quota_bitmap[c][o] |= 1 << b;
227 270
228 spin_unlock(&sdp->sd_quota_spin); 271 spin_unlock(&qd_lru_lock);
229 272
230 return 0; 273 return 0;
231 274
232fail: 275fail:
233 qd->qd_slot_count--; 276 qd->qd_slot_count--;
234 spin_unlock(&sdp->sd_quota_spin); 277 spin_unlock(&qd_lru_lock);
235 return -ENOSPC; 278 return -ENOSPC;
236} 279}
237 280
@@ -239,23 +282,23 @@ static void slot_hold(struct gfs2_quota_data *qd)
239{ 282{
240 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 283 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
241 284
242 spin_lock(&sdp->sd_quota_spin); 285 spin_lock(&qd_lru_lock);
243 gfs2_assert(sdp, qd->qd_slot_count); 286 gfs2_assert(sdp, qd->qd_slot_count);
244 qd->qd_slot_count++; 287 qd->qd_slot_count++;
245 spin_unlock(&sdp->sd_quota_spin); 288 spin_unlock(&qd_lru_lock);
246} 289}
247 290
248static void slot_put(struct gfs2_quota_data *qd) 291static void slot_put(struct gfs2_quota_data *qd)
249{ 292{
250 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 293 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
251 294
252 spin_lock(&sdp->sd_quota_spin); 295 spin_lock(&qd_lru_lock);
253 gfs2_assert(sdp, qd->qd_slot_count); 296 gfs2_assert(sdp, qd->qd_slot_count);
254 if (!--qd->qd_slot_count) { 297 if (!--qd->qd_slot_count) {
255 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 298 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
256 qd->qd_slot = -1; 299 qd->qd_slot = -1;
257 } 300 }
258 spin_unlock(&sdp->sd_quota_spin); 301 spin_unlock(&qd_lru_lock);
259} 302}
260 303
261static int bh_get(struct gfs2_quota_data *qd) 304static int bh_get(struct gfs2_quota_data *qd)
@@ -330,7 +373,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
330 if (sdp->sd_vfs->s_flags & MS_RDONLY) 373 if (sdp->sd_vfs->s_flags & MS_RDONLY)
331 return 0; 374 return 0;
332 375
333 spin_lock(&sdp->sd_quota_spin); 376 spin_lock(&qd_lru_lock);
334 377
335 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 378 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
336 if (test_bit(QDF_LOCKED, &qd->qd_flags) || 379 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
@@ -341,8 +384,8 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
341 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 384 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
342 385
343 set_bit(QDF_LOCKED, &qd->qd_flags); 386 set_bit(QDF_LOCKED, &qd->qd_flags);
344 gfs2_assert_warn(sdp, qd->qd_count); 387 gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
345 qd->qd_count++; 388 atomic_inc(&qd->qd_count);
346 qd->qd_change_sync = qd->qd_change; 389 qd->qd_change_sync = qd->qd_change;
347 gfs2_assert_warn(sdp, qd->qd_slot_count); 390 gfs2_assert_warn(sdp, qd->qd_slot_count);
348 qd->qd_slot_count++; 391 qd->qd_slot_count++;
@@ -354,7 +397,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
354 if (!found) 397 if (!found)
355 qd = NULL; 398 qd = NULL;
356 399
357 spin_unlock(&sdp->sd_quota_spin); 400 spin_unlock(&qd_lru_lock);
358 401
359 if (qd) { 402 if (qd) {
360 gfs2_assert_warn(sdp, qd->qd_change_sync); 403 gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -379,24 +422,24 @@ static int qd_trylock(struct gfs2_quota_data *qd)
379 if (sdp->sd_vfs->s_flags & MS_RDONLY) 422 if (sdp->sd_vfs->s_flags & MS_RDONLY)
380 return 0; 423 return 0;
381 424
382 spin_lock(&sdp->sd_quota_spin); 425 spin_lock(&qd_lru_lock);
383 426
384 if (test_bit(QDF_LOCKED, &qd->qd_flags) || 427 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
385 !test_bit(QDF_CHANGE, &qd->qd_flags)) { 428 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
386 spin_unlock(&sdp->sd_quota_spin); 429 spin_unlock(&qd_lru_lock);
387 return 0; 430 return 0;
388 } 431 }
389 432
390 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 433 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
391 434
392 set_bit(QDF_LOCKED, &qd->qd_flags); 435 set_bit(QDF_LOCKED, &qd->qd_flags);
393 gfs2_assert_warn(sdp, qd->qd_count); 436 gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
394 qd->qd_count++; 437 atomic_inc(&qd->qd_count);
395 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
396 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 gfs2_assert_warn(sdp, qd->qd_slot_count);
397 qd->qd_slot_count++; 440 qd->qd_slot_count++;
398 441
399 spin_unlock(&sdp->sd_quota_spin); 442 spin_unlock(&qd_lru_lock);
400 443
401 gfs2_assert_warn(sdp, qd->qd_change_sync); 444 gfs2_assert_warn(sdp, qd->qd_change_sync);
402 if (bh_get(qd)) { 445 if (bh_get(qd)) {
@@ -556,9 +599,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
556 x = be64_to_cpu(qc->qc_change) + change; 599 x = be64_to_cpu(qc->qc_change) + change;
557 qc->qc_change = cpu_to_be64(x); 600 qc->qc_change = cpu_to_be64(x);
558 601
559 spin_lock(&sdp->sd_quota_spin); 602 spin_lock(&qd_lru_lock);
560 qd->qd_change = x; 603 qd->qd_change = x;
561 spin_unlock(&sdp->sd_quota_spin); 604 spin_unlock(&qd_lru_lock);
562 605
563 if (!x) { 606 if (!x) {
564 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); 607 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -802,8 +845,8 @@ restart:
802 loff_t pos; 845 loff_t pos;
803 gfs2_glock_dq_uninit(q_gh); 846 gfs2_glock_dq_uninit(q_gh);
804 error = gfs2_glock_nq_init(qd->qd_gl, 847 error = gfs2_glock_nq_init(qd->qd_gl,
805 LM_ST_EXCLUSIVE, GL_NOCACHE, 848 LM_ST_EXCLUSIVE, GL_NOCACHE,
806 q_gh); 849 q_gh);
807 if (error) 850 if (error)
808 return error; 851 return error;
809 852
@@ -820,7 +863,6 @@ restart:
820 863
821 gfs2_glock_dq_uninit(&i_gh); 864 gfs2_glock_dq_uninit(&i_gh);
822 865
823
824 gfs2_quota_in(&q, buf); 866 gfs2_quota_in(&q, buf);
825 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 867 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
826 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); 868 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
@@ -890,9 +932,9 @@ static int need_sync(struct gfs2_quota_data *qd)
890 if (!qd->qd_qb.qb_limit) 932 if (!qd->qd_qb.qb_limit)
891 return 0; 933 return 0;
892 934
893 spin_lock(&sdp->sd_quota_spin); 935 spin_lock(&qd_lru_lock);
894 value = qd->qd_change; 936 value = qd->qd_change;
895 spin_unlock(&sdp->sd_quota_spin); 937 spin_unlock(&qd_lru_lock);
896 938
897 spin_lock(&gt->gt_spin); 939 spin_lock(&gt->gt_spin);
898 num = gt->gt_quota_scale_num; 940 num = gt->gt_quota_scale_num;
@@ -985,9 +1027,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
985 continue; 1027 continue;
986 1028
987 value = (s64)be64_to_cpu(qd->qd_qb.qb_value); 1029 value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
988 spin_lock(&sdp->sd_quota_spin); 1030 spin_lock(&qd_lru_lock);
989 value += qd->qd_change; 1031 value += qd->qd_change;
990 spin_unlock(&sdp->sd_quota_spin); 1032 spin_unlock(&qd_lru_lock);
991 1033
992 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { 1034 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
993 print_message(qd, "exceeded"); 1035 print_message(qd, "exceeded");
@@ -1171,13 +1213,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1171 qd->qd_change = qc.qc_change; 1213 qd->qd_change = qc.qc_change;
1172 qd->qd_slot = slot; 1214 qd->qd_slot = slot;
1173 qd->qd_slot_count = 1; 1215 qd->qd_slot_count = 1;
1174 qd->qd_last_touched = jiffies;
1175 1216
1176 spin_lock(&sdp->sd_quota_spin); 1217 spin_lock(&qd_lru_lock);
1177 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1218 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1178 list_add(&qd->qd_list, &sdp->sd_quota_list); 1219 list_add(&qd->qd_list, &sdp->sd_quota_list);
1179 atomic_inc(&sdp->sd_quota_count); 1220 atomic_inc(&sdp->sd_quota_count);
1180 spin_unlock(&sdp->sd_quota_spin); 1221 spin_unlock(&qd_lru_lock);
1181 1222
1182 found++; 1223 found++;
1183 } 1224 }
@@ -1197,73 +1238,48 @@ fail:
1197 return error; 1238 return error;
1198} 1239}
1199 1240
1200static void gfs2_quota_scan(struct gfs2_sbd *sdp)
1201{
1202 struct gfs2_quota_data *qd, *safe;
1203 LIST_HEAD(dead);
1204
1205 spin_lock(&sdp->sd_quota_spin);
1206 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1207 if (!qd->qd_count &&
1208 time_after_eq(jiffies, qd->qd_last_touched +
1209 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1210 list_move(&qd->qd_list, &dead);
1211 gfs2_assert_warn(sdp,
1212 atomic_read(&sdp->sd_quota_count) > 0);
1213 atomic_dec(&sdp->sd_quota_count);
1214 }
1215 }
1216 spin_unlock(&sdp->sd_quota_spin);
1217
1218 while (!list_empty(&dead)) {
1219 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1220 list_del(&qd->qd_list);
1221
1222 gfs2_assert_warn(sdp, !qd->qd_change);
1223 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1224 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1225
1226 gfs2_lvb_unhold(qd->qd_gl);
1227 kmem_cache_free(gfs2_quotad_cachep, qd);
1228 }
1229}
1230
1231void gfs2_quota_cleanup(struct gfs2_sbd *sdp) 1241void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1232{ 1242{
1233 struct list_head *head = &sdp->sd_quota_list; 1243 struct list_head *head = &sdp->sd_quota_list;
1234 struct gfs2_quota_data *qd; 1244 struct gfs2_quota_data *qd;
1235 unsigned int x; 1245 unsigned int x;
1236 1246
1237 spin_lock(&sdp->sd_quota_spin); 1247 spin_lock(&qd_lru_lock);
1238 while (!list_empty(head)) { 1248 while (!list_empty(head)) {
1239 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1249 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1240 1250
1241 if (qd->qd_count > 1 || 1251 if (atomic_read(&qd->qd_count) > 1 ||
1242 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { 1252 (atomic_read(&qd->qd_count) &&
1253 !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1243 list_move(&qd->qd_list, head); 1254 list_move(&qd->qd_list, head);
1244 spin_unlock(&sdp->sd_quota_spin); 1255 spin_unlock(&qd_lru_lock);
1245 schedule(); 1256 schedule();
1246 spin_lock(&sdp->sd_quota_spin); 1257 spin_lock(&qd_lru_lock);
1247 continue; 1258 continue;
1248 } 1259 }
1249 1260
1250 list_del(&qd->qd_list); 1261 list_del(&qd->qd_list);
1262 /* Also remove if this qd exists in the reclaim list */
1263 if (!list_empty(&qd->qd_reclaim)) {
1264 list_del_init(&qd->qd_reclaim);
1265 atomic_dec(&qd_lru_count);
1266 }
1251 atomic_dec(&sdp->sd_quota_count); 1267 atomic_dec(&sdp->sd_quota_count);
1252 spin_unlock(&sdp->sd_quota_spin); 1268 spin_unlock(&qd_lru_lock);
1253 1269
1254 if (!qd->qd_count) { 1270 if (!atomic_read(&qd->qd_count)) {
1255 gfs2_assert_warn(sdp, !qd->qd_change); 1271 gfs2_assert_warn(sdp, !qd->qd_change);
1256 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1272 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1257 } else 1273 } else
1258 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1274 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1259 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1275 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1260 1276
1261 gfs2_lvb_unhold(qd->qd_gl); 1277 gfs2_glock_put(qd->qd_gl);
1262 kmem_cache_free(gfs2_quotad_cachep, qd); 1278 kmem_cache_free(gfs2_quotad_cachep, qd);
1263 1279
1264 spin_lock(&sdp->sd_quota_spin); 1280 spin_lock(&qd_lru_lock);
1265 } 1281 }
1266 spin_unlock(&sdp->sd_quota_spin); 1282 spin_unlock(&qd_lru_lock);
1267 1283
1268 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1284 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1269 1285
@@ -1341,9 +1357,6 @@ int gfs2_quotad(void *data)
1341 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, 1357 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1342 &quotad_timeo, &tune->gt_quota_quantum); 1358 &quotad_timeo, &tune->gt_quota_quantum);
1343 1359
1344 /* FIXME: This should be turned into a shrinker */
1345 gfs2_quota_scan(sdp);
1346
1347 /* Check for & recover partially truncated inodes */ 1360 /* Check for & recover partially truncated inodes */
1348 quotad_check_trunc_list(sdp); 1361 quotad_check_trunc_list(sdp);
1349 1362
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index cec9032be97d..0fa5fa63d0e8 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -49,4 +49,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
49 return ret; 49 return ret;
50} 50}
51 51
52extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
53
52#endif /* __QUOTA_DOT_H__ */ 54#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index efd09c3d2b26..247e8f7d6b3d 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h>
17#include <linux/kthread.h> 16#include <linux/kthread.h>
18#include <linux/freezer.h> 17#include <linux/freezer.h>
19 18
@@ -427,20 +426,23 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
427} 426}
428 427
429 428
430static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, 429static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
431 unsigned int message) 430 unsigned int message)
432{ 431{
433 if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) 432 char env_jid[20];
434 return; 433 char env_status[20];
435 434 char *envp[] = { env_jid, env_status, NULL };
436 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 435 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
437 sdp->sd_lockstruct.ls_ops->lm_recovery_done( 436 ls->ls_recover_jid_done = jid;
438 sdp->sd_lockstruct.ls_lockspace, jid, message); 437 ls->ls_recover_jid_status = message;
438 sprintf(env_jid, "JID=%d", jid);
439 sprintf(env_status, "RECOVERY=%s",
440 message == LM_RD_SUCCESS ? "Done" : "Failed");
441 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
439} 442}
440 443
441
442/** 444/**
443 * gfs2_recover_journal - recovery a given journal 445 * gfs2_recover_journal - recover a given journal
444 * @jd: the struct gfs2_jdesc describing the journal 446 * @jd: the struct gfs2_jdesc describing the journal
445 * 447 *
446 * Acquire the journal's lock, check to see if the journal is clean, and 448 * Acquire the journal's lock, check to see if the journal is clean, and
@@ -561,7 +563,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
561 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 563 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
562 gfs2_glock_dq_uninit(&ji_gh); 564 gfs2_glock_dq_uninit(&ji_gh);
563 565
564 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 566 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
565 567
566 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 568 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
567 gfs2_glock_dq_uninit(&j_gh); 569 gfs2_glock_dq_uninit(&j_gh);
@@ -581,7 +583,7 @@ fail_gunlock_j:
581 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); 583 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
582 584
583fail: 585fail:
584 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 586 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
585 return error; 587 return error;
586} 588}
587 589
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b01c635d925..f03d024038ea 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -13,8 +13,8 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17#include <linux/prefetch.h> 16#include <linux/prefetch.h>
17#include <linux/blkdev.h>
18 18
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
@@ -132,81 +132,90 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
132} 132}
133 133
134/** 134/**
135 * gfs2_bit_search
136 * @ptr: Pointer to bitmap data
137 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
138 * @state: The state we are searching for
139 *
140 * We xor the bitmap data with a patter which is the bitwise opposite
141 * of what we are looking for, this gives rise to a pattern of ones
142 * wherever there is a match. Since we have two bits per entry, we
143 * take this pattern, shift it down by one place and then and it with
144 * the original. All the even bit positions (0,2,4, etc) then represent
145 * successful matches, so we mask with 0x55555..... to remove the unwanted
146 * odd bit positions.
147 *
148 * This allows searching of a whole u64 at once (32 blocks) with a
149 * single test (on 64 bit arches).
150 */
151
152static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
153{
154 u64 tmp;
155 static const u64 search[] = {
156 [0] = 0xffffffffffffffffULL,
157 [1] = 0xaaaaaaaaaaaaaaaaULL,
158 [2] = 0x5555555555555555ULL,
159 [3] = 0x0000000000000000ULL,
160 };
161 tmp = le64_to_cpu(*ptr) ^ search[state];
162 tmp &= (tmp >> 1);
163 tmp &= mask;
164 return tmp;
165}
166
167/**
135 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing 168 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
136 * a block in a given allocation state. 169 * a block in a given allocation state.
137 * @buffer: the buffer that holds the bitmaps 170 * @buffer: the buffer that holds the bitmaps
138 * @buflen: the length (in bytes) of the buffer 171 * @len: the length (in bytes) of the buffer
139 * @goal: start search at this block's bit-pair (within @buffer) 172 * @goal: start search at this block's bit-pair (within @buffer)
140 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for. 173 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
141 * 174 *
142 * Scope of @goal and returned block number is only within this bitmap buffer, 175 * Scope of @goal and returned block number is only within this bitmap buffer,
143 * not entire rgrp or filesystem. @buffer will be offset from the actual 176 * not entire rgrp or filesystem. @buffer will be offset from the actual
144 * beginning of a bitmap block buffer, skipping any header structures. 177 * beginning of a bitmap block buffer, skipping any header structures, but
178 * headers are always a multiple of 64 bits long so that the buffer is
179 * always aligned to a 64 bit boundary.
180 *
181 * The size of the buffer is in bytes, but is it assumed that it is
182 * always ok to to read a complete multiple of 64 bits at the end
183 * of the block in case the end is no aligned to a natural boundary.
145 * 184 *
146 * Return: the block number (bitmap buffer scope) that was found 185 * Return: the block number (bitmap buffer scope) that was found
147 */ 186 */
148 187
149static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal, 188static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
150 u8 old_state) 189 u32 goal, u8 state)
151{ 190{
152 const u8 *byte, *start, *end; 191 u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
153 int bit, startbit; 192 const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
154 u32 g1, g2, misaligned; 193 const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
155 unsigned long *plong; 194 u64 tmp;
156 unsigned long lskipval; 195 u64 mask = 0x5555555555555555ULL;
157 196 u32 bit;
158 lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55; 197
159 g1 = (goal / GFS2_NBBY); 198 BUG_ON(state > 3);
160 start = buffer + g1; 199
161 byte = start; 200 /* Mask off bits we don't care about at the start of the search */
162 end = buffer + buflen; 201 mask <<= spoint;
163 g2 = ALIGN(g1, sizeof(unsigned long)); 202 tmp = gfs2_bit_search(ptr, mask, state);
164 plong = (unsigned long *)(buffer + g2); 203 ptr++;
165 startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 204 while(tmp == 0 && ptr < end) {
166 misaligned = g2 - g1; 205 tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
167 if (!misaligned) 206 ptr++;
168 goto ulong_aligned;
169/* parse the bitmap a byte at a time */
170misaligned:
171 while (byte < end) {
172 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
173 return goal +
174 (((byte - start) * GFS2_NBBY) +
175 ((bit - startbit) >> 1));
176 }
177 bit += GFS2_BIT_SIZE;
178 if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
179 bit = 0;
180 byte++;
181 misaligned--;
182 if (!misaligned) {
183 plong = (unsigned long *)byte;
184 goto ulong_aligned;
185 }
186 }
187 }
188 return BFITNOENT;
189
190/* parse the bitmap a unsigned long at a time */
191ulong_aligned:
192 /* Stop at "end - 1" or else prefetch can go past the end and segfault.
193 We could "if" it but we'd lose some of the performance gained.
194 This way will only slow down searching the very last 4/8 bytes
195 depending on architecture. I've experimented with several ways
196 of writing this section such as using an else before the goto
197 but this one seems to be the fastest. */
198 while ((unsigned char *)plong < end - sizeof(unsigned long)) {
199 prefetch(plong + 1);
200 if (((*plong) & LBITMASK) != lskipval)
201 break;
202 plong++;
203 }
204 if ((unsigned char *)plong < end) {
205 byte = (const u8 *)plong;
206 misaligned += sizeof(unsigned long) - 1;
207 goto misaligned;
208 } 207 }
209 return BFITNOENT; 208 /* Mask off any bits which are more than len bytes from the start */
209 if (ptr == end && (len & (sizeof(u64) - 1)))
210 tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
211 /* Didn't find anything, so return */
212 if (tmp == 0)
213 return BFITNOENT;
214 ptr--;
215 bit = fls64(tmp);
216 bit--; /* fls64 always adds one to the bit count */
217 bit /= 2; /* two bits per entry in the bitmap */
218 return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
210} 219}
211 220
212/** 221/**
@@ -831,6 +840,58 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
831 spin_unlock(&sdp->sd_rindex_spin); 840 spin_unlock(&sdp->sd_rindex_spin);
832} 841}
833 842
843static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
844 const struct gfs2_bitmap *bi)
845{
846 struct super_block *sb = sdp->sd_vfs;
847 struct block_device *bdev = sb->s_bdev;
848 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
849 bdev_hardsect_size(sb->s_bdev);
850 u64 blk;
851 sector_t start = 0;
852 sector_t nr_sects = 0;
853 int rv;
854 unsigned int x;
855
856 for (x = 0; x < bi->bi_len; x++) {
857 const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
858 const u8 *clone = bi->bi_clone + bi->bi_offset + x;
859 u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
860 diff &= 0x55;
861 if (diff == 0)
862 continue;
863 blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
864 blk *= sects_per_blk; /* convert to sectors */
865 while(diff) {
866 if (diff & 1) {
867 if (nr_sects == 0)
868 goto start_new_extent;
869 if ((start + nr_sects) != blk) {
870 rv = blkdev_issue_discard(bdev, start,
871 nr_sects, GFP_NOFS);
872 if (rv)
873 goto fail;
874 nr_sects = 0;
875start_new_extent:
876 start = blk;
877 }
878 nr_sects += sects_per_blk;
879 }
880 diff >>= 2;
881 blk += sects_per_blk;
882 }
883 }
884 if (nr_sects) {
885 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
886 if (rv)
887 goto fail;
888 }
889 return;
890fail:
891 fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
892 sdp->sd_args.ar_discard = 0;
893}
894
834void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) 895void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
835{ 896{
836 struct gfs2_sbd *sdp = rgd->rd_sbd; 897 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -841,6 +902,8 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
841 struct gfs2_bitmap *bi = rgd->rd_bits + x; 902 struct gfs2_bitmap *bi = rgd->rd_bits + x;
842 if (!bi->bi_clone) 903 if (!bi->bi_clone)
843 continue; 904 continue;
905 if (sdp->sd_args.ar_discard)
906 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
844 memcpy(bi->bi_clone + bi->bi_offset, 907 memcpy(bi->bi_clone + bi->bi_offset,
845 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 908 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
846 } 909 }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 141b781f2fcc..601913e0a482 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -15,7 +15,6 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/lm_interface.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
@@ -339,7 +338,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
339 struct gfs2_holder *t_gh) 338 struct gfs2_holder *t_gh)
340{ 339{
341 struct gfs2_inode *ip; 340 struct gfs2_inode *ip;
342 struct gfs2_holder ji_gh;
343 struct gfs2_jdesc *jd; 341 struct gfs2_jdesc *jd;
344 struct lfcc *lfcc; 342 struct lfcc *lfcc;
345 LIST_HEAD(list); 343 LIST_HEAD(list);
@@ -387,7 +385,6 @@ out:
387 gfs2_glock_dq_uninit(&lfcc->gh); 385 gfs2_glock_dq_uninit(&lfcc->gh);
388 kfree(lfcc); 386 kfree(lfcc);
389 } 387 }
390 gfs2_glock_dq_uninit(&ji_gh);
391 return error; 388 return error;
392} 389}
393 390
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index f6b8b00ad881..91abdbedcc86 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -14,7 +14,7 @@
14#include <linux/dcache.h> 14#include <linux/dcache.h>
15#include "incore.h" 15#include "incore.h"
16 16
17void gfs2_lm_unmount(struct gfs2_sbd *sdp); 17extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
18 18
19static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 19static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
20{ 20{
@@ -27,21 +27,23 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
27 27
28void gfs2_jindex_free(struct gfs2_sbd *sdp); 28void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
31int gfs2_jdesc_check(struct gfs2_jdesc *jd);
32 31
33int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, 32extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
34 struct gfs2_inode **ipp); 33extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
35 34
36int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 35extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp);
37 37
38int gfs2_statfs_init(struct gfs2_sbd *sdp); 38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39void gfs2_statfs_change(struct gfs2_sbd *sdp,
40 s64 total, s64 free, s64 dinodes);
41int gfs2_statfs_sync(struct gfs2_sbd *sdp);
42 39
43int gfs2_freeze_fs(struct gfs2_sbd *sdp); 40extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
44void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
42 s64 dinodes);
43extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
44
45extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
46extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
45 47
46extern struct file_system_type gfs2_fs_type; 48extern struct file_system_type gfs2_fs_type;
47extern struct file_system_type gfs2meta_fs_type; 49extern struct file_system_type gfs2meta_fs_type;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 26c1fa777a95..7655f5025fec 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -14,9 +14,8 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <linux/gfs2_ondisk.h>
20 19
21#include "gfs2.h" 20#include "gfs2.h"
22#include "incore.h" 21#include "incore.h"
@@ -25,6 +24,7 @@
25#include "glock.h" 24#include "glock.h"
26#include "quota.h" 25#include "quota.h"
27#include "util.h" 26#include "util.h"
27#include "glops.h"
28 28
29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
30{ 30{
@@ -37,6 +37,30 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
37 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); 37 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
38} 38}
39 39
40static int gfs2_uuid_valid(const u8 *uuid)
41{
42 int i;
43
44 for (i = 0; i < 16; i++) {
45 if (uuid[i])
46 return 1;
47 }
48 return 0;
49}
50
51static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
52{
53 const u8 *uuid = sdp->sd_sb.sb_uuid;
54 buf[0] = '\0';
55 if (!gfs2_uuid_valid(uuid))
56 return 0;
57 return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-"
58 "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
59 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
60 uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
61 uuid[12], uuid[13], uuid[14], uuid[15]);
62}
63
40static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) 64static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
41{ 65{
42 unsigned int count; 66 unsigned int count;
@@ -148,6 +172,46 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
148 return len; 172 return len;
149} 173}
150 174
175static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
176{
177 struct gfs2_glock *gl;
178 const struct gfs2_glock_operations *glops;
179 unsigned int glmode;
180 unsigned int gltype;
181 unsigned long long glnum;
182 char mode[16];
183 int rv;
184
185 if (!capable(CAP_SYS_ADMIN))
186 return -EACCES;
187
188 rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
189 mode);
190 if (rv != 3)
191 return -EINVAL;
192
193 if (strcmp(mode, "EX") == 0)
194 glmode = LM_ST_UNLOCKED;
195 else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0))
196 glmode = LM_ST_DEFERRED;
197 else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0))
198 glmode = LM_ST_SHARED;
199 else
200 return -EINVAL;
201
202 if (gltype > LM_TYPE_JOURNAL)
203 return -EINVAL;
204 glops = gfs2_glops_list[gltype];
205 if (glops == NULL)
206 return -EINVAL;
207 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
208 if (rv)
209 return rv;
210 gfs2_glock_cb(gl, glmode);
211 gfs2_glock_put(gl);
212 return len;
213}
214
151struct gfs2_attr { 215struct gfs2_attr {
152 struct attribute attr; 216 struct attribute attr;
153 ssize_t (*show)(struct gfs2_sbd *, char *); 217 ssize_t (*show)(struct gfs2_sbd *, char *);
@@ -159,22 +223,26 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
159 223
160GFS2_ATTR(id, 0444, id_show, NULL); 224GFS2_ATTR(id, 0444, id_show, NULL);
161GFS2_ATTR(fsname, 0444, fsname_show, NULL); 225GFS2_ATTR(fsname, 0444, fsname_show, NULL);
226GFS2_ATTR(uuid, 0444, uuid_show, NULL);
162GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); 227GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
163GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 228GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
164GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); 229GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
165GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); 230GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
166GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); 231GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
167GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); 232GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
233GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store);
168 234
169static struct attribute *gfs2_attrs[] = { 235static struct attribute *gfs2_attrs[] = {
170 &gfs2_attr_id.attr, 236 &gfs2_attr_id.attr,
171 &gfs2_attr_fsname.attr, 237 &gfs2_attr_fsname.attr,
238 &gfs2_attr_uuid.attr,
172 &gfs2_attr_freeze.attr, 239 &gfs2_attr_freeze.attr,
173 &gfs2_attr_withdraw.attr, 240 &gfs2_attr_withdraw.attr,
174 &gfs2_attr_statfs_sync.attr, 241 &gfs2_attr_statfs_sync.attr,
175 &gfs2_attr_quota_sync.attr, 242 &gfs2_attr_quota_sync.attr,
176 &gfs2_attr_quota_refresh_user.attr, 243 &gfs2_attr_quota_refresh_user.attr,
177 &gfs2_attr_quota_refresh_group.attr, 244 &gfs2_attr_quota_refresh_group.attr,
245 &gfs2_attr_demote_rq.attr,
178 NULL, 246 NULL,
179}; 247};
180 248
@@ -224,14 +292,145 @@ static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
224 292
225LOCKSTRUCT_ATTR(jid, "%u\n"); 293LOCKSTRUCT_ATTR(jid, "%u\n");
226LOCKSTRUCT_ATTR(first, "%u\n"); 294LOCKSTRUCT_ATTR(first, "%u\n");
227LOCKSTRUCT_ATTR(lvb_size, "%u\n");
228LOCKSTRUCT_ATTR(flags, "%d\n");
229 295
230static struct attribute *lockstruct_attrs[] = { 296static struct attribute *lockstruct_attrs[] = {
231 &lockstruct_attr_jid.attr, 297 &lockstruct_attr_jid.attr,
232 &lockstruct_attr_first.attr, 298 &lockstruct_attr_first.attr,
233 &lockstruct_attr_lvb_size.attr, 299 NULL,
234 &lockstruct_attr_flags.attr, 300};
301
302/*
303 * lock_module. Originally from lock_dlm
304 */
305
306static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf)
307{
308 const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops;
309 return sprintf(buf, "%s\n", ops->lm_proto_name);
310}
311
312static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
313{
314 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
315 ssize_t ret;
316 int val = 0;
317
318 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
319 val = 1;
320 ret = sprintf(buf, "%d\n", val);
321 return ret;
322}
323
324static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
325{
326 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
327 ssize_t ret = len;
328 int val;
329
330 val = simple_strtol(buf, NULL, 0);
331
332 if (val == 1)
333 set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
334 else if (val == 0) {
335 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
336 smp_mb__after_clear_bit();
337 gfs2_glock_thaw(sdp);
338 } else {
339 ret = -EINVAL;
340 }
341 return ret;
342}
343
344static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
345{
346 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
347 return sprintf(buf, "%u\n", ls->ls_id);
348}
349
350static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
351{
352 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
353 return sprintf(buf, "%d\n", ls->ls_first);
354}
355
356static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
357{
358 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
359 return sprintf(buf, "%d\n", ls->ls_first_done);
360}
361
362static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
363{
364 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
365 return sprintf(buf, "%d\n", ls->ls_recover_jid);
366}
367
368static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
369{
370 struct gfs2_jdesc *jd;
371
372 spin_lock(&sdp->sd_jindex_spin);
373 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
374 if (jd->jd_jid != jid)
375 continue;
376 jd->jd_dirty = 1;
377 break;
378 }
379 spin_unlock(&sdp->sd_jindex_spin);
380}
381
382static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
383{
384 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
385 ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
386 gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
387 if (sdp->sd_recoverd_process)
388 wake_up_process(sdp->sd_recoverd_process);
389 return len;
390}
391
392static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
393{
394 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
395 return sprintf(buf, "%d\n", ls->ls_recover_jid_done);
396}
397
398static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
399{
400 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
401 return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
402}
403
404struct gdlm_attr {
405 struct attribute attr;
406 ssize_t (*show)(struct gfs2_sbd *sdp, char *);
407 ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
408};
409
410#define GDLM_ATTR(_name,_mode,_show,_store) \
411static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
412
413GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
414GDLM_ATTR(block, 0644, block_show, block_store);
415GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
416GDLM_ATTR(id, 0444, lkid_show, NULL);
417GDLM_ATTR(first, 0444, lkfirst_show, NULL);
418GDLM_ATTR(first_done, 0444, first_done_show, NULL);
419GDLM_ATTR(recover, 0644, recover_show, recover_store);
420GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
421GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
422
423static struct attribute *lock_module_attrs[] = {
424 &gdlm_attr_proto_name.attr,
425 &gdlm_attr_block.attr,
426 &gdlm_attr_withdraw.attr,
427 &gdlm_attr_id.attr,
428 &lockstruct_attr_jid.attr,
429 &gdlm_attr_first.attr,
430 &gdlm_attr_first_done.attr,
431 &gdlm_attr_recover.attr,
432 &gdlm_attr_recover_done.attr,
433 &gdlm_attr_recover_status.attr,
235 NULL, 434 NULL,
236}; 435};
237 436
@@ -373,7 +572,6 @@ TUNE_ATTR(complain_secs, 0);
373TUNE_ATTR(statfs_slow, 0); 572TUNE_ATTR(statfs_slow, 0);
374TUNE_ATTR(new_files_jdata, 0); 573TUNE_ATTR(new_files_jdata, 0);
375TUNE_ATTR(quota_simul_sync, 1); 574TUNE_ATTR(quota_simul_sync, 1);
376TUNE_ATTR(quota_cache_secs, 1);
377TUNE_ATTR(stall_secs, 1); 575TUNE_ATTR(stall_secs, 1);
378TUNE_ATTR(statfs_quantum, 1); 576TUNE_ATTR(statfs_quantum, 1);
379TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 577TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
@@ -389,7 +587,6 @@ static struct attribute *tune_attrs[] = {
389 &tune_attr_complain_secs.attr, 587 &tune_attr_complain_secs.attr,
390 &tune_attr_statfs_slow.attr, 588 &tune_attr_statfs_slow.attr,
391 &tune_attr_quota_simul_sync.attr, 589 &tune_attr_quota_simul_sync.attr,
392 &tune_attr_quota_cache_secs.attr,
393 &tune_attr_stall_secs.attr, 590 &tune_attr_stall_secs.attr,
394 &tune_attr_statfs_quantum.attr, 591 &tune_attr_statfs_quantum.attr,
395 &tune_attr_recoverd_secs.attr, 592 &tune_attr_recoverd_secs.attr,
@@ -414,6 +611,11 @@ static struct attribute_group tune_group = {
414 .attrs = tune_attrs, 611 .attrs = tune_attrs,
415}; 612};
416 613
614static struct attribute_group lock_module_group = {
615 .name = "lock_module",
616 .attrs = lock_module_attrs,
617};
618
417int gfs2_sys_fs_add(struct gfs2_sbd *sdp) 619int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
418{ 620{
419 int error; 621 int error;
@@ -436,9 +638,15 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
436 if (error) 638 if (error)
437 goto fail_args; 639 goto fail_args;
438 640
641 error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
642 if (error)
643 goto fail_tune;
644
439 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); 645 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
440 return 0; 646 return 0;
441 647
648fail_tune:
649 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
442fail_args: 650fail_args:
443 sysfs_remove_group(&sdp->sd_kobj, &args_group); 651 sysfs_remove_group(&sdp->sd_kobj, &args_group);
444fail_lockstruct: 652fail_lockstruct:
@@ -455,15 +663,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
455 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 663 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
456 sysfs_remove_group(&sdp->sd_kobj, &args_group); 664 sysfs_remove_group(&sdp->sd_kobj, &args_group);
457 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 665 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
666 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
458 kobject_put(&sdp->sd_kobj); 667 kobject_put(&sdp->sd_kobj);
459} 668}
460 669
670
461static int gfs2_uevent(struct kset *kset, struct kobject *kobj, 671static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
462 struct kobj_uevent_env *env) 672 struct kobj_uevent_env *env)
463{ 673{
464 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); 674 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
675 const u8 *uuid = sdp->sd_sb.sb_uuid;
676
465 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 677 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
466 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 678 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
679 if (gfs2_uuid_valid(uuid)) {
680 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
681 "%02X%02X-%02X%02X%02X%02X%02X%02X",
682 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
683 uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
684 uuid[10], uuid[11], uuid[12], uuid[13],
685 uuid[14], uuid[15]);
686 }
467 return 0; 687 return 0;
468} 688}
469 689
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f677b8a83f0c..053752d4b27f 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -12,9 +12,8 @@
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
17#include <linux/lm_interface.h> 16#include <linux/gfs2_ondisk.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -88,9 +87,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
88 87
89 if (!tr->tr_touched) { 88 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved); 89 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh); 90 if (tr->tr_t_gh.gh_gl) {
92 gfs2_holder_uninit(&tr->tr_t_gh); 91 gfs2_glock_dq(&tr->tr_t_gh);
93 kfree(tr); 92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 }
94 return; 95 return;
95 } 96 }
96 97
@@ -106,9 +107,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
106 } 107 }
107 108
108 gfs2_log_commit(sdp, tr); 109 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh); 110 if (tr->tr_t_gh.gh_gl) {
110 gfs2_holder_uninit(&tr->tr_t_gh); 111 gfs2_glock_dq(&tr->tr_t_gh);
111 kfree(tr); 112 gfs2_holder_uninit(&tr->tr_t_gh);
113 kfree(tr);
114 }
112 115
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) 116 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL); 117 gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 374f50e95496..9d12b1118ba0 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17#include <asm/uaccess.h> 16#include <asm/uaccess.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
@@ -35,6 +34,8 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
35 34
36int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 35int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
37{ 36{
37 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
38 const struct lm_lockops *lm = ls->ls_ops;
38 va_list args; 39 va_list args;
39 40
40 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 41 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
@@ -47,8 +48,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
47 fs_err(sdp, "about to withdraw this file system\n"); 48 fs_err(sdp, "about to withdraw this file system\n");
48 BUG_ON(sdp->sd_args.ar_debug); 49 BUG_ON(sdp->sd_args.ar_debug);
49 50
50 fs_err(sdp, "telling LM to withdraw\n"); 51 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
51 gfs2_withdraw_lockproto(&sdp->sd_lockstruct); 52
53 if (lm->lm_unmount) {
54 fs_err(sdp, "telling LM to unmount\n");
55 lm->lm_unmount(sdp);
56 }
52 fs_err(sdp, "withdrawn\n"); 57 fs_err(sdp, "withdrawn\n");
53 dump_stack(); 58 dump_stack();
54 59
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
new file mode 100644
index 000000000000..b77c5bc20f8a
--- /dev/null
+++ b/fs/hfs/Kconfig
@@ -0,0 +1,12 @@
1config HFS_FS
2 tristate "Apple Macintosh file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select NLS
5 help
6 If you say Y here, you will be able to mount Macintosh-formatted
7 floppy disks and hard drive partitions with full read-write access.
8 Please read <file:Documentation/filesystems/hfs.txt> to learn about
9 the available mount options.
10
11 To compile this file system support as a module, choose M here: the
12 module will be called hfs.
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
new file mode 100644
index 000000000000..a63371815aab
--- /dev/null
+++ b/fs/hfsplus/Kconfig
@@ -0,0 +1,13 @@
1config HFSPLUS_FS
2 tristate "Apple Extended HFS file system support"
3 depends on BLOCK
4 select NLS
5 select NLS_UTF8
6 help
7 If you say Y here, you will be able to mount extended format
8 Macintosh-formatted hard drive partitions with full read-write access.
9
10 This file system is often called HFS+ and was introduced with
11 MacOS 8. It includes all Mac specific filesystem data such as
12 data forks and creator codes, but it also has several UNIX
13 style features such as file ownership and permissions.
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
new file mode 100644
index 000000000000..56bd15c5bf6c
--- /dev/null
+++ b/fs/hpfs/Kconfig
@@ -0,0 +1,14 @@
1config HPFS_FS
2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK
4 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk
7 partitions. Say Y if you want to be able to read files from and
8 write files to an OS/2 HPFS partition on your hard drive. OS/2
9 floppies however are in regular MSDOS format, so you don't need this
10 option in order to be able to read them. Read
11 <file:Documentation/filesystems/hpfs.txt>.
12
13 To compile this file system support as a module, choose M here: the
14 module will be called hpfs. If unsure, say N.
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
108 108
109 if (hugetlb_reserve_pages(inode, 109 if (hugetlb_reserve_pages(inode,
110 vma->vm_pgoff >> huge_page_order(h), 110 vma->vm_pgoff >> huge_page_order(h),
111 len >> huge_page_shift(h), vma)) 111 len >> huge_page_shift(h), vma,
112 vma->vm_flags))
112 goto out; 113 goto out;
113 114
114 ret = 0; 115 ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
947 can_do_mlock()); 948 can_do_mlock());
948} 949}
949 950
950struct file *hugetlb_file_setup(const char *name, size_t size) 951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951{ 952{
952 int error = -ENOMEM; 953 int error = -ENOMEM;
953 struct file *file; 954 struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
981 982
982 error = -ENOMEM; 983 error = -ENOMEM;
983 if (hugetlb_reserve_pages(inode, 0, 984 if (hugetlb_reserve_pages(inode, 0,
984 size >> huge_page_shift(hstate_inode(inode)), NULL)) 985 size >> huge_page_shift(hstate_inode(inode)), NULL,
986 acctflag))
985 goto out_inode; 987 goto out_inode;
986 988
987 d_instantiate(dentry, inode); 989 d_instantiate(dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d9a5d1..643ac43e5a5c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
17#include <linux/hash.h> 17#include <linux/hash.h>
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/ima.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/cdev.h> 22#include <linux/cdev.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
@@ -147,13 +148,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
147 inode->i_cdev = NULL; 148 inode->i_cdev = NULL;
148 inode->i_rdev = 0; 149 inode->i_rdev = 0;
149 inode->dirtied_when = 0; 150 inode->dirtied_when = 0;
150 if (security_inode_alloc(inode)) { 151
151 if (inode->i_sb->s_op->destroy_inode) 152 if (security_inode_alloc(inode))
152 inode->i_sb->s_op->destroy_inode(inode); 153 goto out_free_inode;
153 else 154
154 kmem_cache_free(inode_cachep, (inode)); 155 /* allocate and initialize an i_integrity */
155 return NULL; 156 if (ima_inode_alloc(inode))
156 } 157 goto out_free_security;
157 158
158 spin_lock_init(&inode->i_lock); 159 spin_lock_init(&inode->i_lock);
159 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 160 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -189,6 +190,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
189 inode->i_mapping = mapping; 190 inode->i_mapping = mapping;
190 191
191 return inode; 192 return inode;
193
194out_free_security:
195 security_inode_free(inode);
196out_free_inode:
197 if (inode->i_sb->s_op->destroy_inode)
198 inode->i_sb->s_op->destroy_inode(inode);
199 else
200 kmem_cache_free(inode_cachep, (inode));
201 return NULL;
192} 202}
193EXPORT_SYMBOL(inode_init_always); 203EXPORT_SYMBOL(inode_init_always);
194 204
@@ -359,6 +369,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
359 invalidate_inode_buffers(inode); 369 invalidate_inode_buffers(inode);
360 if (!atomic_read(&inode->i_count)) { 370 if (!atomic_read(&inode->i_count)) {
361 list_move(&inode->i_list, dispose); 371 list_move(&inode->i_list, dispose);
372 WARN_ON(inode->i_state & I_NEW);
362 inode->i_state |= I_FREEING; 373 inode->i_state |= I_FREEING;
363 count++; 374 count++;
364 continue; 375 continue;
@@ -460,6 +471,7 @@ static void prune_icache(int nr_to_scan)
460 continue; 471 continue;
461 } 472 }
462 list_move(&inode->i_list, &freeable); 473 list_move(&inode->i_list, &freeable);
474 WARN_ON(inode->i_state & I_NEW);
463 inode->i_state |= I_FREEING; 475 inode->i_state |= I_FREEING;
464 nr_pruned++; 476 nr_pruned++;
465 } 477 }
@@ -656,6 +668,7 @@ void unlock_new_inode(struct inode *inode)
656 * just created it (so there can be no old holders 668 * just created it (so there can be no old holders
657 * that haven't tested I_LOCK). 669 * that haven't tested I_LOCK).
658 */ 670 */
671 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
659 inode->i_state &= ~(I_LOCK|I_NEW); 672 inode->i_state &= ~(I_LOCK|I_NEW);
660 wake_up_inode(inode); 673 wake_up_inode(inode);
661} 674}
@@ -1145,6 +1158,7 @@ void generic_delete_inode(struct inode *inode)
1145 1158
1146 list_del_init(&inode->i_list); 1159 list_del_init(&inode->i_list);
1147 list_del_init(&inode->i_sb_list); 1160 list_del_init(&inode->i_sb_list);
1161 WARN_ON(inode->i_state & I_NEW);
1148 inode->i_state |= I_FREEING; 1162 inode->i_state |= I_FREEING;
1149 inodes_stat.nr_inodes--; 1163 inodes_stat.nr_inodes--;
1150 spin_unlock(&inode_lock); 1164 spin_unlock(&inode_lock);
@@ -1186,16 +1200,19 @@ static void generic_forget_inode(struct inode *inode)
1186 spin_unlock(&inode_lock); 1200 spin_unlock(&inode_lock);
1187 return; 1201 return;
1188 } 1202 }
1203 WARN_ON(inode->i_state & I_NEW);
1189 inode->i_state |= I_WILL_FREE; 1204 inode->i_state |= I_WILL_FREE;
1190 spin_unlock(&inode_lock); 1205 spin_unlock(&inode_lock);
1191 write_inode_now(inode, 1); 1206 write_inode_now(inode, 1);
1192 spin_lock(&inode_lock); 1207 spin_lock(&inode_lock);
1208 WARN_ON(inode->i_state & I_NEW);
1193 inode->i_state &= ~I_WILL_FREE; 1209 inode->i_state &= ~I_WILL_FREE;
1194 inodes_stat.nr_unused--; 1210 inodes_stat.nr_unused--;
1195 hlist_del_init(&inode->i_hash); 1211 hlist_del_init(&inode->i_hash);
1196 } 1212 }
1197 list_del_init(&inode->i_list); 1213 list_del_init(&inode->i_list);
1198 list_del_init(&inode->i_sb_list); 1214 list_del_init(&inode->i_sb_list);
1215 WARN_ON(inode->i_state & I_NEW);
1199 inode->i_state |= I_FREEING; 1216 inode->i_state |= I_FREEING;
1200 inodes_stat.nr_inodes--; 1217 inodes_stat.nr_inodes--;
1201 spin_unlock(&inode_lock); 1218 spin_unlock(&inode_lock);
@@ -1283,6 +1300,40 @@ sector_t bmap(struct inode * inode, sector_t block)
1283} 1300}
1284EXPORT_SYMBOL(bmap); 1301EXPORT_SYMBOL(bmap);
1285 1302
1303/*
1304 * With relative atime, only update atime if the previous atime is
1305 * earlier than either the ctime or mtime or if at least a day has
1306 * passed since the last atime update.
1307 */
1308static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1309 struct timespec now)
1310{
1311
1312 if (!(mnt->mnt_flags & MNT_RELATIME))
1313 return 1;
1314 /*
1315 * Is mtime younger than atime? If yes, update atime:
1316 */
1317 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
1318 return 1;
1319 /*
1320 * Is ctime younger than atime? If yes, update atime:
1321 */
1322 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
1323 return 1;
1324
1325 /*
1326 * Is the previous atime value older than a day? If yes,
1327 * update atime:
1328 */
1329 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
1330 return 1;
1331 /*
1332 * Good, we can skip the atime update:
1333 */
1334 return 0;
1335}
1336
1286/** 1337/**
1287 * touch_atime - update the access time 1338 * touch_atime - update the access time
1288 * @mnt: mount the inode is accessed on 1339 * @mnt: mount the inode is accessed on
@@ -1310,17 +1361,12 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1310 goto out; 1361 goto out;
1311 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1362 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1312 goto out; 1363 goto out;
1313 if (mnt->mnt_flags & MNT_RELATIME) {
1314 /*
1315 * With relative atime, only update atime if the previous
1316 * atime is earlier than either the ctime or mtime.
1317 */
1318 if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
1319 timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
1320 goto out;
1321 }
1322 1364
1323 now = current_fs_time(inode->i_sb); 1365 now = current_fs_time(inode->i_sb);
1366
1367 if (!relatime_need_update(mnt, inode, now))
1368 goto out;
1369
1324 if (timespec_equal(&inode->i_atime, &now)) 1370 if (timespec_equal(&inode->i_atime, &now))
1325 goto out; 1371 goto out;
1326 1372
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..0d8ac497b3d5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,7 +43,7 @@ extern void __init chrdev_init(void);
43/* 43/*
44 * exec.c 44 * exec.c
45 */ 45 */
46extern void check_unsafe_exec(struct linux_binprm *); 46extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);
47 47
48/* 48/*
49 * namespace.c 49 * namespace.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 240ec63984cb..ac2d47e43926 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp)
404 if (O_NONBLOCK != O_NDELAY) 404 if (O_NONBLOCK != O_NDELAY)
405 flag |= O_NDELAY; 405 flag |= O_NDELAY;
406#endif 406#endif
407 spin_lock(&filp->f_lock);
407 if (on) 408 if (on)
408 filp->f_flags |= flag; 409 filp->f_flags |= flag;
409 else 410 else
410 filp->f_flags &= ~flag; 411 filp->f_flags &= ~flag;
412 spin_unlock(&filp->f_lock);
411 return error; 413 return error;
412} 414}
413 415
@@ -425,18 +427,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
425 /* Did FASYNC state change ? */ 427 /* Did FASYNC state change ? */
426 if ((flag ^ filp->f_flags) & FASYNC) { 428 if ((flag ^ filp->f_flags) & FASYNC) {
427 if (filp->f_op && filp->f_op->fasync) 429 if (filp->f_op && filp->f_op->fasync)
430 /* fasync() adjusts filp->f_flags */
428 error = filp->f_op->fasync(fd, filp, on); 431 error = filp->f_op->fasync(fd, filp, on);
429 else 432 else
430 error = -ENOTTY; 433 error = -ENOTTY;
431 } 434 }
432 if (error) 435 return error < 0 ? error : 0;
433 return error;
434
435 if (on)
436 filp->f_flags |= FASYNC;
437 else
438 filp->f_flags &= ~FASYNC;
439 return error;
440} 436}
441 437
442static int ioctl_fsfreeze(struct file *filp) 438static int ioctl_fsfreeze(struct file *filp)
@@ -499,17 +495,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
499 break; 495 break;
500 496
501 case FIONBIO: 497 case FIONBIO:
502 /* BKL needed to avoid races tweaking f_flags */
503 lock_kernel();
504 error = ioctl_fionbio(filp, argp); 498 error = ioctl_fionbio(filp, argp);
505 unlock_kernel();
506 break; 499 break;
507 500
508 case FIOASYNC: 501 case FIOASYNC:
509 /* BKL needed to avoid races tweaking f_flags */
510 lock_kernel();
511 error = ioctl_fioasync(fd, filp, argp); 502 error = ioctl_fioasync(fd, filp, argp);
512 unlock_kernel();
513 break; 503 break;
514 504
515 case FIOQSIZE: 505 case FIOQSIZE:
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
new file mode 100644
index 000000000000..8ab9878e3671
--- /dev/null
+++ b/fs/isofs/Kconfig
@@ -0,0 +1,39 @@
1config ISO9660_FS
2 tristate "ISO 9660 CDROM file system support"
3 help
4 This is the standard file system used on CD-ROMs. It was previously
5 known as "High Sierra File System" and is called "hsfs" on other
6 Unix systems. The so-called Rock-Ridge extensions which allow for
7 long Unix filenames and symbolic links are also supported by this
8 driver. If you have a CD-ROM drive and want to do more with it than
9 just listen to audio CDs and watch its LEDs, say Y (and read
10 <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
11 available from <http://www.tldp.org/docs.html#howto>), thereby
12 enlarging your kernel by about 27 KB; otherwise say N.
13
14 To compile this file system support as a module, choose M here: the
15 module will be called isofs.
16
17config JOLIET
18 bool "Microsoft Joliet CDROM extensions"
19 depends on ISO9660_FS
20 select NLS
21 help
22 Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
23 which allows for long filenames in unicode format (unicode is the
24 new 16 bit character code, successor to ASCII, which encodes the
25 characters of almost all languages of the world; see
26 <http://www.unicode.org/> for more information). Say Y here if you
27 want to be able to read Joliet CD-ROMs under Linux.
28
29config ZISOFS
30 bool "Transparent decompression extension"
31 depends on ISO9660_FS
32 select ZLIB_INFLATE
33 help
34 This is a Linux-specific extension to RockRidge which lets you store
35 data in compressed form on a CD-ROM and have it transparently
36 decompressed when the CD-ROM is accessed. See
37 <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
38 necessary to create such a filesystem. Say Y here if you want to be
39 able to read such compressed CD-ROMs.
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52d7dc8..e79c07812afa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
427} 427}
428 428
429/* 429/*
430 * Called under j_state_lock. Returns true if a transaction was started. 430 * Called under j_state_lock. Returns true if a transaction commit was started.
431 */ 431 */
432int __log_start_commit(journal_t *journal, tid_t target) 432int __log_start_commit(journal_t *journal, tid_t target)
433{ 433{
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
495 495
496/* 496/*
497 * Start a commit of the current running transaction (if any). Returns true 497 * Start a commit of the current running transaction (if any). Returns true
498 * if a transaction was started, and fills its tid in at *ptid 498 * if a transaction is going to be committed (or is currently already
499 * committing), and fills its tid in at *ptid
499 */ 500 */
500int journal_start_commit(journal_t *journal, tid_t *ptid) 501int journal_start_commit(journal_t *journal, tid_t *ptid)
501{ 502{
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
505 if (journal->j_running_transaction) { 506 if (journal->j_running_transaction) {
506 tid_t tid = journal->j_running_transaction->t_tid; 507 tid_t tid = journal->j_running_transaction->t_tid;
507 508
508 ret = __log_start_commit(journal, tid); 509 __log_start_commit(journal, tid);
509 if (ret && ptid) 510 /* There's a running transaction and we've just made sure
511 * it's commit has been scheduled. */
512 if (ptid)
510 *ptid = tid; 513 *ptid = tid;
511 } else if (journal->j_committing_transaction && ptid) { 514 ret = 1;
515 } else if (journal->j_committing_transaction) {
512 /* 516 /*
513 * If ext3_write_super() recently started a commit, then we 517 * If ext3_write_super() recently started a commit, then we
514 * have to wait for completion of that transaction 518 * have to wait for completion of that transaction
515 */ 519 */
516 *ptid = journal->j_committing_transaction->t_tid; 520 if (ptid)
521 *ptid = journal->j_committing_transaction->t_tid;
517 ret = 1; 522 ret = 1;
518 } 523 }
519 spin_unlock(&journal->j_state_lock); 524 spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 56675306ed81..58144102bf25 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -37,10 +37,10 @@
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/page.h> 43#include <asm/page.h>
43#include <asm/div64.h>
44 44
45EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
46EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
450} 450}
451 451
452/* 452/*
453 * Called under j_state_lock. Returns true if a transaction was started. 453 * Called under j_state_lock. Returns true if a transaction commit was started.
454 */ 454 */
455int __jbd2_log_start_commit(journal_t *journal, tid_t target) 455int __jbd2_log_start_commit(journal_t *journal, tid_t target)
456{ 456{
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
518 518
519/* 519/*
520 * Start a commit of the current running transaction (if any). Returns true 520 * Start a commit of the current running transaction (if any). Returns true
521 * if a transaction was started, and fills its tid in at *ptid 521 * if a transaction is going to be committed (or is currently already
522 * committing), and fills its tid in at *ptid
522 */ 523 */
523int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) 524int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
524{ 525{
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
528 if (journal->j_running_transaction) { 529 if (journal->j_running_transaction) {
529 tid_t tid = journal->j_running_transaction->t_tid; 530 tid_t tid = journal->j_running_transaction->t_tid;
530 531
531 ret = __jbd2_log_start_commit(journal, tid); 532 __jbd2_log_start_commit(journal, tid);
532 if (ret && ptid) 533 /* There's a running transaction and we've just made sure
534 * it's commit has been scheduled. */
535 if (ptid)
533 *ptid = tid; 536 *ptid = tid;
534 } else if (journal->j_committing_transaction && ptid) { 537 ret = 1;
538 } else if (journal->j_committing_transaction) {
535 /* 539 /*
536 * If ext3_write_super() recently started a commit, then we 540 * If ext3_write_super() recently started a commit, then we
537 * have to wait for completion of that transaction 541 * have to wait for completion of that transaction
538 */ 542 */
539 *ptid = journal->j_committing_transaction->t_tid; 543 if (ptid)
544 *ptid = journal->j_committing_transaction->t_tid;
540 ret = 1; 545 ret = 1;
541 } 546 }
542 spin_unlock(&journal->j_state_lock); 547 spin_unlock(&journal->j_state_lock);
@@ -846,8 +851,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 851 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
847 seq_printf(seq, " %ums logging transaction\n", 852 seq_printf(seq, " %ums logging transaction\n",
848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 853 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n", 854 seq_printf(seq, " %lluus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000)); 855 div_u64(s->journal->j_average_commit_time, 1000));
851 seq_printf(seq, " %lu handles per transaction\n", 856 seq_printf(seq, " %lu handles per transaction\n",
852 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 857 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
853 seq_printf(seq, " %lu blocks per transaction\n", 858 seq_printf(seq, " %lu blocks per transaction\n",
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7d..28ce21d8598e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
2129} 2129}
2130 2130
2131/* 2131/*
2132 * This function must be called when inode is journaled in ordered mode 2132 * File truncate and transaction commit interact with each other in a
2133 * before truncation happens. It starts writeout of truncated part in 2133 * non-trivial way. If a transaction writing data block A is
2134 * case it is in the committing transaction so that we stand to ordered 2134 * committing, we cannot discard the data by truncate until we have
2135 * mode consistency guarantees. 2135 * written them. Otherwise if we crashed after the transaction with
2136 * write has committed but before the transaction with truncate has
2137 * committed, we could see stale data in block A. This function is a
2138 * helper to solve this problem. It starts writeout of the truncated
2139 * part in case it is in the committing transaction.
2140 *
2141 * Filesystem code must call this function when inode is journaled in
2142 * ordered mode before truncation happens and after the inode has been
2143 * placed on orphan list with the new inode size. The second condition
2144 * avoids the race that someone writes new data and we start
2145 * committing the transaction after this function has been called but
2146 * before a transaction for truncate is started (and furthermore it
2147 * allows us to optimize the case where the addition to orphan list
2148 * happens in the same transaction as write --- we don't have to write
2149 * any data in such case).
2136 */ 2150 */
2137int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, 2151int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2152 struct jbd2_inode *jinode,
2138 loff_t new_size) 2153 loff_t new_size)
2139{ 2154{
2140 journal_t *journal; 2155 transaction_t *inode_trans, *commit_trans;
2141 transaction_t *commit_trans;
2142 int ret = 0; 2156 int ret = 0;
2143 2157
2144 if (!inode->i_transaction && !inode->i_next_transaction) 2158 /* This is a quick check to avoid locking if not necessary */
2159 if (!jinode->i_transaction)
2145 goto out; 2160 goto out;
2146 journal = inode->i_transaction->t_journal; 2161 /* Locks are here just to force reading of recent values, it is
2162 * enough that the transaction was not committing before we started
2163 * a transaction adding the inode to orphan list */
2147 spin_lock(&journal->j_state_lock); 2164 spin_lock(&journal->j_state_lock);
2148 commit_trans = journal->j_committing_transaction; 2165 commit_trans = journal->j_committing_transaction;
2149 spin_unlock(&journal->j_state_lock); 2166 spin_unlock(&journal->j_state_lock);
2150 if (inode->i_transaction == commit_trans) { 2167 spin_lock(&journal->j_list_lock);
2151 ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, 2168 inode_trans = jinode->i_transaction;
2169 spin_unlock(&journal->j_list_lock);
2170 if (inode_trans == commit_trans) {
2171 ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
2152 new_size, LLONG_MAX); 2172 new_size, LLONG_MAX);
2153 if (ret) 2173 if (ret)
2154 jbd2_journal_abort(journal, ret); 2174 jbd2_journal_abort(journal, ret);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3cceef4ad2b7..e9580104b6ba 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c)
95 spin_unlock(&c->erase_completion_lock); 95 spin_unlock(&c->erase_completion_lock);
96 96
97 97
98 /* This thread is purely an optimisation. But if it runs when 98 /* Problem - immediately after bootup, the GCD spends a lot
99 other things could be running, it actually makes things a 99 * of time in places like jffs2_kill_fragtree(); so much so
100 lot worse. Use yield() and put it at the back of the runqueue 100 * that userspace processes (like gdm and X) are starved
101 every time. Especially during boot, pulling an inode in 101 * despite plenty of cond_resched()s and renicing. Yield()
102 with read_inode() is much preferable to having the GC thread 102 * doesn't help, either (presumably because userspace and GCD
103 get there first. */ 103 * are generally competing for a higher latency resource -
104 yield(); 104 * disk).
105 * This forces the GCD to slow the hell down. Pulling an
106 * inode in with read_inode() is much preferable to having
107 * the GC thread get there first. */
108 schedule_timeout_interruptible(msecs_to_jiffies(50));
105 109
106 /* Put_super will send a SIGKILL and then wait on the sem. 110 /* Put_super will send a SIGKILL and then wait on the sem.
107 */ 111 */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 6ca08ad887c0..1fc1e92356ee 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
220 struct jffs2_tmp_dnode_info *tn) 220 struct jffs2_tmp_dnode_info *tn)
221{ 221{
222 uint32_t fn_end = tn->fn->ofs + tn->fn->size; 222 uint32_t fn_end = tn->fn->ofs + tn->fn->size;
223 struct jffs2_tmp_dnode_info *this; 223 struct jffs2_tmp_dnode_info *this, *ptn;
224 224
225 dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); 225 dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
226 226
@@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
251 if (this) { 251 if (this) {
252 /* If the node is coincident with another at a lower address, 252 /* If the node is coincident with another at a lower address,
253 back up until the other node is found. It may be relevant */ 253 back up until the other node is found. It may be relevant */
254 while (this->overlapped) 254 while (this->overlapped) {
255 this = tn_prev(this); 255 ptn = tn_prev(this);
256 256 if (!ptn) {
257 /* First node should never be marked overlapped */ 257 /*
258 BUG_ON(!this); 258 * We killed a node which set the overlapped
259 * flags during the scan. Fix it up.
260 */
261 this->overlapped = 0;
262 break;
263 }
264 this = ptn;
265 }
259 dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); 266 dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
260 } 267 }
261 268
@@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
360 } 367 }
361 if (!this->overlapped) 368 if (!this->overlapped)
362 break; 369 break;
363 this = tn_prev(this); 370
371 ptn = tn_prev(this);
372 if (!ptn) {
373 /*
374 * We killed a node which set the overlapped
375 * flags during the scan. Fix it up.
376 */
377 this->overlapped = 0;
378 break;
379 }
380 this = ptn;
364 } 381 }
365 } 382 }
366 383
@@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
456 eat_last(&rii->tn_root, &last->rb); 473 eat_last(&rii->tn_root, &last->rb);
457 ver_insert(&ver_root, last); 474 ver_insert(&ver_root, last);
458 475
459 if (unlikely(last->overlapped)) 476 if (unlikely(last->overlapped)) {
460 continue; 477 if (pen)
478 continue;
479 /*
480 * We killed a node which set the overlapped
481 * flags during the scan. Fix it up.
482 */
483 last->overlapped = 0;
484 }
461 485
462 /* Now we have a bunch of nodes in reverse version 486 /* Now we have a bunch of nodes in reverse version
463 order, in the tree at ver_root. Most of the time, 487 order, in the tree at ver_root. Most of the time,
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
new file mode 100644
index 000000000000..9ff619a6f9cc
--- /dev/null
+++ b/fs/jfs/Kconfig
@@ -0,0 +1,49 @@
1config JFS_FS
2 tristate "JFS filesystem support"
3 select NLS
4 help
 5 This is a port of IBM's Journaled Filesystem. More information is
 6 available in the file <file:Documentation/filesystems/jfs.txt>.
7
8 If you do not intend to use the JFS filesystem, say N.
9
10config JFS_POSIX_ACL
11 bool "JFS POSIX Access Control Lists"
12 depends on JFS_FS
13 select FS_POSIX_ACL
14 help
15 Posix Access Control Lists (ACLs) support permissions for users and
16 groups beyond the owner/group/world scheme.
17
18 To learn more about Access Control Lists, visit the Posix ACLs for
19 Linux website <http://acl.bestbits.at/>.
20
21 If you don't know what Access Control Lists are, say N
22
23config JFS_SECURITY
24 bool "JFS Security Labels"
25 depends on JFS_FS
26 help
27 Security labels support alternative access control models
28 implemented by security modules like SELinux. This option
29 enables an extended attribute handler for file security
30 labels in the jfs filesystem.
31
32 If you are not using a security module that requires using
33 extended attributes for file security labels, say N.
34
35config JFS_DEBUG
36 bool "JFS debugging"
37 depends on JFS_FS
38 help
39 If you are experiencing any problems with the JFS filesystem, say
40 Y here. This will result in additional debugging messages to be
41 written to the system log. Under normal circumstances, this
42 results in very little overhead.
43
44config JFS_STATISTICS
45 bool "JFS statistics"
46 depends on JFS_FS
47 help
48 Enabling this option will cause statistics from the JFS file system
49 to be made available to the user in the /proc/fs/jfs/ directory.
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..aedc47a264c1 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
139 return 0; 139 return 0;
140} 140}
141 141
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
144 struct in6_addr *addr_mapped)
145{
146 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
147
148 switch (sap->sa_family) {
149 case AF_INET6:
150 return &((const struct sockaddr_in6 *)sap)->sin6_addr;
151 case AF_INET:
152 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
153 return addr_mapped;
154 }
155
156 return NULL;
157}
158
159/*
160 * If lockd is using a PF_INET6 listener, all incoming requests appear
 161 * to come from AF_INET6 remotes. The addresses of AF_INET remotes are
 162 * mapped to AF_INET6 automatically by the network layer. In case the
163 * user passed an AF_INET server address at mount time, ensure both
164 * addresses are AF_INET6 before comparing them.
165 */
166static int nlmclnt_cmp_addr(const struct nlm_host *host,
167 const struct sockaddr *sap)
168{
169 const struct in6_addr *addr1;
170 const struct in6_addr *addr2;
171 struct in6_addr addr1_mapped;
172 struct in6_addr addr2_mapped;
173
174 addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
175 if (likely(addr1 != NULL)) {
176 addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
177 if (likely(addr2 != NULL))
178 return ipv6_addr_equal(addr1, addr2);
179 }
180
181 return 0;
182}
183#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
184static int nlmclnt_cmp_addr(const struct nlm_host *host,
185 const struct sockaddr *sap)
186{
187 return nlm_cmp_addr(nlm_addr(host), sap);
188}
189#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
190
142/* 191/*
143 * The server lockd has called us back to tell us the lock was granted 192 * The server lockd has called us back to tell us the lock was granted
144 */ 193 */
@@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
166 */ 215 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 216 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 217 continue;
169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) 218 if (!nlmclnt_cmp_addr(block->b_host, addr))
170 continue; 219 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 220 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 221 continue;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6063a8e4b9f3..763b78a6e9de 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 ret = nlm_lck_denied; 429 ret = nlm_lck_denied;
430 goto out; 430 break;
431 case FILE_LOCK_DEFERRED: 431 case FILE_LOCK_DEFERRED:
432 if (wait) 432 if (wait)
433 break; 433 break;
@@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 443 goto out;
444 } 444 }
445 445
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
446 ret = nlm_lck_blocked; 450 ret = nlm_lck_blocked;
447 451
448 /* Append to list of blocked */ 452 /* Append to list of blocked */
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
new file mode 100644
index 000000000000..0fd7ca994264
--- /dev/null
+++ b/fs/minix/Kconfig
@@ -0,0 +1,17 @@
1config MINIX_FS
2 tristate "Minix file system support"
3 depends on BLOCK
4 help
5 Minix is a simple operating system used in many classes about OS's.
6 The minix file system (method to organize files on a hard disk
7 partition or a floppy disk) was the original file system for Linux,
8 but has been superseded by the second extended file system ext2fs.
9 You don't want to use the minix file system on your hard disk
10 because of certain built-in restrictions, but it is sometimes found
11 on older Linux floppy disks. This option will enlarge your kernel
12 by about 28 KB. If unsure, say N.
13
14 To compile this file system support as a module, choose M here: the
15 module will be called minix. Note that the file system of your root
16 partition (the one containing the directory /) cannot be compiled as
17 a module.
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index d1d1eb84679d..618865b3128b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * Copyright (C) 1996 Gertjan van Wingerde (gertjan@cs.vu.nl) 6 * Copyright (C) 1996 Gertjan van Wingerde
7 * Minix V2 fs support. 7 * Minix V2 fs support.
8 * 8 *
9 * Modified for 680x0 by Andreas Schwab 9 * Modified for 680x0 by Andreas Schwab
diff --git a/fs/namei.c b/fs/namei.c
index bbc15c237558..199317642ad6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -24,6 +24,7 @@
24#include <linux/fsnotify.h> 24#include <linux/fsnotify.h>
25#include <linux/personality.h> 25#include <linux/personality.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/ima.h>
27#include <linux/syscalls.h> 28#include <linux/syscalls.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/audit.h> 30#include <linux/audit.h>
@@ -850,6 +851,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
850 if (err == -EAGAIN) 851 if (err == -EAGAIN)
851 err = inode_permission(nd->path.dentry->d_inode, 852 err = inode_permission(nd->path.dentry->d_inode,
852 MAY_EXEC); 853 MAY_EXEC);
854 if (!err)
855 err = ima_path_check(&nd->path, MAY_EXEC);
853 if (err) 856 if (err)
854 break; 857 break;
855 858
@@ -1509,6 +1512,11 @@ int may_open(struct path *path, int acc_mode, int flag)
1509 error = inode_permission(inode, acc_mode); 1512 error = inode_permission(inode, acc_mode);
1510 if (error) 1513 if (error)
1511 return error; 1514 return error;
1515
1516 error = ima_path_check(path,
1517 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
1518 if (error)
1519 return error;
1512 /* 1520 /*
1513 * An append-only file must be opened in append mode for writing. 1521 * An append-only file must be opened in append mode for writing.
1514 */ 1522 */
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd18..f0e753097353 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
614 */ 614 */
615 for_each_possible_cpu(cpu) { 615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); 616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 if (cpu_writer->mnt != mnt)
618 continue;
619 spin_lock(&cpu_writer->lock); 617 spin_lock(&cpu_writer->lock);
618 if (cpu_writer->mnt != mnt) {
619 spin_unlock(&cpu_writer->lock);
620 continue;
621 }
620 atomic_add(cpu_writer->count, &mnt->__mnt_writers); 622 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
621 cpu_writer->count = 0; 623 cpu_writer->count = 0;
622 /* 624 /*
@@ -778,6 +780,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
778 { MNT_NOATIME, ",noatime" }, 780 { MNT_NOATIME, ",noatime" },
779 { MNT_NODIRATIME, ",nodiratime" }, 781 { MNT_NODIRATIME, ",nodiratime" },
780 { MNT_RELATIME, ",relatime" }, 782 { MNT_RELATIME, ",relatime" },
783 { MNT_STRICTATIME, ",strictatime" },
781 { 0, NULL } 784 { 0, NULL }
782 }; 785 };
783 const struct proc_fs_info *fs_infop; 786 const struct proc_fs_info *fs_infop;
@@ -1917,6 +1920,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1917 if (data_page) 1920 if (data_page)
1918 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1921 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1919 1922
1923 /* Default to relatime */
1924 mnt_flags |= MNT_RELATIME;
1925
1920 /* Separate the per-mountpoint flags */ 1926 /* Separate the per-mountpoint flags */
1921 if (flags & MS_NOSUID) 1927 if (flags & MS_NOSUID)
1922 mnt_flags |= MNT_NOSUID; 1928 mnt_flags |= MNT_NOSUID;
@@ -1928,13 +1934,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1928 mnt_flags |= MNT_NOATIME; 1934 mnt_flags |= MNT_NOATIME;
1929 if (flags & MS_NODIRATIME) 1935 if (flags & MS_NODIRATIME)
1930 mnt_flags |= MNT_NODIRATIME; 1936 mnt_flags |= MNT_NODIRATIME;
1931 if (flags & MS_RELATIME) 1937 if (flags & MS_STRICTATIME)
1932 mnt_flags |= MNT_RELATIME; 1938 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
1933 if (flags & MS_RDONLY) 1939 if (flags & MS_RDONLY)
1934 mnt_flags |= MNT_READONLY; 1940 mnt_flags |= MNT_READONLY;
1935 1941
1936 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 1942 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
1937 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); 1943 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
1944 MS_STRICTATIME);
1938 1945
1939 /* ... and get the mountpoint */ 1946 /* ... and get the mountpoint */
1940 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); 1947 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig
index 142808427b25..c931cf22a1f6 100644
--- a/fs/ncpfs/Kconfig
+++ b/fs/ncpfs/Kconfig
@@ -1,6 +1,27 @@
1# 1#
2# NCP Filesystem configuration 2# NCP Filesystem configuration
3# 3#
4config NCP_FS
5 tristate "NCP file system support (to mount NetWare volumes)"
6 depends on IPX!=n || INET
7 help
8 NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
9 used by Novell NetWare clients to talk to file servers. It is to
10 IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you
11 to mount NetWare file server volumes and to access them just like
12 any other Unix directory. For details, please read the file
13 <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
14 the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
15
16 You do not have to say Y here if you want your Linux box to act as a
17 file *server* for Novell NetWare clients.
18
19 General information about how to connect Linux, Windows machines and
20 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
21
22 To compile this as a module, choose M here: the module will be called
23 ncpfs. Say N unless you are connected to a Novell network.
24
4config NCPFS_PACKET_SIGNING 25config NCPFS_PACKET_SIGNING
5 bool "Packet signatures" 26 bool "Packet signatures"
6 depends on NCP_FS 27 depends on NCP_FS
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 000000000000..36fe20d6eba2
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,86 @@
1config NFS_FS
2 tristate "NFS client support"
3 depends on INET
4 select LOCKD
5 select SUNRPC
6 select NFS_ACL_SUPPORT if NFS_V3_ACL
7 help
8 Choose Y here if you want to access files residing on other
9 computers using Sun's Network File System protocol. To compile
10 this file system support as a module, choose M here: the module
11 will be called nfs.
12
13 To mount file systems exported by NFS servers, you also need to
14 install the user space mount.nfs command which can be found in
15 the Linux nfs-utils package, available from http://linux-nfs.org/.
16 Information about using the mount command is available in the
17 mount(8) man page. More detail about the Linux NFS client
18 implementation is available via the nfs(5) man page.
19
20 Below you can choose which versions of the NFS protocol are
21 available in the kernel to mount NFS servers. Support for NFS
22 version 2 (RFC 1094) is always available when NFS_FS is selected.
23
24 To configure a system which mounts its root file system via NFS
25 at boot time, say Y here, select "Kernel level IP
26 autoconfiguration" in the NETWORK menu, and select "Root file
27 system on NFS" below. You cannot compile this file system as a
28 module in this case.
29
30 If unsure, say N.
31
32config NFS_V3
33 bool "NFS client support for NFS version 3"
34 depends on NFS_FS
35 help
36 This option enables support for version 3 of the NFS protocol
37 (RFC 1813) in the kernel's NFS client.
38
39 If unsure, say Y.
40
41config NFS_V3_ACL
42 bool "NFS client support for the NFSv3 ACL protocol extension"
43 depends on NFS_V3
44 help
45 Some NFS servers support an auxiliary NFSv3 ACL protocol that
46 Sun added to Solaris but never became an official part of the
47 NFS version 3 protocol. This protocol extension allows
48 applications on NFS clients to manipulate POSIX Access Control
49 Lists on files residing on NFS servers. NFS servers enforce
50 ACLs on local files whether this protocol is available or not.
51
52 Choose Y here if your NFS server supports the Solaris NFSv3 ACL
53 protocol extension and you want your NFS client to allow
54 applications to access and modify ACLs on files on the server.
55
56 Most NFS servers don't support the Solaris NFSv3 ACL protocol
57 extension. You can choose N here or specify the "noacl" mount
58 option to prevent your NFS client from trying to use the NFSv3
59 ACL protocol.
60
61 If unsure, say N.
62
63config NFS_V4
64 bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
65 depends on NFS_FS && EXPERIMENTAL
66 select RPCSEC_GSS_KRB5
67 help
68 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client.
70
71 To mount NFS servers using NFSv4, you also need to install user
72 space programs which can be found in the Linux nfs-utils package,
73 available from http://linux-nfs.org/.
74
75 If unsure, say N.
76
77config ROOT_NFS
78 bool "Root file system on NFS"
79 depends on NFS_FS=y && IP_PNP
80 help
81 If you want your system to mount its root file system via NFS,
82 choose Y here. This is common practice for managing systems
83 without local permanent storage. For details, read
84 <file:Documentation/filesystems/nfsroot.txt>.
85
86 Most people say N here.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b728f3565a1..574158ae2398 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
255 } 255 }
256 return 0; 256 return 0;
257} 257}
258
259/*
260 * Test if two ip6 socket addresses refer to the same socket by
261 * comparing relevant fields. The padding bytes specifically, are not
262 * compared. sin6_flowinfo is not compared because it only affects QoS
263 * and sin6_scope_id is only compared if the address is "link local"
264 * because "link local" addresses need only be unique to a specific
265 * link. Conversely, ordinary unicast addresses might have different
266 * sin6_scope_id.
267 *
268 * The caller should ensure both socket addresses are AF_INET6.
269 */
270static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
271 const struct sockaddr *sa2)
272{
273 const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
274 const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
275
 276 if (!ipv6_addr_equal(&saddr1->sin6_addr,
 277 &saddr2->sin6_addr))
278 return 0;
279 if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
280 saddr1->sin6_scope_id != saddr2->sin6_scope_id)
281 return 0;
282 return saddr1->sin6_port == saddr2->sin6_port;
283}
258#else 284#else
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 285static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
260 const struct sockaddr_in *sa2) 286 const struct sockaddr_in *sa2)
@@ -270,9 +296,52 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 296 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
271 (const struct sockaddr_in *)sa2); 297 (const struct sockaddr_in *)sa2);
272} 298}
299
300static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
301 const struct sockaddr * sa2)
302{
303 return 0;
304}
273#endif 305#endif
274 306
275/* 307/*
308 * Test if two ip4 socket addresses refer to the same socket, by
309 * comparing relevant fields. The padding bytes specifically, are
310 * not compared.
311 *
312 * The caller should ensure both socket addresses are AF_INET.
313 */
314static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
315 const struct sockaddr *sa2)
316{
317 const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
318 const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
319
320 if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
321 return 0;
322 return saddr1->sin_port == saddr2->sin_port;
323}
324
325/*
326 * Test if two socket addresses represent the same actual socket,
327 * by comparing (only) relevant fields.
328 */
329static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
330 const struct sockaddr *sa2)
331{
332 if (sa1->sa_family != sa2->sa_family)
333 return 0;
334
335 switch (sa1->sa_family) {
336 case AF_INET:
337 return nfs_sockaddr_cmp_ip4(sa1, sa2);
338 case AF_INET6:
339 return nfs_sockaddr_cmp_ip6(sa1, sa2);
340 }
341 return 0;
342}
343
344/*
276 * Find a client by IP address and protocol version 345 * Find a client by IP address and protocol version
277 * - returns NULL if no such client 346 * - returns NULL if no such client
278 */ 347 */
@@ -344,8 +413,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
344static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) 413static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
345{ 414{
346 struct nfs_client *clp; 415 struct nfs_client *clp;
416 const struct sockaddr *sap = data->addr;
347 417
348 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 418 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
419 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
349 /* Don't match clients that failed to initialise properly */ 420 /* Don't match clients that failed to initialise properly */
350 if (clp->cl_cons_state < 0) 421 if (clp->cl_cons_state < 0)
351 continue; 422 continue;
@@ -358,7 +429,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
358 continue; 429 continue;
359 430
360 /* Match the full socket address */ 431 /* Match the full socket address */
361 if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) 432 if (!nfs_sockaddr_cmp(sap, clap))
362 continue; 433 continue;
363 434
364 atomic_inc(&clp->cl_count); 435 atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e35c8199f82f..672368f865ca 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
1892 cache.cred = cred; 1892 cache.cred = cred;
1893 cache.jiffies = jiffies; 1893 cache.jiffies = jiffies;
1894 status = NFS_PROTO(inode)->access(inode, &cache); 1894 status = NFS_PROTO(inode)->access(inode, &cache);
1895 if (status != 0) 1895 if (status != 0) {
1896 if (status == -ESTALE) {
1897 nfs_zap_caches(inode);
1898 if (!S_ISDIR(inode->i_mode))
1899 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
1900 }
1896 return status; 1901 return status;
1902 }
1897 nfs_access_add_cache(inode, &cache); 1903 nfs_access_add_cache(inode, &cache);
1898out: 1904out:
1899 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 1905 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index cef62557c87d..6bbf0e6daad2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
292{ 292{
293 struct nfs_server *server = NFS_SERVER(inode); 293 struct nfs_server *server = NFS_SERVER(inode);
294 struct nfs_fattr fattr; 294 struct nfs_fattr fattr;
295 struct page *pages[NFSACL_MAXPAGES] = { }; 295 struct page *pages[NFSACL_MAXPAGES];
296 struct nfs3_setaclargs args = { 296 struct nfs3_setaclargs args = {
297 .inode = inode, 297 .inode = inode,
298 .mask = NFS_ACL, 298 .mask = NFS_ACL,
@@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
303 .rpc_argp = &args, 303 .rpc_argp = &args,
304 .rpc_resp = &fattr, 304 .rpc_resp = &fattr,
305 }; 305 };
306 int status, count; 306 int status;
307 307
308 status = -EOPNOTSUPP; 308 status = -EOPNOTSUPP;
309 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 309 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
319 if (S_ISDIR(inode->i_mode)) { 319 if (S_ISDIR(inode->i_mode)) {
320 args.mask |= NFS_DFACL; 320 args.mask |= NFS_DFACL;
321 args.acl_default = dfacl; 321 args.acl_default = dfacl;
322 args.len = nfsacl_size(acl, dfacl);
323 } else
324 args.len = nfsacl_size(acl, NULL);
325
326 if (args.len > NFS_ACL_INLINE_BUFSIZE) {
327 unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
328
329 status = -ENOMEM;
330 do {
331 args.pages[args.npages] = alloc_page(GFP_KERNEL);
332 if (args.pages[args.npages] == NULL)
333 goto out_freepages;
334 args.npages++;
335 } while (args.npages < npages);
322 } 336 }
323 337
324 dprintk("NFS call setacl\n"); 338 dprintk("NFS call setacl\n");
@@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
329 nfs_zap_acl_cache(inode); 343 nfs_zap_acl_cache(inode);
330 dprintk("NFS reply setacl: %d\n", status); 344 dprintk("NFS reply setacl: %d\n", status);
331 345
332 /* pages may have been allocated at the xdr layer. */
333 for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
334 __free_page(args.pages[count]);
335
336 switch (status) { 346 switch (status) {
337 case 0: 347 case 0:
338 status = nfs_refresh_inode(inode, &fattr); 348 status = nfs_refresh_inode(inode, &fattr);
@@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
346 case -ENOTSUPP: 356 case -ENOTSUPP:
347 status = -EOPNOTSUPP; 357 status = -EOPNOTSUPP;
348 } 358 }
359out_freepages:
360 while (args.npages != 0) {
361 args.npages--;
362 __free_page(args.pages[args.npages]);
363 }
349out: 364out:
350 return status; 365 return status;
351} 366}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 11cdddec1432..6cdeacffde46 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -82,8 +82,10 @@
82#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) 82#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2)
83 83
84#define ACL3_getaclargs_sz (NFS3_fh_sz+1) 84#define ACL3_getaclargs_sz (NFS3_fh_sz+1)
85#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) 85#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
86#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) 86 XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
87#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
88 XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
87#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) 89#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
88 90
89/* 91/*
@@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
703 struct nfs3_setaclargs *args) 705 struct nfs3_setaclargs *args)
704{ 706{
705 struct xdr_buf *buf = &req->rq_snd_buf; 707 struct xdr_buf *buf = &req->rq_snd_buf;
706 unsigned int base, len_in_head, len = nfsacl_size( 708 unsigned int base;
707 (args->mask & NFS_ACL) ? args->acl_access : NULL, 709 int err;
708 (args->mask & NFS_DFACL) ? args->acl_default : NULL);
709 int count, err;
710 710
711 p = xdr_encode_fhandle(p, NFS_FH(args->inode)); 711 p = xdr_encode_fhandle(p, NFS_FH(args->inode));
712 *p++ = htonl(args->mask); 712 *p++ = htonl(args->mask);
713 base = (char *)p - (char *)buf->head->iov_base; 713 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
714 /* put as much of the acls into head as possible. */ 714 base = req->rq_slen;
715 len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); 715
716 len -= len_in_head; 716 if (args->npages != 0)
717 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); 717 xdr_encode_pages(buf, args->pages, 0, args->len);
718 718 else
719 for (count = 0; (count << PAGE_SHIFT) < len; count++) { 719 req->rq_slen += args->len;
720 args->pages[count] = alloc_page(GFP_KERNEL);
721 if (!args->pages[count]) {
722 while (count)
723 __free_page(args->pages[--count]);
724 return -ENOMEM;
725 }
726 }
727 xdr_encode_pages(buf, args->pages, 0, len);
728 720
729 err = nfsacl_encode(buf, base, args->inode, 721 err = nfsacl_encode(buf, base, args->inode,
730 (args->mask & NFS_ACL) ? 722 (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 30befc39b3c6..2a2a0a7143ad 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -21,7 +21,9 @@
21#define NFSDBG_FACILITY NFSDBG_VFS 21#define NFSDBG_FACILITY NFSDBG_VFS
22 22
23/* 23/*
24 * Check if fs_root is valid 24 * Convert the NFSv4 pathname components into a standard posix path.
25 *
26 * Note that the resulting string will be placed at the end of the buffer
25 */ 27 */
26static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, 28static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
27 char *buffer, ssize_t buflen) 29 char *buffer, ssize_t buflen)
@@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
99{ 101{
100 struct vfsmount *mnt = ERR_PTR(-ENOENT); 102 struct vfsmount *mnt = ERR_PTR(-ENOENT);
101 char *mnt_path; 103 char *mnt_path;
102 int page2len; 104 unsigned int maxbuflen;
103 unsigned int s; 105 unsigned int s;
104 106
105 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 107 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
106 if (IS_ERR(mnt_path)) 108 if (IS_ERR(mnt_path))
107 return mnt; 109 return mnt;
108 mountdata->mnt_path = mnt_path; 110 mountdata->mnt_path = mnt_path;
109 page2 += strlen(mnt_path) + 1; 111 maxbuflen = mnt_path - 1 - page2;
110 page2len = PAGE_SIZE - strlen(mnt_path) - 1;
111 112
112 for (s = 0; s < location->nservers; s++) { 113 for (s = 0; s < location->nservers; s++) {
113 const struct nfs4_string *buf = &location->servers[s]; 114 const struct nfs4_string *buf = &location->servers[s];
114 struct sockaddr_storage addr; 115 struct sockaddr_storage addr;
115 116
116 if (buf->len <= 0 || buf->len >= PAGE_SIZE) 117 if (buf->len <= 0 || buf->len >= maxbuflen)
117 continue; 118 continue;
118 119
119 mountdata->addr = (struct sockaddr *)&addr; 120 mountdata->addr = (struct sockaddr *)&addr;
@@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
126 continue; 127 continue;
127 nfs_set_port(mountdata->addr, NFS_PORT); 128 nfs_set_port(mountdata->addr, NFS_PORT);
128 129
129 strncpy(page2, buf->data, page2len); 130 memcpy(page2, buf->data, buf->len);
130 page2[page2len] = '\0'; 131 page2[buf->len] = '\0';
131 mountdata->hostname = page2; 132 mountdata->hostname = page2;
132 133
133 snprintf(page, PAGE_SIZE, "%s:%s", 134 snprintf(page, PAGE_SIZE, "%s:%s",
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
new file mode 100644
index 000000000000..44d7d04dab95
--- /dev/null
+++ b/fs/nfsd/Kconfig
@@ -0,0 +1,80 @@
1config NFSD
2 tristate "NFS server support"
3 depends on INET
4 select LOCKD
5 select SUNRPC
6 select EXPORTFS
7 select NFS_ACL_SUPPORT if NFSD_V2_ACL
8 help
9 Choose Y here if you want to allow other computers to access
10 files residing on this system using Sun's Network File System
11 protocol. To compile the NFS server support as a module,
12 choose M here: the module will be called nfsd.
13
14 You may choose to use a user-space NFS server instead, in which
15 case you can choose N here.
16
17 To export local file systems using NFS, you also need to install
18 user space programs which can be found in the Linux nfs-utils
19 package, available from http://linux-nfs.org/. More detail about
20 the Linux NFS server implementation is available via the
21 exports(5) man page.
22
23 Below you can choose which versions of the NFS protocol are
24 available to clients mounting the NFS server on this system.
25 Support for NFS version 2 (RFC 1094) is always available when
26 CONFIG_NFSD is selected.
27
28 If unsure, say N.
29
30config NFSD_V2_ACL
31 bool
32 depends on NFSD
33
34config NFSD_V3
35 bool "NFS server support for NFS version 3"
36 depends on NFSD
37 help
38 This option enables support in your system's NFS server for
39 version 3 of the NFS protocol (RFC 1813).
40
41 If unsure, say Y.
42
43config NFSD_V3_ACL
44 bool "NFS server support for the NFSv3 ACL protocol extension"
45 depends on NFSD_V3
46 select NFSD_V2_ACL
47 help
48 Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
49 never became an official part of the NFS version 3 protocol.
50 This protocol extension allows applications on NFS clients to
51 manipulate POSIX Access Control Lists on files residing on NFS
52 servers. NFS servers enforce POSIX ACLs on local files whether
53 this protocol is available or not.
54
55 This option enables support in your system's NFS server for the
56 NFSv3 ACL protocol extension allowing NFS clients to manipulate
57 POSIX ACLs on files exported by your system's NFS server. NFS
58 clients which support the Solaris NFSv3 ACL protocol can then
59 access and modify ACLs on your NFS server.
60
61 To store ACLs on your NFS server, you also need to enable ACL-
62 related CONFIG options for your local file systems of choice.
63
64 If unsure, say N.
65
66config NFSD_V4
67 bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
68 depends on NFSD && PROC_FS && EXPERIMENTAL
69 select NFSD_V3
70 select FS_POSIX_ACL
71 select RPCSEC_GSS_KRB5
72 help
73 This option enables support in your system's NFS server for
74 version 4 of the NFS protocol (RFC 3530).
75
76 To export files using NFSv4, you need to install additional user
77 space programs which can be found in the Linux nfs-utils package,
78 available from http://linux-nfs.org/.
79
80 If unsure, say N.
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index c903e04aa217..5573508f707f 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -49,6 +49,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
49 new->fsuid = exp->ex_anon_uid; 49 new->fsuid = exp->ex_anon_uid;
50 new->fsgid = exp->ex_anon_gid; 50 new->fsgid = exp->ex_anon_gid;
51 gi = groups_alloc(0); 51 gi = groups_alloc(0);
52 if (!gi)
53 goto oom;
52 } else if (flags & NFSEXP_ROOTSQUASH) { 54 } else if (flags & NFSEXP_ROOTSQUASH) {
53 if (!new->fsuid) 55 if (!new->fsuid)
54 new->fsuid = exp->ex_anon_uid; 56 new->fsuid = exp->ex_anon_uid;
@@ -85,6 +87,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 87 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
86 new->cap_permitted); 88 new->cap_permitted);
87 put_cred(override_creds(new)); 89 put_cred(override_creds(new));
90 put_cred(new);
88 return 0; 91 return 0;
89 92
90oom: 93oom:
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 88db7d3ec120..b6f60f48e94b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2871,7 +2871,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2871 file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; 2871 file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
2872 file_lock.fl_pid = current->tgid; 2872 file_lock.fl_pid = current->tgid;
2873 file_lock.fl_flags = FL_POSIX; 2873 file_lock.fl_flags = FL_POSIX;
2874 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2875 2874
2876 file_lock.fl_start = lockt->lt_offset; 2875 file_lock.fl_start = lockt->lt_offset;
2877 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); 2876 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f65953be39c0..9250067943d8 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2596 [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, 2596 [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
2597 [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, 2597 [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2598 [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, 2598 [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
2599 [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop,
2599 [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, 2600 [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
2600 [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, 2601 [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
2601 [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, 2602 [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6e50aaa56ca2..c165a6403df0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -998,8 +998,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
998 998
999 if (!EX_ISSYNC(exp)) 999 if (!EX_ISSYNC(exp))
1000 stable = 0; 1000 stable = 0;
1001 if (stable && !EX_WGATHER(exp)) 1001 if (stable && !EX_WGATHER(exp)) {
1002 spin_lock(&file->f_lock);
1002 file->f_flags |= O_SYNC; 1003 file->f_flags |= O_SYNC;
1004 spin_unlock(&file->f_lock);
1005 }
1003 1006
1004 /* Write the data. */ 1007 /* Write the data. */
1005 oldfs = get_fs(); set_fs(KERNEL_DS); 1008 oldfs = get_fs(); set_fs(KERNEL_DS);
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..331f2e88e284 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
156 int ret; 156 int ret;
157 157
158 do { 158 do {
159 if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) 159 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
160 return -ENOSPC; 160 return -ENOSPC;
161 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); 161 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
162 } while (ret == -EAGAIN); 162 } while (ret == -EAGAIN);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d53a1838d6e8..bed766e435b5 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -427,10 +427,61 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
427 return ret; 427 return ret;
428} 428}
429 429
430/*
431 * Get an inotify_kernel_event if one exists and is small
432 * enough to fit in "count". Return an error pointer if
433 * not large enough.
434 *
435 * Called with the device ev_mutex held.
436 */
437static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
438 size_t count)
439{
440 size_t event_size = sizeof(struct inotify_event);
441 struct inotify_kernel_event *kevent;
442
443 if (list_empty(&dev->events))
444 return NULL;
445
446 kevent = inotify_dev_get_event(dev);
447 if (kevent->name)
448 event_size += kevent->event.len;
449
450 if (event_size > count)
451 return ERR_PTR(-EINVAL);
452
453 remove_kevent(dev, kevent);
454 return kevent;
455}
456
457/*
458 * Copy an event to user space, returning how much we copied.
459 *
460 * We already checked that the event size is smaller than the
461 * buffer we had in "get_one_event()" above.
462 */
463static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
464 char __user *buf)
465{
466 size_t event_size = sizeof(struct inotify_event);
467
468 if (copy_to_user(buf, &kevent->event, event_size))
469 return -EFAULT;
470
471 if (kevent->name) {
472 buf += event_size;
473
474 if (copy_to_user(buf, kevent->name, kevent->event.len))
475 return -EFAULT;
476
477 event_size += kevent->event.len;
478 }
479 return event_size;
480}
481
430static ssize_t inotify_read(struct file *file, char __user *buf, 482static ssize_t inotify_read(struct file *file, char __user *buf,
431 size_t count, loff_t *pos) 483 size_t count, loff_t *pos)
432{ 484{
433 size_t event_size = sizeof (struct inotify_event);
434 struct inotify_device *dev; 485 struct inotify_device *dev;
435 char __user *start; 486 char __user *start;
436 int ret; 487 int ret;
@@ -440,81 +491,43 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
440 dev = file->private_data; 491 dev = file->private_data;
441 492
442 while (1) { 493 while (1) {
494 struct inotify_kernel_event *kevent;
443 495
444 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 496 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
445 497
446 mutex_lock(&dev->ev_mutex); 498 mutex_lock(&dev->ev_mutex);
447 if (!list_empty(&dev->events)) { 499 kevent = get_one_event(dev, count);
448 ret = 0;
449 break;
450 }
451 mutex_unlock(&dev->ev_mutex); 500 mutex_unlock(&dev->ev_mutex);
452 501
453 if (file->f_flags & O_NONBLOCK) { 502 if (kevent) {
454 ret = -EAGAIN; 503 ret = PTR_ERR(kevent);
455 break; 504 if (IS_ERR(kevent))
456 } 505 break;
457 506 ret = copy_event_to_user(kevent, buf);
458 if (signal_pending(current)) { 507 free_kevent(kevent);
459 ret = -EINTR; 508 if (ret < 0)
460 break; 509 break;
510 buf += ret;
511 count -= ret;
512 continue;
461 } 513 }
462 514
463 schedule(); 515 ret = -EAGAIN;
464 } 516 if (file->f_flags & O_NONBLOCK)
465
466 finish_wait(&dev->wq, &wait);
467 if (ret)
468 return ret;
469
470 while (1) {
471 struct inotify_kernel_event *kevent;
472
473 ret = buf - start;
474 if (list_empty(&dev->events))
475 break; 517 break;
476 518 ret = -EINTR;
477 kevent = inotify_dev_get_event(dev); 519 if (signal_pending(current))
478 if (event_size + kevent->event.len > count) {
479 if (ret == 0 && count > 0) {
480 /*
481 * could not get a single event because we
482 * didn't have enough buffer space.
483 */
484 ret = -EINVAL;
485 }
486 break; 520 break;
487 }
488 remove_kevent(dev, kevent);
489 521
490 /* 522 if (start != buf)
491 * Must perform the copy_to_user outside the mutex in order
492 * to avoid a lock order reversal with mmap_sem.
493 */
494 mutex_unlock(&dev->ev_mutex);
495
496 if (copy_to_user(buf, &kevent->event, event_size)) {
497 ret = -EFAULT;
498 break; 523 break;
499 }
500 buf += event_size;
501 count -= event_size;
502
503 if (kevent->name) {
504 if (copy_to_user(buf, kevent->name, kevent->event.len)){
505 ret = -EFAULT;
506 break;
507 }
508 buf += kevent->event.len;
509 count -= kevent->event.len;
510 }
511
512 free_kevent(kevent);
513 524
514 mutex_lock(&dev->ev_mutex); 525 schedule();
515 } 526 }
516 mutex_unlock(&dev->ev_mutex);
517 527
528 finish_wait(&dev->wq, &wait);
529 if (start != buf && ret != -EFAULT)
530 ret = buf - start;
518 return ret; 531 return ret;
519} 532}
520 533
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 000000000000..f5a868cc9152
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
1config NTFS_FS
2 tristate "NTFS file system support"
3 select NLS
4 help
5 NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
6
7 Saying Y or M here enables read support. There is partial, but
8 safe, write support available. For write support you must also
9 say Y to "NTFS write support" below.
10
11 There are also a number of user-space tools available, called
12 ntfsprogs. These include ntfsundelete and ntfsresize, that work
13 without NTFS support enabled in the kernel.
14
15 This is a rewrite from scratch of Linux NTFS support and replaced
16 the old NTFS code starting with Linux 2.5.11. A backport to
17 the Linux 2.4 kernel series is separately available as a patch
18 from the project web site.
19
20 For more information see <file:Documentation/filesystems/ntfs.txt>
21 and <http://www.linux-ntfs.org/>.
22
23 To compile this file system support as a module, choose M here: the
24 module will be called ntfs.
25
26 If you are not using Windows NT, 2000, XP or 2003 in addition to
27 Linux on your computer it is safe to say N.
28
29config NTFS_DEBUG
30 bool "NTFS debugging support"
31 depends on NTFS_FS
32 help
33 If you are experiencing any problems with the NTFS file system, say
34 Y here. This will result in additional consistency checks to be
35 performed by the driver as well as additional debugging messages to
36 be written to the system log. Note that debugging messages are
37 disabled by default. To enable them, supply the option debug_msgs=1
38 at the kernel command line when booting the kernel or as an option
39 to insmod when loading the ntfs module. Once the driver is active,
40 you can enable debugging messages by doing (as root):
41 echo 1 > /proc/sys/fs/ntfs-debug
42 Replacing the "1" with "0" would disable debug messages.
43
44 If you leave debugging messages disabled, this results in little
45 overhead, but enabling debug messages results in very significant
46 slowdown of the system.
47
48 When reporting bugs, please try to have available a full dump of
49 debugging messages while the misbehaviour was occurring.
50
51config NTFS_RW
52 bool "NTFS write support"
53 depends on NTFS_FS
54 help
55 This enables the partial, but safe, write support in the NTFS driver.
56
57 The only supported operation is overwriting existing files, without
58 changing the file length. No file or directory creation, deletion or
59 renaming is possible. Note only non-resident files can be written to
60 so you may find that some very small files (<500 bytes or so) cannot
61 be written to.
62
63 While we cannot guarantee that it will not damage any data, we have
64 so far not received a single report where the driver would have
65 damaged someones data so we assume it is perfectly safe to use.
66
67 Note: While write support is safe in this version (a rewrite from
68 scratch of the NTFS support), it should be noted that the old NTFS
69 write support, included in Linux 2.5.10 and before (since 1997),
70 is not safe.
71
72 This is currently useful with TopologiLinux. TopologiLinux is run
73 on top of any DOS/Microsoft Windows system without partitioning your
74 hard disk. Unlike other Linux distributions TopologiLinux does not
75 need its own partition. For more information see
76 <http://topologi-linux.sourceforge.net/>
77
78 It is perfectly safe to say N here.
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 000000000000..701b7a3a872e
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,85 @@
1config OCFS2_FS
2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS
4 select CONFIGFS_FS
5 select JBD2
6 select CRC32
7 select QUOTA
8 select QUOTA_TREE
9 help
10 OCFS2 is a general purpose extent based shared disk cluster file
11 system with many similarities to ext3. It supports 64 bit inode
12 numbers, and has automatically extending metadata groups which may
13 also make it attractive for non-clustered use.
14
15 You'll want to install the ocfs2-tools package in order to at least
16 get "mount.ocfs2".
17
18 Project web page: http://oss.oracle.com/projects/ocfs2
19 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
20 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
21
22 For more information on OCFS2, see the file
23 <file:Documentation/filesystems/ocfs2.txt>.
24
25config OCFS2_FS_O2CB
26 tristate "O2CB Kernelspace Clustering"
27 depends on OCFS2_FS
28 default y
29 help
30 OCFS2 includes a simple kernelspace clustering package, the OCFS2
31 Cluster Base. It only requires a very small userspace component
32 to configure it. This comes with the standard ocfs2-tools package.
33 O2CB is limited to maintaining a cluster for OCFS2 file systems.
34 It cannot manage any other cluster applications.
35
36 It is always safe to say Y here, as the clustering method is
37 run-time selectable.
38
39config OCFS2_FS_USERSPACE_CLUSTER
40 tristate "OCFS2 Userspace Clustering"
41 depends on OCFS2_FS && DLM
42 default y
43 help
44 This option will allow OCFS2 to use userspace clustering services
45 in conjunction with the DLM in fs/dlm. If you are using a
46 userspace cluster manager, say Y here.
47
48 It is safe to say Y, as the clustering method is run-time
49 selectable.
50
51config OCFS2_FS_STATS
52 bool "OCFS2 statistics"
53 depends on OCFS2_FS
54 default y
55 help
56 This option allows some fs statistics to be captured. Enabling
57 this option may increase the memory consumption.
58
59config OCFS2_DEBUG_MASKLOG
60 bool "OCFS2 logging support"
61 depends on OCFS2_FS
62 default y
63 help
64 The ocfs2 filesystem has an extensive logging system. The system
65 allows selection of events to log via files in /sys/o2cb/logmask/.
66 This option will enlarge your kernel, but it allows debugging of
67 ocfs2 filesystem issues.
68
69config OCFS2_DEBUG_FS
70 bool "OCFS2 expensive checks"
71 depends on OCFS2_FS
72 default n
73 help
74 This option will enable expensive consistency checks. Enable
75 this option for debugging only as it is likely to decrease
76 performance of the filesystem.
77
78config OCFS2_FS_POSIX_ACL
79 bool "OCFS2 POSIX Access Control Lists"
80 depends on OCFS2_FS
81 select FS_POSIX_ACL
82 default n
83 help
84 Posix Access Control Lists (ACLs) support permissions for users and
85 groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d861096c9d81..19e3a96aa02c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
176 176
177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 (OCFS2_I(inode)->ip_clusters != rec->e_cpos), 179 (OCFS2_I(inode)->ip_clusters !=
180 le32_to_cpu(rec->e_cpos)),
180 "Device %s, asking for sparse allocation: inode %llu, " 181 "Device %s, asking for sparse allocation: inode %llu, "
181 "cpos %u, clusters %u\n", 182 "cpos %u, clusters %u\n",
182 osb->dev_str, 183 osb->dev_str,
@@ -4796,6 +4797,29 @@ out:
4796 return ret; 4797 return ret;
4797} 4798}
4798 4799
4800static int ocfs2_replace_extent_rec(struct inode *inode,
4801 handle_t *handle,
4802 struct ocfs2_path *path,
4803 struct ocfs2_extent_list *el,
4804 int split_index,
4805 struct ocfs2_extent_rec *split_rec)
4806{
4807 int ret;
4808
4809 ret = ocfs2_path_bh_journal_access(handle, inode, path,
4810 path_num_items(path) - 1);
4811 if (ret) {
4812 mlog_errno(ret);
4813 goto out;
4814 }
4815
4816 el->l_recs[split_index] = *split_rec;
4817
4818 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4819out:
4820 return ret;
4821}
4822
4799/* 4823/*
4800 * Mark part or all of the extent record at split_index in the leaf 4824 * Mark part or all of the extent record at split_index in the leaf
4801 * pointed to by path as written. This removes the unwritten 4825 * pointed to by path as written. This removes the unwritten
@@ -4885,7 +4909,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4885 4909
4886 if (ctxt.c_contig_type == CONTIG_NONE) { 4910 if (ctxt.c_contig_type == CONTIG_NONE) {
4887 if (ctxt.c_split_covers_rec) 4911 if (ctxt.c_split_covers_rec)
4888 el->l_recs[split_index] = *split_rec; 4912 ret = ocfs2_replace_extent_rec(inode, handle,
4913 path, el,
4914 split_index, split_rec);
4889 else 4915 else
4890 ret = ocfs2_split_and_insert(inode, handle, path, et, 4916 ret = ocfs2_split_and_insert(inode, handle, path, et,
4891 &last_eb_bh, split_index, 4917 &last_eb_bh, split_index,
@@ -5390,6 +5416,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
5390 goto out; 5416 goto out;
5391 } 5417 }
5392 5418
5419 vfs_dq_free_space_nodirty(inode,
5420 ocfs2_clusters_to_bytes(inode->i_sb, len));
5421
5393 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5422 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5394 dealloc); 5423 dealloc);
5395 if (ret) { 5424 if (ret) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a067a6cffb01..8e1709a679b7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
227 size = i_size_read(inode); 227 size = i_size_read(inode);
228 228
229 if (size > PAGE_CACHE_SIZE || 229 if (size > PAGE_CACHE_SIZE ||
230 size > ocfs2_max_inline_data(inode->i_sb)) { 230 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
231 ocfs2_error(inode->i_sb, 231 ocfs2_error(inode->i_sb,
232 "Inode %llu has with inline data has bad size: %Lu", 232 "Inode %llu has with inline data has bad size: %Lu",
233 (unsigned long long)OCFS2_I(inode)->ip_blkno, 233 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1555 int ret, written = 0; 1555 int ret, written = 0;
1556 loff_t end = pos + len; 1556 loff_t end = pos + len;
1557 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1557 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1558 struct ocfs2_dinode *di = NULL;
1558 1559
1559 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", 1560 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
1560 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, 1561 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
@@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1587 /* 1588 /*
1588 * Check whether the write can fit. 1589 * Check whether the write can fit.
1589 */ 1590 */
1590 if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) 1591 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1592 if (mmap_page ||
1593 end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
1591 return 0; 1594 return 0;
1592 1595
1593do_inline_write: 1596do_inline_write:
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e88..e9d7c2038c0f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
38#include "dlmglue.h" 38#include "dlmglue.h"
39#include "file.h" 39#include "file.h"
40#include "inode.h" 40#include "inode.h"
41#include "super.h"
41 42
42 43
43static int ocfs2_dentry_revalidate(struct dentry *dentry, 44static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
294 return ret; 295 return ret;
295} 296}
296 297
298static DEFINE_SPINLOCK(dentry_list_lock);
299
300/* We limit the number of dentry locks to drop in one go. We have
301 * this limit so that we don't starve other users of ocfs2_wq. */
302#define DL_INODE_DROP_COUNT 64
303
304/* Drop inode references from dentry locks */
305void ocfs2_drop_dl_inodes(struct work_struct *work)
306{
307 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
308 dentry_lock_work);
309 struct ocfs2_dentry_lock *dl;
310 int drop_count = DL_INODE_DROP_COUNT;
311
312 spin_lock(&dentry_list_lock);
313 while (osb->dentry_lock_list && drop_count--) {
314 dl = osb->dentry_lock_list;
315 osb->dentry_lock_list = dl->dl_next;
316 spin_unlock(&dentry_list_lock);
317 iput(dl->dl_inode);
318 kfree(dl);
319 spin_lock(&dentry_list_lock);
320 }
321 if (osb->dentry_lock_list)
322 queue_work(ocfs2_wq, &osb->dentry_lock_work);
323 spin_unlock(&dentry_list_lock);
324}
325
297/* 326/*
298 * ocfs2_dentry_iput() and friends. 327 * ocfs2_dentry_iput() and friends.
299 * 328 *
@@ -318,16 +347,23 @@ out_attach:
318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 347static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
319 struct ocfs2_dentry_lock *dl) 348 struct ocfs2_dentry_lock *dl)
320{ 349{
321 iput(dl->dl_inode);
322 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 350 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
323 ocfs2_lock_res_free(&dl->dl_lockres); 351 ocfs2_lock_res_free(&dl->dl_lockres);
324 kfree(dl); 352
353 /* We leave dropping of inode reference to ocfs2_wq as that can
354 * possibly lead to inode deletion which gets tricky */
355 spin_lock(&dentry_list_lock);
356 if (!osb->dentry_lock_list)
357 queue_work(ocfs2_wq, &osb->dentry_lock_work);
358 dl->dl_next = osb->dentry_lock_list;
359 osb->dentry_lock_list = dl;
360 spin_unlock(&dentry_list_lock);
325} 361}
326 362
327void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 363void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
328 struct ocfs2_dentry_lock *dl) 364 struct ocfs2_dentry_lock *dl)
329{ 365{
330 int unlock = 0; 366 int unlock;
331 367
332 BUG_ON(dl->dl_count == 0); 368 BUG_ON(dl->dl_count == 0);
333 369
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d9883..d06e16c06640 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
29extern struct dentry_operations ocfs2_dentry_ops; 29extern struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */
32 unsigned int dl_count; 33 unsigned int dl_count;
33 u64 dl_parent_blkno; 34 union {
35 /* Linked list of dentry locks to release */
36 struct ocfs2_dentry_lock *dl_next;
37 u64 dl_parent_blkno;
38 };
34 39
35 /* 40 /*
36 * The ocfs2_dentry_lock keeps an inode reference until 41 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
47void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 52void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
48 struct ocfs2_dentry_lock *dl); 53 struct ocfs2_dentry_lock *dl);
49 54
55void ocfs2_drop_dl_inodes(struct work_struct *work);
56
50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, 57struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
51 int skip_unhashed); 58 int skip_unhashed);
52 59
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 54e182a27caf..0a2813947853 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1849 if (!mle) { 1849 if (!mle) {
1850 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && 1850 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1851 res->owner != assert->node_idx) { 1851 res->owner != assert->node_idx) {
1852 mlog(ML_ERROR, "assert_master from " 1852 mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1853 "%u, but current owner is " 1853 "but current owner is %u! (%.*s)\n",
1854 "%u! (%.*s)\n", 1854 assert->node_idx, res->owner, namelen,
1855 assert->node_idx, res->owner, 1855 name);
1856 namelen, name); 1856 __dlm_print_one_lock_resource(res);
1857 goto kill; 1857 BUG();
1858 } 1858 }
1859 } else if (mle->type != DLM_MLE_MIGRATION) { 1859 } else if (mle->type != DLM_MLE_MIGRATION) {
1860 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1860 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d1295203029f..4060bb328bc8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
181 181
182 spin_lock(&res->spinlock); 182 spin_lock(&res->spinlock);
183 /* This ensures that clear refmap is sent after the set */ 183 /* This ensures that clear refmap is sent after the set */
184 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | 184 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
185 DLM_LOCK_RES_MIGRATING));
186 spin_unlock(&res->spinlock); 185 spin_unlock(&res->spinlock);
187 186
188 /* clear our bit from the master's refmap, ignore errors */ 187 /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 86ca085ef324..fcf879ed6930 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
117 else 117 else
118 BUG_ON(res->owner == dlm->node_num); 118 BUG_ON(res->owner == dlm->node_num);
119 119
120 spin_lock(&dlm->spinlock); 120 spin_lock(&dlm->ast_lock);
121 /* We want to be sure that we're not freeing a lock 121 /* We want to be sure that we're not freeing a lock
122 * that still has AST's pending... */ 122 * that still has AST's pending... */
123 in_use = !list_empty(&lock->ast_list); 123 in_use = !list_empty(&lock->ast_list);
124 spin_unlock(&dlm->spinlock); 124 spin_unlock(&dlm->ast_lock);
125 if (in_use) { 125 if (in_use) {
126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " 126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
127 "while waiting for an ast!", res->lockname.len, 127 "while waiting for an ast!", res->lockname.len,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b0c4cadd4c45..7219a86d34cc 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
320 struct ocfs2_lock_res *lockres); 320 struct ocfs2_lock_res *lockres);
321static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 321static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
322 int convert); 322 int convert);
323#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ 323#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
324 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ 324 if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \
325 _err, _func, _lockres->l_name); \ 325 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
326 _err, _func, _lockres->l_name); \
327 else \
328 mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \
329 _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \
330 (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \
326} while (0) 331} while (0)
327static int ocfs2_downconvert_thread(void *arg); 332static int ocfs2_downconvert_thread(void *arg);
328static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, 333static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -2860,6 +2865,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
2860 case OCFS2_UNLOCK_CANCEL_CONVERT: 2865 case OCFS2_UNLOCK_CANCEL_CONVERT:
2861 mlog(0, "Cancel convert success for %s\n", lockres->l_name); 2866 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2862 lockres->l_action = OCFS2_AST_INVALID; 2867 lockres->l_action = OCFS2_AST_INVALID;
2868 /* Downconvert thread may have requeued this lock, we
2869 * need to wake it. */
2870 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2871 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
2863 break; 2872 break;
2864 case OCFS2_UNLOCK_DROP_LOCK: 2873 case OCFS2_UNLOCK_DROP_LOCK:
2865 lockres->l_level = DLM_LOCK_IV; 2874 lockres->l_level = DLM_LOCK_IV;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307c..172850a9a12a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
513static inline int ocfs2_begin_ordered_truncate(struct inode *inode, 513static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
514 loff_t new_size) 514 loff_t new_size)
515{ 515{
516 return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, 516 return jbd2_journal_begin_ordered_truncate(
517 new_size); 517 OCFS2_SB(inode->i_sb)->journal->j_journal,
518 &OCFS2_I(inode)->ip_jinode,
519 new_size);
518} 520}
519 521
520#endif /* OCFS2_JOURNAL_H */ 522#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 084aba86c3b2..4b11762f249e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
532 532
533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
534 534
535 fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); 535 fe->id2.i_data.id_count = cpu_to_le16(
536 ocfs2_max_inline_data_with_xattr(osb->sb, fe));
536 } else { 537 } else {
537 fel = &fe->id2.i_list; 538 fel = &fe->id2.i_list;
538 fel->l_tree_depth = 0; 539 fel->l_tree_depth = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ad5c24a29edd..946d3c34b90b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -210,6 +210,7 @@ struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery; 212struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock;
213struct ocfs2_super 214struct ocfs2_super
214{ 215{
215 struct task_struct *commit_task; 216 struct task_struct *commit_task;
@@ -325,6 +326,11 @@ struct ocfs2_super
325 struct list_head blocked_lock_list; 326 struct list_head blocked_lock_list;
326 unsigned long blocked_lock_count; 327 unsigned long blocked_lock_count;
327 328
329 /* List of dentry locks to release. Anyone can add locks to
330 * the list, ocfs2_wq processes the list */
331 struct ocfs2_dentry_lock *dentry_lock_list;
332 struct work_struct dentry_lock_work;
333
328 wait_queue_head_t osb_mount_event; 334 wait_queue_head_t osb_mount_event;
329 335
330 /* Truncate log info */ 336 /* Truncate log info */
@@ -335,6 +341,9 @@ struct ocfs2_super
335 struct ocfs2_node_map osb_recovering_orphan_dirs; 341 struct ocfs2_node_map osb_recovering_orphan_dirs;
336 unsigned int *osb_orphan_wipes; 342 unsigned int *osb_orphan_wipes;
337 wait_queue_head_t osb_wipe_event; 343 wait_queue_head_t osb_wipe_event;
344
345 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock;
338}; 347};
339 348
340#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c7ae45aaa36c..2332ef740f4f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
1070 offsetof(struct ocfs2_dinode, id2.i_symlink); 1070 offsetof(struct ocfs2_dinode, id2.i_symlink);
1071} 1071}
1072 1072
1073static inline int ocfs2_max_inline_data(struct super_block *sb)
1074{
1075 return sb->s_blocksize -
1076 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1077}
1078
1079static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, 1073static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
1080 struct ocfs2_dinode *di) 1074 struct ocfs2_dinode *di)
1081{ 1075{
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 6aff8f2d3e49..1ed0f7c86869 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
754 if (dquot->dq_flags & mask) 754 if (dquot->dq_flags & mask)
755 sync = 1; 755 sync = 1;
756 spin_unlock(&dq_data_lock); 756 spin_unlock(&dq_data_lock);
757 if (!sync) { 757 /* This is a slight hack but we can't afford getting global quota
758 * lock if we already have a transaction started. */
759 if (!sync || journal_current_handle()) {
758 status = ocfs2_write_dquot(dquot); 760 status = ocfs2_write_dquot(dquot);
759 goto out; 761 goto out;
760 } 762 }
@@ -810,171 +812,6 @@ out:
810 return status; 812 return status;
811} 813}
812 814
813/* This is difficult. We have to lock quota inode and start transaction
814 * in this function but we don't want to take the penalty of exlusive
815 * quota file lock when we are just going to use cached structures. So
816 * we just take read lock check whether we have dquot cached and if so,
817 * we don't have to take the write lock... */
818static int ocfs2_dquot_initialize(struct inode *inode, int type)
819{
820 handle_t *handle = NULL;
821 int status = 0;
822 struct super_block *sb = inode->i_sb;
823 struct ocfs2_mem_dqinfo *oinfo;
824 int exclusive = 0;
825 int cnt;
826 qid_t id;
827
828 mlog_entry_void();
829
830 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
831 if (type != -1 && cnt != type)
832 continue;
833 if (!sb_has_quota_active(sb, cnt))
834 continue;
835 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
836 status = ocfs2_lock_global_qf(oinfo, 0);
837 if (status < 0)
838 goto out;
839 /* This is just a performance optimization not a reliable test.
840 * Since we hold an inode lock, noone can actually release
841 * the structure until we are finished with initialization. */
842 if (inode->i_dquot[cnt] != NODQUOT) {
843 ocfs2_unlock_global_qf(oinfo, 0);
844 continue;
845 }
846 /* When we have inode lock, we know that no dquot_release() can
847 * run and thus we can safely check whether we need to
848 * read+modify global file to get quota information or whether
849 * our node already has it. */
850 if (cnt == USRQUOTA)
851 id = inode->i_uid;
852 else if (cnt == GRPQUOTA)
853 id = inode->i_gid;
854 else
855 BUG();
856 /* Obtain exclusion from quota off... */
857 down_write(&sb_dqopt(sb)->dqptr_sem);
858 exclusive = !dquot_is_cached(sb, id, cnt);
859 up_write(&sb_dqopt(sb)->dqptr_sem);
860 if (exclusive) {
861 status = ocfs2_lock_global_qf(oinfo, 1);
862 if (status < 0) {
863 exclusive = 0;
864 mlog_errno(status);
865 goto out_ilock;
866 }
867 handle = ocfs2_start_trans(OCFS2_SB(sb),
868 ocfs2_calc_qinit_credits(sb, cnt));
869 if (IS_ERR(handle)) {
870 status = PTR_ERR(handle);
871 mlog_errno(status);
872 goto out_ilock;
873 }
874 }
875 dquot_initialize(inode, cnt);
876 if (exclusive) {
877 ocfs2_commit_trans(OCFS2_SB(sb), handle);
878 ocfs2_unlock_global_qf(oinfo, 1);
879 }
880 ocfs2_unlock_global_qf(oinfo, 0);
881 }
882 mlog_exit(0);
883 return 0;
884out_ilock:
885 if (exclusive)
886 ocfs2_unlock_global_qf(oinfo, 1);
887 ocfs2_unlock_global_qf(oinfo, 0);
888out:
889 mlog_exit(status);
890 return status;
891}
892
893static int ocfs2_dquot_drop_slow(struct inode *inode)
894{
895 int status = 0;
896 int cnt;
897 int got_lock[MAXQUOTAS] = {0, 0};
898 handle_t *handle;
899 struct super_block *sb = inode->i_sb;
900 struct ocfs2_mem_dqinfo *oinfo;
901
902 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
903 if (!sb_has_quota_active(sb, cnt))
904 continue;
905 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
906 status = ocfs2_lock_global_qf(oinfo, 1);
907 if (status < 0)
908 goto out;
909 got_lock[cnt] = 1;
910 }
911 handle = ocfs2_start_trans(OCFS2_SB(sb),
912 ocfs2_calc_qinit_credits(sb, USRQUOTA) +
913 ocfs2_calc_qinit_credits(sb, GRPQUOTA));
914 if (IS_ERR(handle)) {
915 status = PTR_ERR(handle);
916 mlog_errno(status);
917 goto out;
918 }
919 dquot_drop(inode);
920 ocfs2_commit_trans(OCFS2_SB(sb), handle);
921out:
922 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
923 if (got_lock[cnt]) {
924 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
925 ocfs2_unlock_global_qf(oinfo, 1);
926 }
927 return status;
928}
929
930/* See the comment before ocfs2_dquot_initialize. */
931static int ocfs2_dquot_drop(struct inode *inode)
932{
933 int status = 0;
934 struct super_block *sb = inode->i_sb;
935 struct ocfs2_mem_dqinfo *oinfo;
936 int exclusive = 0;
937 int cnt;
938 int got_lock[MAXQUOTAS] = {0, 0};
939
940 mlog_entry_void();
941 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
942 if (!sb_has_quota_active(sb, cnt))
943 continue;
944 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
945 status = ocfs2_lock_global_qf(oinfo, 0);
946 if (status < 0)
947 goto out;
948 got_lock[cnt] = 1;
949 }
950 /* Lock against anyone releasing references so that when when we check
951 * we know we are not going to be last ones to release dquot */
952 down_write(&sb_dqopt(sb)->dqptr_sem);
953 /* Urgh, this is a terrible hack :( */
954 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
955 if (inode->i_dquot[cnt] != NODQUOT &&
956 atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
957 exclusive = 1;
958 break;
959 }
960 }
961 if (!exclusive)
962 dquot_drop_locked(inode);
963 up_write(&sb_dqopt(sb)->dqptr_sem);
964out:
965 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
966 if (got_lock[cnt]) {
967 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
968 ocfs2_unlock_global_qf(oinfo, 0);
969 }
970 /* In case we bailed out because we had to do expensive locking
971 * do it now... */
972 if (exclusive)
973 status = ocfs2_dquot_drop_slow(inode);
974 mlog_exit(status);
975 return status;
976}
977
978static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) 815static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
979{ 816{
980 struct ocfs2_dquot *dquot = 817 struct ocfs2_dquot *dquot =
@@ -991,8 +828,8 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
991} 828}
992 829
993struct dquot_operations ocfs2_quota_operations = { 830struct dquot_operations ocfs2_quota_operations = {
994 .initialize = ocfs2_dquot_initialize, 831 .initialize = dquot_initialize,
995 .drop = ocfs2_dquot_drop, 832 .drop = dquot_drop,
996 .alloc_space = dquot_alloc_space, 833 .alloc_space = dquot_alloc_space,
997 .alloc_inode = dquot_alloc_inode, 834 .alloc_inode = dquot_alloc_inode,
998 .free_space = dquot_free_space, 835 .free_space = dquot_free_space,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 43ed11345b59..7ac83a81ee55 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb,
1537 unlock_buffer(*bh); 1537 unlock_buffer(*bh);
1538 ll_rw_block(READ, 1, bh); 1538 ll_rw_block(READ, 1, bh);
1539 wait_on_buffer(*bh); 1539 wait_on_buffer(*bh);
1540 if (!buffer_uptodate(*bh)) {
1541 mlog_errno(-EIO);
1542 brelse(*bh);
1543 *bh = NULL;
1544 return -EIO;
1545 }
1546
1540 return 0; 1547 return 0;
1541} 1548}
1542 1549
@@ -1747,6 +1754,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1747 INIT_LIST_HEAD(&osb->blocked_lock_list); 1754 INIT_LIST_HEAD(&osb->blocked_lock_list);
1748 osb->blocked_lock_count = 0; 1755 osb->blocked_lock_count = 0;
1749 spin_lock_init(&osb->osb_lock); 1756 spin_lock_init(&osb->osb_lock);
1757 spin_lock_init(&osb->osb_xattr_lock);
1750 ocfs2_init_inode_steal_slot(osb); 1758 ocfs2_init_inode_steal_slot(osb);
1751 1759
1752 atomic_set(&osb->alloc_stats.moves, 0); 1760 atomic_set(&osb->alloc_stats.moves, 0);
@@ -1887,6 +1895,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1887 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 1895 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
1888 journal->j_state = OCFS2_JOURNAL_FREE; 1896 journal->j_state = OCFS2_JOURNAL_FREE;
1889 1897
1898 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
1899 osb->dentry_lock_list = NULL;
1900
1890 /* get some pseudo constants for clustersize bits */ 1901 /* get some pseudo constants for clustersize bits */
1891 osb->s_clustersize_bits = 1902 osb->s_clustersize_bits =
1892 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 1903 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e1d638af6ac3..2563df89fc2a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt {
82 82
83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
84#define OCFS2_XATTR_INLINE_SIZE 80 84#define OCFS2_XATTR_INLINE_SIZE 80
85#define OCFS2_XATTR_HEADER_GAP 4
85#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ 86#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
86 - sizeof(struct ocfs2_xattr_header) \ 87 - sizeof(struct ocfs2_xattr_header) \
87 - sizeof(__u32)) 88 - OCFS2_XATTR_HEADER_GAP)
88#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ 89#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
89 - sizeof(struct ocfs2_xattr_block) \ 90 - sizeof(struct ocfs2_xattr_block) \
90 - sizeof(struct ocfs2_xattr_header) \ 91 - sizeof(struct ocfs2_xattr_header) \
91 - sizeof(__u32)) 92 - OCFS2_XATTR_HEADER_GAP)
92 93
93static struct ocfs2_xattr_def_value_root def_xv = { 94static struct ocfs2_xattr_def_value_root def_xv = {
94 .xv.xr_list.l_count = cpu_to_le16(1), 95 .xv.xr_list.l_count = cpu_to_le16(1),
@@ -274,10 +275,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
274 bucket->bu_blocks, bucket->bu_bhs, 0, 275 bucket->bu_blocks, bucket->bu_bhs, 0,
275 NULL); 276 NULL);
276 if (!rc) { 277 if (!rc) {
278 spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
277 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, 279 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
278 bucket->bu_bhs, 280 bucket->bu_bhs,
279 bucket->bu_blocks, 281 bucket->bu_blocks,
280 &bucket_xh(bucket)->xh_check); 282 &bucket_xh(bucket)->xh_check);
283 spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
281 if (rc) 284 if (rc)
282 mlog_errno(rc); 285 mlog_errno(rc);
283 } 286 }
@@ -310,9 +313,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
310{ 313{
311 int i; 314 int i;
312 315
316 spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
313 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, 317 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
314 bucket->bu_bhs, bucket->bu_blocks, 318 bucket->bu_bhs, bucket->bu_blocks,
315 &bucket_xh(bucket)->xh_check); 319 &bucket_xh(bucket)->xh_check);
320 spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
316 321
317 for (i = 0; i < bucket->bu_blocks; i++) 322 for (i = 0; i < bucket->bu_blocks; i++)
318 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); 323 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
@@ -542,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir,
542 * when blocksize = 512, may reserve one more cluser for 547 * when blocksize = 512, may reserve one more cluser for
543 * xattr bucket, otherwise reserve one metadata block 548 * xattr bucket, otherwise reserve one metadata block
544 * for them is ok. 549 * for them is ok.
550 * If this is a new directory with inline data,
551 * we choose to reserve the entire inline area for
552 * directory contents and force an external xattr block.
545 */ 553 */
546 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
547 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
548 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
549 if (ret) { 558 if (ret) {
@@ -1507,7 +1516,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1507 last += 1; 1516 last += 1;
1508 } 1517 }
1509 1518
1510 free = min_offs - ((void *)last - xs->base) - sizeof(__u32); 1519 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
1511 if (free < 0) 1520 if (free < 0)
1512 return -EIO; 1521 return -EIO;
1513 1522
@@ -2190,7 +2199,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2190 last += 1; 2199 last += 1;
2191 } 2200 }
2192 2201
2193 free = min_offs - ((void *)last - xs->base) - sizeof(__u32); 2202 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
2194 if (free < 0) 2203 if (free < 0)
2195 return 0; 2204 return 0;
2196 2205
@@ -2592,8 +2601,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2592 2601
2593 if (!ret) { 2602 if (!ret) {
2594 /* Update inode ctime. */ 2603 /* Update inode ctime. */
2595 ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, 2604 ret = ocfs2_journal_access_di(ctxt->handle, inode,
2596 OCFS2_JOURNAL_ACCESS_WRITE); 2605 xis->inode_bh,
2606 OCFS2_JOURNAL_ACCESS_WRITE);
2597 if (ret) { 2607 if (ret) {
2598 mlog_errno(ret); 2608 mlog_errno(ret);
2599 goto out; 2609 goto out;
@@ -4729,13 +4739,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4729 vb.vb_xv = (struct ocfs2_xattr_value_root *) 4739 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4730 (vb.vb_bh->b_data + offset % blocksize); 4740 (vb.vb_bh->b_data + offset % blocksize);
4731 4741
4732 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4733 OCFS2_JOURNAL_ACCESS_WRITE);
4734 if (ret) {
4735 mlog_errno(ret);
4736 goto out;
4737 }
4738
4739 /* 4742 /*
4740 * From here on out we have to dirty the bucket. The generic 4743 * From here on out we have to dirty the bucket. The generic
4741 * value calls only modify one of the bucket's bhs, but we need 4744 * value calls only modify one of the bucket's bhs, but we need
@@ -4748,12 +4751,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4748 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); 4751 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4749 if (ret) { 4752 if (ret) {
4750 mlog_errno(ret); 4753 mlog_errno(ret);
4751 goto out_dirty; 4754 goto out;
4755 }
4756
4757 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4758 OCFS2_JOURNAL_ACCESS_WRITE);
4759 if (ret) {
4760 mlog_errno(ret);
4761 goto out;
4752 } 4762 }
4753 4763
4754 xe->xe_value_size = cpu_to_le64(len); 4764 xe->xe_value_size = cpu_to_le64(len);
4755 4765
4756out_dirty:
4757 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); 4766 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4758 4767
4759out: 4768out:
@@ -4786,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4786 char *val, 4795 char *val,
4787 int value_len) 4796 int value_len)
4788{ 4797{
4789 int offset; 4798 int ret, offset, block_off;
4790 struct ocfs2_xattr_value_root *xv; 4799 struct ocfs2_xattr_value_root *xv;
4791 struct ocfs2_xattr_entry *xe = xs->here; 4800 struct ocfs2_xattr_entry *xe = xs->here;
4801 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4802 void *base;
4792 4803
4793 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); 4804 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4794 4805
4795 offset = le16_to_cpu(xe->xe_name_offset) + 4806 ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
4796 OCFS2_XATTR_SIZE(xe->xe_name_len); 4807 xe - xh->xh_entries,
4808 &block_off,
4809 &offset);
4810 if (ret) {
4811 mlog_errno(ret);
4812 goto out;
4813 }
4797 4814
4798 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4815 base = bucket_block(xs->bucket, block_off);
4816 xv = (struct ocfs2_xattr_value_root *)(base + offset +
4817 OCFS2_XATTR_SIZE(xe->xe_name_len));
4799 4818
4800 return __ocfs2_xattr_set_value_outside(inode, handle, 4819 ret = __ocfs2_xattr_set_value_outside(inode, handle,
4801 xv, val, value_len); 4820 xv, val, value_len);
4821 if (ret)
4822 mlog_errno(ret);
4823out:
4824 return ret;
4802} 4825}
4803 4826
4804static int ocfs2_rm_xattr_cluster(struct inode *inode, 4827static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -5061,8 +5084,8 @@ try_again:
5061 xh_free_start = le16_to_cpu(xh->xh_free_start); 5084 xh_free_start = le16_to_cpu(xh->xh_free_start);
5062 header_size = sizeof(struct ocfs2_xattr_header) + 5085 header_size = sizeof(struct ocfs2_xattr_header) +
5063 count * sizeof(struct ocfs2_xattr_entry); 5086 count * sizeof(struct ocfs2_xattr_entry);
5064 max_free = OCFS2_XATTR_BUCKET_SIZE - 5087 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5065 le16_to_cpu(xh->xh_name_value_len) - header_size; 5088 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5066 5089
5067 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " 5090 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5068 "of %u which exceed block size\n", 5091 "of %u which exceed block size\n",
@@ -5095,7 +5118,7 @@ try_again:
5095 need = 0; 5118 need = 0;
5096 } 5119 }
5097 5120
5098 free = xh_free_start - header_size; 5121 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5099 /* 5122 /*
5100 * We need to make sure the new name/value pair 5123 * We need to make sure the new name/value pair
5101 * can exist in the same block. 5124 * can exist in the same block.
@@ -5128,7 +5151,8 @@ try_again:
5128 } 5151 }
5129 5152
5130 xh_free_start = le16_to_cpu(xh->xh_free_start); 5153 xh_free_start = le16_to_cpu(xh->xh_free_start);
5131 free = xh_free_start - header_size; 5154 free = xh_free_start - header_size
5155 - OCFS2_XATTR_HEADER_GAP;
5132 if (xh_free_start % blocksize < need) 5156 if (xh_free_start % blocksize < need)
5133 free -= xh_free_start % blocksize; 5157 free -= xh_free_start % blocksize;
5134 5158
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
new file mode 100644
index 000000000000..b1b9a0aba6fd
--- /dev/null
+++ b/fs/omfs/Kconfig
@@ -0,0 +1,13 @@
1config OMFS_FS
2 tristate "SonicBlue Optimized MPEG File System support"
3 depends on BLOCK
4 select CRC_ITU_T
5 help
6 This is the proprietary file system used by the Rio Karma music
7 player and ReplayTV DVR. Despite the name, this filesystem is not
8 more efficient than a standard FS for MPEG files, in fact likely
9 the opposite is true. Say Y if you have either of these devices
10 and wish to mount its disk.
11
12 To compile this file system support as a module, choose M here: the
13 module will be called omfs. If unsure, say N.
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d720243f5f4..38e337d51ced 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -400,7 +400,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
400 pdev->devt = devt; 400 pdev->devt = devt;
401 401
402 /* delay uevent until 'holders' subdir is created */ 402 /* delay uevent until 'holders' subdir is created */
403 pdev->uevent_suppress = 1; 403 dev_set_uevent_suppress(pdev, 1);
404 err = device_add(pdev); 404 err = device_add(pdev);
405 if (err) 405 if (err)
406 goto out_put; 406 goto out_put;
@@ -410,7 +410,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
410 if (!p->holder_dir) 410 if (!p->holder_dir)
411 goto out_del; 411 goto out_del;
412 412
413 pdev->uevent_suppress = 0; 413 dev_set_uevent_suppress(pdev, 0);
414 if (flags & ADDPART_FLAG_WHOLEDISK) { 414 if (flags & ADDPART_FLAG_WHOLEDISK) {
415 err = device_create_file(pdev, &dev_attr_whole_disk); 415 err = device_create_file(pdev, &dev_attr_whole_disk);
416 if (err) 416 if (err)
@@ -422,7 +422,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
422 rcu_assign_pointer(ptbl->part[partno], p); 422 rcu_assign_pointer(ptbl->part[partno], p);
423 423
424 /* suppress uevent if the disk supresses it */ 424 /* suppress uevent if the disk supresses it */
425 if (!ddev->uevent_suppress) 425 if (!dev_get_uevent_suppress(pdev))
426 kobject_uevent(&pdev->kobj, KOBJ_ADD); 426 kobject_uevent(&pdev->kobj, KOBJ_ADD);
427 427
428 return p; 428 return p;
@@ -455,7 +455,7 @@ void register_disk(struct gendisk *disk)
455 dev_set_name(ddev, disk->disk_name); 455 dev_set_name(ddev, disk->disk_name);
456 456
457 /* delay uevents, until we scanned partition table */ 457 /* delay uevents, until we scanned partition table */
458 ddev->uevent_suppress = 1; 458 dev_set_uevent_suppress(ddev, 1);
459 459
460 if (device_add(ddev)) 460 if (device_add(ddev))
461 return; 461 return;
@@ -490,7 +490,7 @@ void register_disk(struct gendisk *disk)
490 490
491exit: 491exit:
492 /* announce disk after possible partitions are created */ 492 /* announce disk after possible partitions are created */
493 ddev->uevent_suppress = 0; 493 dev_set_uevent_suppress(ddev, 0);
494 kobject_uevent(&ddev->kobj, KOBJ_ADD); 494 kobject_uevent(&ddev->kobj, KOBJ_ADD);
495 495
496 /* announce possible partitions */ 496 /* announce possible partitions */
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 1e064c4a4f86..46297683cd34 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -21,20 +21,38 @@
21 * compute the block number from a 21 * compute the block number from a
22 * cyl-cyl-head-head structure 22 * cyl-cyl-head-head structure
23 */ 23 */
24static inline int 24static sector_t
25cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) { 25cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) {
26 return ptr->cc * geo->heads * geo->sectors + 26
27 ptr->hh * geo->sectors; 27 sector_t cyl;
28 __u16 head;
29
30 /*decode cylinder and heads for large volumes */
31 cyl = ptr->hh & 0xFFF0;
32 cyl <<= 12;
33 cyl |= ptr->cc;
34 head = ptr->hh & 0x000F;
35 return cyl * geo->heads * geo->sectors +
36 head * geo->sectors;
28} 37}
29 38
30/* 39/*
31 * compute the block number from a 40 * compute the block number from a
32 * cyl-cyl-head-head-block structure 41 * cyl-cyl-head-head-block structure
33 */ 42 */
34static inline int 43static sector_t
35cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { 44cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
36 return ptr->cc * geo->heads * geo->sectors + 45
37 ptr->hh * geo->sectors + 46 sector_t cyl;
47 __u16 head;
48
49 /*decode cylinder and heads for large volumes */
50 cyl = ptr->hh & 0xFFF0;
51 cyl <<= 12;
52 cyl |= ptr->cc;
53 head = ptr->hh & 0x000F;
54 return cyl * geo->heads * geo->sectors +
55 head * geo->sectors +
38 ptr->b; 56 ptr->b;
39} 57}
40 58
@@ -43,14 +61,15 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
43int 61int
44ibm_partition(struct parsed_partitions *state, struct block_device *bdev) 62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
45{ 63{
46 int blocksize, offset, size,res; 64 int blocksize, res;
47 loff_t i_size; 65 loff_t i_size, offset, size, fmt_size;
48 dasd_information2_t *info; 66 dasd_information2_t *info;
49 struct hd_geometry *geo; 67 struct hd_geometry *geo;
50 char type[5] = {0,}; 68 char type[5] = {0,};
51 char name[7] = {0,}; 69 char name[7] = {0,};
52 union label_t { 70 union label_t {
53 struct vtoc_volume_label vol; 71 struct vtoc_volume_label_cdl vol;
72 struct vtoc_volume_label_ldl lnx;
54 struct vtoc_cms_label cms; 73 struct vtoc_cms_label cms;
55 } *label; 74 } *label;
56 unsigned char *data; 75 unsigned char *data;
@@ -85,14 +104,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
85 if (data == NULL) 104 if (data == NULL)
86 goto out_readerr; 105 goto out_readerr;
87 106
88 strncpy (type, data, 4);
89 if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD")))
90 strncpy(name, data + 8, 6);
91 else
92 strncpy(name, data + 4, 6);
93 memcpy(label, data, sizeof(union label_t)); 107 memcpy(label, data, sizeof(union label_t));
94 put_dev_sector(sect); 108 put_dev_sector(sect);
95 109
110 if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
111 strncpy(type, label->vol.vollbl, 4);
112 strncpy(name, label->vol.volid, 6);
113 } else {
114 strncpy(type, label->lnx.vollbl, 4);
115 strncpy(name, label->lnx.volid, 6);
116 }
96 EBCASC(type, 4); 117 EBCASC(type, 4);
97 EBCASC(name, 6); 118 EBCASC(name, 6);
98 119
@@ -110,36 +131,54 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
110 /* 131 /*
111 * VM style CMS1 labeled disk 132 * VM style CMS1 labeled disk
112 */ 133 */
134 blocksize = label->cms.block_size;
113 if (label->cms.disk_offset != 0) { 135 if (label->cms.disk_offset != 0) {
114 printk("CMS1/%8s(MDSK):", name); 136 printk("CMS1/%8s(MDSK):", name);
115 /* disk is reserved minidisk */ 137 /* disk is reserved minidisk */
116 blocksize = label->cms.block_size;
117 offset = label->cms.disk_offset; 138 offset = label->cms.disk_offset;
118 size = (label->cms.block_count - 1) 139 size = (label->cms.block_count - 1)
119 * (blocksize >> 9); 140 * (blocksize >> 9);
120 } else { 141 } else {
121 printk("CMS1/%8s:", name); 142 printk("CMS1/%8s:", name);
122 offset = (info->label_block + 1); 143 offset = (info->label_block + 1);
123 size = i_size >> 9; 144 size = label->cms.block_count
145 * (blocksize >> 9);
124 } 146 }
147 put_partition(state, 1, offset*(blocksize >> 9),
148 size-offset*(blocksize >> 9));
125 } else { 149 } else {
126 /* 150 if (strncmp(type, "LNX1", 4) == 0) {
127 * Old style LNX1 or unlabeled disk 151 printk("LNX1/%8s:", name);
128 */ 152 if (label->lnx.ldl_version == 0xf2) {
129 if (strncmp(type, "LNX1", 4) == 0) 153 fmt_size = label->lnx.formatted_blocks
130 printk ("LNX1/%8s:", name); 154 * (blocksize >> 9);
131 else 155 } else if (!strcmp(info->type, "ECKD")) {
156 /* formated w/o large volume support */
157 fmt_size = geo->cylinders * geo->heads
158 * geo->sectors * (blocksize >> 9);
159 } else {
160 /* old label and no usable disk geometry
161 * (e.g. DIAG) */
162 fmt_size = i_size >> 9;
163 }
164 size = i_size >> 9;
165 if (fmt_size < size)
166 size = fmt_size;
167 offset = (info->label_block + 1);
168 } else {
169 /* unlabeled disk */
132 printk("(nonl)"); 170 printk("(nonl)");
133 offset = (info->label_block + 1); 171 size = i_size >> 9;
134 size = i_size >> 9; 172 offset = (info->label_block + 1);
135 } 173 }
136 put_partition(state, 1, offset*(blocksize >> 9), 174 put_partition(state, 1, offset*(blocksize >> 9),
137 size-offset*(blocksize >> 9)); 175 size-offset*(blocksize >> 9));
176 }
138 } else if (info->format == DASD_FORMAT_CDL) { 177 } else if (info->format == DASD_FORMAT_CDL) {
139 /* 178 /*
140 * New style CDL formatted disk 179 * New style CDL formatted disk
141 */ 180 */
142 unsigned int blk; 181 sector_t blk;
143 int counter; 182 int counter;
144 183
145 /* 184 /*
@@ -166,7 +205,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
166 /* skip FMT4 / FMT5 / FMT7 labels */ 205 /* skip FMT4 / FMT5 / FMT7 labels */
167 if (f1.DS1FMTID == _ascebc['4'] 206 if (f1.DS1FMTID == _ascebc['4']
168 || f1.DS1FMTID == _ascebc['5'] 207 || f1.DS1FMTID == _ascebc['5']
169 || f1.DS1FMTID == _ascebc['7']) { 208 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) {
170 blk++; 210 blk++;
171 data = read_dev_sector(bdev, blk * 211 data = read_dev_sector(bdev, blk *
172 (blocksize/512), 212 (blocksize/512),
@@ -174,8 +214,9 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
174 continue; 214 continue;
175 } 215 }
176 216
177 /* only FMT1 valid at this point */ 217 /* only FMT1 and 8 labels valid at this point */
178 if (f1.DS1FMTID != _ascebc['1']) 218 if (f1.DS1FMTID != _ascebc['1'] &&
219 f1.DS1FMTID != _ascebc['8'])
179 break; 220 break;
180 221
181 /* OK, we got valid partition data */ 222 /* OK, we got valid partition data */
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a48ba5179d5..94ad15967cf9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -667,10 +667,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)
667 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); 667 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
668 mutex_unlock(&inode->i_mutex); 668 mutex_unlock(&inode->i_mutex);
669 669
670 if (retval < 0) 670 return retval;
671 return retval;
672
673 return 0;
674} 671}
675 672
676 673
@@ -684,10 +681,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)
684 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); 681 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
685 mutex_unlock(&inode->i_mutex); 682 mutex_unlock(&inode->i_mutex);
686 683
687 if (retval < 0) 684 return retval;
688 return retval;
689
690 return 0;
691} 685}
692 686
693 687
@@ -699,18 +693,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
699 int retval; 693 int retval;
700 694
701 mutex_lock(&inode->i_mutex); 695 mutex_lock(&inode->i_mutex);
702
703 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 696 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
704 697 if (retval >= 0) {
705 if (retval >= 0)
706 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 698 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
707 699 if (retval < 0) /* this can happen only if on == T */
700 fasync_helper(-1, filp, 0, &pipe->fasync_readers);
701 }
708 mutex_unlock(&inode->i_mutex); 702 mutex_unlock(&inode->i_mutex);
709 703 return retval;
710 if (retval < 0)
711 return retval;
712
713 return 0;
714} 704}
715 705
716 706
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0c9de19a1633..beaa0ce3b82e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3066,7 +3066,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3066 int retval = -ENOENT; 3066 int retval = -ENOENT;
3067 ino_t ino; 3067 ino_t ino;
3068 int tid; 3068 int tid;
3069 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */
3070 struct pid_namespace *ns; 3069 struct pid_namespace *ns;
3071 3070
3072 task = get_proc_task(inode); 3071 task = get_proc_task(inode);
@@ -3083,18 +3082,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3083 goto out_no_task; 3082 goto out_no_task;
3084 retval = 0; 3083 retval = 0;
3085 3084
3086 switch (pos) { 3085 switch ((unsigned long)filp->f_pos) {
3087 case 0: 3086 case 0:
3088 ino = inode->i_ino; 3087 ino = inode->i_ino;
3089 if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) 3088 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3090 goto out; 3089 goto out;
3091 pos++; 3090 filp->f_pos++;
3092 /* fall through */ 3091 /* fall through */
3093 case 1: 3092 case 1:
3094 ino = parent_ino(dentry); 3093 ino = parent_ino(dentry);
3095 if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) 3094 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3096 goto out; 3095 goto out;
3097 pos++; 3096 filp->f_pos++;
3098 /* fall through */ 3097 /* fall through */
3099 } 3098 }
3100 3099
@@ -3104,9 +3103,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3104 ns = filp->f_dentry->d_sb->s_fs_info; 3103 ns = filp->f_dentry->d_sb->s_fs_info;
3105 tid = (int)filp->f_version; 3104 tid = (int)filp->f_version;
3106 filp->f_version = 0; 3105 filp->f_version = 0;
3107 for (task = first_tid(leader, tid, pos - 2, ns); 3106 for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
3108 task; 3107 task;
3109 task = next_tid(task), pos++) { 3108 task = next_tid(task), filp->f_pos++) {
3110 tid = task_pid_nr_ns(task, ns); 3109 tid = task_pid_nr_ns(task, ns);
3111 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { 3110 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
3112 /* returning this tgid failed, save it as the first 3111 /* returning this tgid failed, save it as the first
@@ -3117,7 +3116,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3117 } 3116 }
3118 } 3117 }
3119out: 3118out:
3120 filp->f_pos = pos;
3121 put_task_struct(leader); 3119 put_task_struct(leader);
3122out_no_task: 3120out_no_task:
3123 return retval; 3121 return retval;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9b3ad6..d8bb5c671f42 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
485 } 485 }
486 } 486 }
487 unlock_new_inode(inode); 487 unlock_new_inode(inode);
488 } else 488 } else {
489 module_put(de->owner); 489 module_put(de->owner);
490 de_put(de);
491 }
490 return inode; 492 return inode;
491 493
492out_ino: 494out_ino:
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 767d95a6d1b1..e9983837d08d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = {
80#define KPF_RECLAIM 9 80#define KPF_RECLAIM 9
81#define KPF_BUDDY 10 81#define KPF_BUDDY 10
82 82
83#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) 83#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
84 84
85static ssize_t kpageflags_read(struct file *file, char __user *buf, 85static ssize_t kpageflags_read(struct file *file, char __user *buf,
86 size_t count, loff_t *ppos) 86 size_t count, loff_t *ppos)
@@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
107 else 107 else
108 kflags = ppage->flags; 108 kflags = ppage->flags;
109 109
110 uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | 110 uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
111 kpf_copy_bit(kflags, KPF_ERROR, PG_error) | 111 kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
112 kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | 112 kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
113 kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | 113 kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
new file mode 100644
index 000000000000..be8e0e1445b6
--- /dev/null
+++ b/fs/qnx4/Kconfig
@@ -0,0 +1,25 @@
1config QNX4FS_FS
2 tristate "QNX4 file system support (read only)"
3 depends on BLOCK
4 help
5 This is the file system used by the real-time operating systems
6 QNX 4 and QNX 6 (the latter is also called QNX RTP).
7 Further information is available at <http://www.qnx.com/>.
8 Say Y if you intend to mount QNX hard disks or floppies.
9 Unless you say Y to "QNX4FS read-write support" below, you will
10 only be able to read these file systems.
11
12 To compile this file system support as a module, choose M here: the
13 module will be called qnx4.
14
15 If you don't know whether you need it, then you don't need it:
16 answer N.
17
18config QNX4FS_RW
19 bool "QNX4FS write support (DANGEROUS)"
20 depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
21 help
22 Say Y if you want to test write support for QNX4 file systems.
23
24 It's currently broken, so for now:
25 answer N.
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b9b567a28376..5d7c7ececa64 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
114 if (!pagevec_add(&lru_pvec, page)) 114 if (!pagevec_add(&lru_pvec, page))
115 __pagevec_lru_add_file(&lru_pvec); 115 __pagevec_lru_add_file(&lru_pvec);
116 116
117 /* prevent the page from being discarded on memory pressure */
118 SetPageDirty(page);
119
117 unlock_page(page); 120 unlock_page(page);
118 } 121 }
119 122
@@ -126,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
126 return -EFBIG; 129 return -EFBIG;
127 130
128 add_error: 131 add_error:
132 pagevec_lru_add_file(&lru_pvec);
129 page_cache_release(pages + loop); 133 page_cache_release(pages + loop);
130 for (loop++; loop < npages; loop++) 134 for (loop++; loop < npages; loop++)
131 __free_page(pages + loop); 135 __free_page(pages + loop);
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
new file mode 100644
index 000000000000..949b8c6addc8
--- /dev/null
+++ b/fs/reiserfs/Kconfig
@@ -0,0 +1,85 @@
1config REISERFS_FS
2 tristate "Reiserfs support"
3 help
4 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling.
6
7 Balanced trees are more efficient than traditional file system
8 architectural foundations.
9
10 In general, ReiserFS is as fast as ext2, but is very efficient with
11 large directories and small files. Additional patches are needed
12 for NFS and quotas, please see <http://www.namesys.com/> for links.
13
14 It is more easily extended to have features currently found in
15 database and keyword search systems than block allocation based file
16 systems are. The next version will be so extended, and will support
17 plugins consistent with our motto ``It takes more than a license to
18 make source code open.''
19
20 Read <http://www.namesys.com/> to learn more about reiserfs.
21
22 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
23
24 If you like it, you can pay us to add new features to it that you
25 need, buy a support contract, or pay us to port it to another OS.
26
27config REISERFS_CHECK
28 bool "Enable reiserfs debug mode"
29 depends on REISERFS_FS
30 help
31 If you set this to Y, then ReiserFS will perform every check it can
32 possibly imagine of its internal consistency throughout its
33 operation. It will also go substantially slower. More than once we
34 have forgotten that this was on, and then gone despondent over the
35 latest benchmarks.:-) Use of this option allows our team to go all
36 out in checking for consistency when debugging without fear of its
37 effect on end users. If you are on the verge of sending in a bug
38 report, say Y and you might get a useful error message. Almost
39 everyone should say N.
40
41config REISERFS_PROC_INFO
42 bool "Stats in /proc/fs/reiserfs"
43 depends on REISERFS_FS && PROC_FS
44 help
45 Create under /proc/fs/reiserfs a hierarchy of files, displaying
46 various ReiserFS statistics and internal data at the expense of
47 making your kernel or module slightly larger (+8 KB). This also
48 increases the amount of kernel memory required for each mount.
49 Almost everyone but ReiserFS developers and people fine-tuning
50 reiserfs or tracing problems should say N.
51
52config REISERFS_FS_XATTR
53 bool "ReiserFS extended attributes"
54 depends on REISERFS_FS
55 help
56 Extended attributes are name:value pairs associated with inodes by
57 the kernel or by users (see the attr(5) manual page, or visit
58 <http://acl.bestbits.at/> for details).
59
60 If unsure, say N.
61
62config REISERFS_FS_POSIX_ACL
63 bool "ReiserFS POSIX Access Control Lists"
64 depends on REISERFS_FS_XATTR
65 select FS_POSIX_ACL
66 help
67 Posix Access Control Lists (ACLs) support permissions for users and
68 groups beyond the owner/group/world scheme.
69
70 To learn more about Access Control Lists, visit the Posix ACLs for
71 Linux website <http://acl.bestbits.at/>.
72
73 If you don't know what Access Control Lists are, say N
74
75config REISERFS_FS_SECURITY
76 bool "ReiserFS Security Labels"
77 depends on REISERFS_FS_XATTR
78 help
79 Security labels support alternative access control models
80 implemented by security modules like SELinux. This option
81 enables an extended attribute handler for file security
82 labels in the ReiserFS filesystem.
83
84 If you are not using a security module that requires using
85 extended attributes for file security labels, say N.
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
new file mode 100644
index 000000000000..1a17020f9faf
--- /dev/null
+++ b/fs/romfs/Kconfig
@@ -0,0 +1,16 @@
1config ROMFS_FS
2 tristate "ROM file system support"
3 depends on BLOCK
4 ---help---
5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for
7 other read-only media as well. Read
8 <file:Documentation/filesystems/romfs.txt> for details.
9
10 To compile this file system support as a module, choose M here: the
11 module will be called romfs. Note that the file system of your
12 root partition (the one containing the directory /) cannot be a
13 module.
14
15 If you don't know whether you need it, then you don't need it:
16 answer N.
diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1c4dc8..a1a4cfe19210 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,12 +48,78 @@ int seq_open(struct file *file, const struct seq_operations *op)
48 */ 48 */
49 file->f_version = 0; 49 file->f_version = 0;
50 50
51 /* SEQ files support lseek, but not pread/pwrite */ 51 /*
52 file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); 52 * seq_files support lseek() and pread(). They do not implement
53 * write() at all, but we clear FMODE_PWRITE here for historical
54 * reasons.
55 *
56 * If a client of seq_files a) implements file.write() and b) wishes to
57 * support pwrite() then that client will need to implement its own
58 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
59 */
60 file->f_mode &= ~FMODE_PWRITE;
53 return 0; 61 return 0;
54} 62}
55EXPORT_SYMBOL(seq_open); 63EXPORT_SYMBOL(seq_open);
56 64
65static int traverse(struct seq_file *m, loff_t offset)
66{
67 loff_t pos = 0, index;
68 int error = 0;
69 void *p;
70
71 m->version = 0;
72 index = 0;
73 m->count = m->from = 0;
74 if (!offset) {
75 m->index = index;
76 return 0;
77 }
78 if (!m->buf) {
79 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
80 if (!m->buf)
81 return -ENOMEM;
82 }
83 p = m->op->start(m, &index);
84 while (p) {
85 error = PTR_ERR(p);
86 if (IS_ERR(p))
87 break;
88 error = m->op->show(m, p);
89 if (error < 0)
90 break;
91 if (unlikely(error)) {
92 error = 0;
93 m->count = 0;
94 }
95 if (m->count == m->size)
96 goto Eoverflow;
97 if (pos + m->count > offset) {
98 m->from = offset - pos;
99 m->count -= m->from;
100 m->index = index;
101 break;
102 }
103 pos += m->count;
104 m->count = 0;
105 if (pos == offset) {
106 index++;
107 m->index = index;
108 break;
109 }
110 p = m->op->next(m, p, &index);
111 }
112 m->op->stop(m, p);
113 m->index = index;
114 return error;
115
116Eoverflow:
117 m->op->stop(m, p);
118 kfree(m->buf);
119 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
120 return !m->buf ? -ENOMEM : -EAGAIN;
121}
122
57/** 123/**
58 * seq_read - ->read() method for sequential files. 124 * seq_read - ->read() method for sequential files.
59 * @file: the file to read from 125 * @file: the file to read from
@@ -73,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
73 int err = 0; 139 int err = 0;
74 140
75 mutex_lock(&m->lock); 141 mutex_lock(&m->lock);
142
143 /* Don't assume *ppos is where we left it */
144 if (unlikely(*ppos != m->read_pos)) {
145 m->read_pos = *ppos;
146 while ((err = traverse(m, *ppos)) == -EAGAIN)
147 ;
148 if (err) {
149 /* With prejudice... */
150 m->read_pos = 0;
151 m->version = 0;
152 m->index = 0;
153 m->count = 0;
154 goto Done;
155 }
156 }
157
76 /* 158 /*
77 * seq_file->op->..m_start/m_stop/m_next may do special actions 159 * seq_file->op->..m_start/m_stop/m_next may do special actions
78 * or optimisations based on the file->f_version, so we want to 160 * or optimisations based on the file->f_version, so we want to
@@ -172,8 +254,10 @@ Fill:
172Done: 254Done:
173 if (!copied) 255 if (!copied)
174 copied = err; 256 copied = err;
175 else 257 else {
176 *ppos += copied; 258 *ppos += copied;
259 m->read_pos += copied;
260 }
177 file->f_version = m->version; 261 file->f_version = m->version;
178 mutex_unlock(&m->lock); 262 mutex_unlock(&m->lock);
179 return copied; 263 return copied;
@@ -186,63 +270,6 @@ Efault:
186} 270}
187EXPORT_SYMBOL(seq_read); 271EXPORT_SYMBOL(seq_read);
188 272
189static int traverse(struct seq_file *m, loff_t offset)
190{
191 loff_t pos = 0, index;
192 int error = 0;
193 void *p;
194
195 m->version = 0;
196 index = 0;
197 m->count = m->from = 0;
198 if (!offset) {
199 m->index = index;
200 return 0;
201 }
202 if (!m->buf) {
203 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
204 if (!m->buf)
205 return -ENOMEM;
206 }
207 p = m->op->start(m, &index);
208 while (p) {
209 error = PTR_ERR(p);
210 if (IS_ERR(p))
211 break;
212 error = m->op->show(m, p);
213 if (error < 0)
214 break;
215 if (unlikely(error)) {
216 error = 0;
217 m->count = 0;
218 }
219 if (m->count == m->size)
220 goto Eoverflow;
221 if (pos + m->count > offset) {
222 m->from = offset - pos;
223 m->count -= m->from;
224 m->index = index;
225 break;
226 }
227 pos += m->count;
228 m->count = 0;
229 if (pos == offset) {
230 index++;
231 m->index = index;
232 break;
233 }
234 p = m->op->next(m, p, &index);
235 }
236 m->op->stop(m, p);
237 return error;
238
239Eoverflow:
240 m->op->stop(m, p);
241 kfree(m->buf);
242 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
243 return !m->buf ? -ENOMEM : -EAGAIN;
244}
245
246/** 273/**
247 * seq_lseek - ->llseek() method for sequential files. 274 * seq_lseek - ->llseek() method for sequential files.
248 * @file: the file in question 275 * @file: the file in question
@@ -265,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
265 if (offset < 0) 292 if (offset < 0)
266 break; 293 break;
267 retval = offset; 294 retval = offset;
268 if (offset != file->f_pos) { 295 if (offset != m->read_pos) {
269 while ((retval=traverse(m, offset)) == -EAGAIN) 296 while ((retval=traverse(m, offset)) == -EAGAIN)
270 ; 297 ;
271 if (retval) { 298 if (retval) {
272 /* with extreme prejudice... */ 299 /* with extreme prejudice... */
273 file->f_pos = 0; 300 file->f_pos = 0;
301 m->read_pos = 0;
274 m->version = 0; 302 m->version = 0;
275 m->index = 0; 303 m->index = 0;
276 m->count = 0; 304 m->count = 0;
277 } else { 305 } else {
306 m->read_pos = offset;
278 retval = file->f_pos = offset; 307 retval = file->f_pos = offset;
279 } 308 }
280 } 309 }
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
new file mode 100644
index 000000000000..e668127c8b2e
--- /dev/null
+++ b/fs/smbfs/Kconfig
@@ -0,0 +1,55 @@
1config SMB_FS
2 tristate "SMB file system support (OBSOLETE, please use CIFS)"
3 depends on INET
4 select NLS
5 help
6 SMB (Server Message Block) is the protocol Windows for Workgroups
7 (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
8 files and printers over local networks. Saying Y here allows you to
9 mount their file systems (often called "shares" in this context) and
10 access them just like any other Unix directory. Currently, this
11 works only if the Windows machines use TCP/IP as the underlying
12 transport protocol, and not NetBEUI. For details, read
13 <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>.
15
16 Note: if you just want your box to act as an SMB *server* and make
17 files and printing services available to Windows clients (which need
18 to have a TCP/IP stack), you don't need to say Y here; you can use
19 the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
20 for that.
21
22 General information about how to connect Linux, Windows machines and
23 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
24
25 To compile the SMB support as a module, choose M here:
26 the module will be called smbfs. Most people say N, however.
27
28config SMB_NLS_DEFAULT
29 bool "Use a default NLS"
30 depends on SMB_FS
31 help
32 Enabling this will make smbfs use nls translations by default. You
33 need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
34 settings and you need to give the default nls for the SMB server as
35 CONFIG_SMB_NLS_REMOTE.
36
37 The nls settings can be changed at mount time, if your smbmount
38 supports that, using the codepage and iocharset parameters.
39
40 smbmount from samba 2.2.0 or later supports this.
41
42config SMB_NLS_REMOTE
43 string "Default Remote NLS Option"
44 depends on SMB_NLS_DEFAULT
45 default "cp437"
46 help
47 This setting allows you to specify a default value for which
48 codepage the server uses. If this field is left blank no
49 translations will be done by default. The local codepage/charset
50 default to CONFIG_NLS_DEFAULT.
51
52 The nls settings can be changed at mount time, if your smbmount
53 supports that, using the codepage and iocharset parameters.
54
55 smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
new file mode 100644
index 000000000000..25a00d19d686
--- /dev/null
+++ b/fs/squashfs/Kconfig
@@ -0,0 +1,51 @@
1config SQUASHFS
2 tristate "SquashFS 4.0 - Squashed file system support"
3 depends on BLOCK
4 select ZLIB_INFLATE
5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib compression to compress both
9 files, inodes and directories. Inodes in the system are very small
10 and all blocks are packed to minimise data overhead. Block sizes
11 greater than 4K are supported up to a maximum of 1 Mbytes (default
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
13 (larger than 4GB), full uid/gid information, hard links and
14 timestamps.
15
16 Squashfs is intended for general read-only filesystem use, for
17 archival use (i.e. in cases where a .tar.gz file may be used), and in
18 embedded systems where low overhead is needed. Further information
19 and tools are available from http://squashfs.sourceforge.net.
20
21 If you want to compile this as a module ( = code which can be
22 inserted in and removed from the running kernel whenever you want),
23 say M here and read <file:Documentation/modules.txt>. The module
24 will be called squashfs. Note that the root file system (the one
25 containing the directory /) cannot be compiled as a module.
26
27 If unsure, say N.
28
29config SQUASHFS_EMBEDDED
30
31 bool "Additional option for memory-constrained systems"
32 depends on SQUASHFS
33 default n
34 help
35 Saying Y here allows you to specify cache size.
36
37 If unsure, say N.
38
39config SQUASHFS_FRAGMENT_CACHE_SIZE
40 int "Number of fragments cached" if SQUASHFS_EMBEDDED
41 depends on SQUASHFS
42 default "3"
43 help
44 By default SquashFS caches the last 3 fragments read from
45 the filesystem. Increasing this amount may mean SquashFS
46 has to re-read fragments less often from disk, at the expense
47 of extra system memory. Decreasing this amount will mean
48 SquashFS uses less memory at the expense of extra reads from disk.
49
50 Note there must be at least one cached fragment. Anything
51 much more than three will probably not make much difference.
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc2b3c6..2a7960310349 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
80 * generated a larger block - this does occasionally happen with zlib). 80 * generated a larger block - this does occasionally happen with zlib).
81 */ 81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, 82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength) 83 int length, u64 *next_index, int srclength, int pages)
84{ 84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info; 85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh; 86 struct buffer_head **bh;
@@ -184,7 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
184 offset = 0; 184 offset = 0;
185 } 185 }
186 186
187 if (msblk->stream.avail_out == 0) { 187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++]; 188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE; 189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 } 190 }
@@ -201,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
201 zlib_init = 1; 201 zlib_init = 1;
202 } 202 }
203 203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); 204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205 205
206 if (msblk->stream.avail_in == 0 && k < b) 206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]); 207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK); 208 } while (zlib_err == Z_OK);
209 209
210 if (zlib_err != Z_STREAM_END) { 210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result" 211 ERROR("zlib_inflate error, data probably corrupt\n");
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex; 212 goto release_mutex;
217 } 213 }
218 214
219 zlib_err = zlib_inflateEnd(&msblk->stream); 215 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) { 216 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x," 217 ERROR("zlib_inflate error, data probably corrupt\n");
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex; 218 goto release_mutex;
224 } 219 }
225 length = msblk->stream.total_out; 220 length = msblk->stream.total_out;
@@ -268,7 +263,8 @@ block_release:
268 put_bh(bh[k]); 263 put_bh(bh[k]);
269 264
270read_failure: 265read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index); 266 ERROR("squashfs_read_data failed to read block 0x%llx\n",
267 (unsigned long long) index);
272 kfree(bh); 268 kfree(bh);
273 return -EIO; 269 return -EIO;
274} 270}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda16d25e..1c4739e33af6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
119 119
120 entry->length = squashfs_read_data(sb, entry->data, 120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index, 121 block, length, &entry->next_index,
122 cache->block_size); 122 cache->block_size, cache->pages);
123 123
124 spin_lock(&cache->lock); 124 spin_lock(&cache->lock);
125 125
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) 406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer; 407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length | 408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); 409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
410 kfree(data); 410 kfree(data);
411 return res; 411 return res;
412} 412}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398bb855..9101dbde39ec 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
133 type = le16_to_cpu(sqshb_ino->inode_type); 133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) { 134 switch (type) {
135 case SQUASHFS_REG_TYPE: { 135 case SQUASHFS_REG_TYPE: {
136 unsigned int frag_offset, frag_size, frag; 136 unsigned int frag_offset, frag;
137 int frag_size;
137 u64 frag_blk; 138 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; 139 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139 140
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
175 break; 176 break;
176 } 177 }
177 case SQUASHFS_LREG_TYPE: { 178 case SQUASHFS_LREG_TYPE: {
178 unsigned int frag_offset, frag_size, frag; 179 unsigned int frag_offset, frag;
180 int frag_size;
179 u64 frag_blk; 181 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; 182 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181 183
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d027d5..0e9feb6adf7e 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
34 34
35/* block.c */ 35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int); 37 int, int);
38 38
39/* cache.c */ 39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int); 40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b5b491..681ec0d83799 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void)
389 return err; 389 return err;
390 } 390 }
391 391
392 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " 392 printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
393 "Phillip Lougher\n"); 393 "Phillip Lougher\n");
394 394
395 return 0; 395 return 0;
diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a0..dd4acb158b5e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
82 * lock ordering than usbfs: 82 * lock ordering than usbfs:
83 */ 83 */
84 lockdep_set_class(&s->s_lock, &type->s_lock_key); 84 lockdep_set_class(&s->s_lock, &type->s_lock_key);
85 down_write(&s->s_umount); 85 /*
86 * sget() can have s_umount recursion.
87 *
88 * When it cannot find a suitable sb, it allocates a new
89 * one (this one), and tries again to find a suitable old
90 * one.
91 *
92 * In case that succeeds, it will acquire the s_umount
 93 * lock of the old one. Since these are clearly distinct
94 * locks, and this object isn't exposed yet, there's no
95 * risk of deadlocks.
96 *
97 * Annotate this by putting this lock in a different
98 * subclass.
99 */
100 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
86 s->s_count = S_BIAS; 101 s->s_count = S_BIAS;
87 atomic_set(&s->s_active, 1); 102 atomic_set(&s->s_active, 1);
88 mutex_init(&s->s_vfs_rename_mutex); 103 mutex_init(&s->s_vfs_rename_mutex);
@@ -301,7 +316,7 @@ void generic_shutdown_super(struct super_block *sb)
301 /* 316 /*
302 * wait for asynchronous fs operations to finish before going further 317 * wait for asynchronous fs operations to finish before going further
303 */ 318 */
304 async_synchronize_full_special(&sb->s_async_list); 319 async_synchronize_full_domain(&sb->s_async_list);
305 320
306 /* bad name - it should be evict_inodes() */ 321 /* bad name - it should be evict_inodes() */
307 invalidate_inodes(sb); 322 invalidate_inodes(sb);
@@ -356,8 +371,10 @@ retry:
356 continue; 371 continue;
357 if (!grab_super(old)) 372 if (!grab_super(old))
358 goto retry; 373 goto retry;
359 if (s) 374 if (s) {
375 up_write(&s->s_umount);
360 destroy_super(s); 376 destroy_super(s);
377 }
361 return old; 378 return old;
362 } 379 }
363 } 380 }
@@ -372,6 +389,7 @@ retry:
372 err = set(s, data); 389 err = set(s, data);
373 if (err) { 390 if (err) {
374 spin_unlock(&sb_lock); 391 spin_unlock(&sb_lock);
392 up_write(&s->s_umount);
375 destroy_super(s); 393 destroy_super(s);
376 return ERR_PTR(err); 394 return ERR_PTR(err);
377 } 395 }
@@ -470,7 +488,7 @@ restart:
470 sb->s_count++; 488 sb->s_count++;
471 spin_unlock(&sb_lock); 489 spin_unlock(&sb_lock);
472 down_read(&sb->s_umount); 490 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list); 491 async_synchronize_full_domain(&sb->s_async_list);
474 if (sb->s_root && (wait || sb->s_dirt)) 492 if (sb->s_root && (wait || sb->s_dirt))
475 sb->s_op->sync_fs(sb, wait); 493 sb->s_op->sync_fs(sb, wait);
476 up_read(&sb->s_umount); 494 up_read(&sb->s_umount);
@@ -656,7 +674,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
656 return 0; 674 return 0;
657} 675}
658 676
659static void do_emergency_remount(unsigned long foo) 677static void do_emergency_remount(struct work_struct *work)
660{ 678{
661 struct super_block *sb; 679 struct super_block *sb;
662 680
@@ -679,12 +697,19 @@ static void do_emergency_remount(unsigned long foo)
679 spin_lock(&sb_lock); 697 spin_lock(&sb_lock);
680 } 698 }
681 spin_unlock(&sb_lock); 699 spin_unlock(&sb_lock);
700 kfree(work);
682 printk("Emergency Remount complete\n"); 701 printk("Emergency Remount complete\n");
683} 702}
684 703
685void emergency_remount(void) 704void emergency_remount(void)
686{ 705{
687 pdflush_operation(do_emergency_remount, 0); 706 struct work_struct *work;
707
708 work = kmalloc(sizeof(*work), GFP_ATOMIC);
709 if (work) {
710 INIT_WORK(work, do_emergency_remount);
711 schedule_work(work);
712 }
688} 713}
689 714
690/* 715/*
diff --git a/fs/sync.c b/fs/sync.c
index a16d53e5fe9d..ec95a69d17aa 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,9 +42,21 @@ SYSCALL_DEFINE0(sync)
42 return 0; 42 return 0;
43} 43}
44 44
45static void do_sync_work(struct work_struct *work)
46{
47 do_sync(0);
48 kfree(work);
49}
50
45void emergency_sync(void) 51void emergency_sync(void)
46{ 52{
47 pdflush_operation(do_sync, 0); 53 struct work_struct *work;
54
55 work = kmalloc(sizeof(*work), GFP_ATOMIC);
56 if (work) {
57 INIT_WORK(work, do_sync_work);
58 schedule_work(work);
59 }
48} 60}
49 61
50/* 62/*
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
new file mode 100644
index 000000000000..f4b67588b9d6
--- /dev/null
+++ b/fs/sysfs/Kconfig
@@ -0,0 +1,23 @@
1config SYSFS
2 bool "sysfs file system support" if EMBEDDED
3 default y
4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to
6 export internal kernel objects, their attributes, and their
7 relationships to one another.
8
9 Users can use sysfs to ascertain useful information about the running
10 kernel, such as the devices the kernel has discovered on each bus and
11 which driver each is bound to. sysfs can also be used to tune devices
12 and other kernel subsystems.
13
14 Some system agents rely on the information in sysfs to operate.
15 /sbin/hotplug uses device and object attributes in sysfs to assist in
16 delegating policy decisions, like persistently naming devices.
17
18 sysfs is currently used by the block subsystem to mount the root
19 partition. If sysfs is disabled you must specify the boot device on
20 the kernel boot command line via its major and minor numbers. For
21 example, "root=03:01" for /dev/hda1.
22
23 Designers of embedded systems may wish to say N here to conserve space.
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 66f6e58a7e4b..07703d3ff4a1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -21,15 +21,28 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/mm.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
27#include "sysfs.h" 28#include "sysfs.h"
28 29
30/*
31 * There's one bin_buffer for each open file.
32 *
33 * filp->private_data points to bin_buffer and
34 * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s
35 * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock
36 */
37static DEFINE_MUTEX(sysfs_bin_lock);
38
29struct bin_buffer { 39struct bin_buffer {
30 struct mutex mutex; 40 struct mutex mutex;
31 void *buffer; 41 void *buffer;
32 int mmapped; 42 int mmapped;
43 struct vm_operations_struct *vm_ops;
44 struct file *file;
45 struct hlist_node list;
33}; 46};
34 47
35static int 48static int
@@ -63,6 +76,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
63 int count = min_t(size_t, bytes, PAGE_SIZE); 76 int count = min_t(size_t, bytes, PAGE_SIZE);
64 char *temp; 77 char *temp;
65 78
79 if (!bytes)
80 return 0;
81
66 if (size) { 82 if (size) {
67 if (offs > size) 83 if (offs > size)
68 return 0; 84 return 0;
@@ -131,6 +147,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
131 int count = min_t(size_t, bytes, PAGE_SIZE); 147 int count = min_t(size_t, bytes, PAGE_SIZE);
132 char *temp; 148 char *temp;
133 149
150 if (!bytes)
151 return 0;
152
134 if (size) { 153 if (size) {
135 if (offs > size) 154 if (offs > size)
136 return 0; 155 return 0;
@@ -162,6 +181,175 @@ out_free:
162 return count; 181 return count;
163} 182}
164 183
184static void bin_vma_open(struct vm_area_struct *vma)
185{
186 struct file *file = vma->vm_file;
187 struct bin_buffer *bb = file->private_data;
188 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
189
190 if (!bb->vm_ops || !bb->vm_ops->open)
191 return;
192
193 if (!sysfs_get_active_two(attr_sd))
194 return;
195
196 bb->vm_ops->open(vma);
197
198 sysfs_put_active_two(attr_sd);
199}
200
201static void bin_vma_close(struct vm_area_struct *vma)
202{
203 struct file *file = vma->vm_file;
204 struct bin_buffer *bb = file->private_data;
205 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
206
207 if (!bb->vm_ops || !bb->vm_ops->close)
208 return;
209
210 if (!sysfs_get_active_two(attr_sd))
211 return;
212
213 bb->vm_ops->close(vma);
214
215 sysfs_put_active_two(attr_sd);
216}
217
218static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
219{
220 struct file *file = vma->vm_file;
221 struct bin_buffer *bb = file->private_data;
222 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
223 int ret;
224
225 if (!bb->vm_ops || !bb->vm_ops->fault)
226 return VM_FAULT_SIGBUS;
227
228 if (!sysfs_get_active_two(attr_sd))
229 return VM_FAULT_SIGBUS;
230
231 ret = bb->vm_ops->fault(vma, vmf);
232
233 sysfs_put_active_two(attr_sd);
234 return ret;
235}
236
237static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
238{
239 struct file *file = vma->vm_file;
240 struct bin_buffer *bb = file->private_data;
241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
242 int ret;
243
244 if (!bb->vm_ops)
245 return -EINVAL;
246
247 if (!bb->vm_ops->page_mkwrite)
248 return 0;
249
250 if (!sysfs_get_active_two(attr_sd))
251 return -EINVAL;
252
253 ret = bb->vm_ops->page_mkwrite(vma, page);
254
255 sysfs_put_active_two(attr_sd);
256 return ret;
257}
258
259static int bin_access(struct vm_area_struct *vma, unsigned long addr,
260 void *buf, int len, int write)
261{
262 struct file *file = vma->vm_file;
263 struct bin_buffer *bb = file->private_data;
264 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
265 int ret;
266
267 if (!bb->vm_ops || !bb->vm_ops->access)
268 return -EINVAL;
269
270 if (!sysfs_get_active_two(attr_sd))
271 return -EINVAL;
272
273 ret = bb->vm_ops->access(vma, addr, buf, len, write);
274
275 sysfs_put_active_two(attr_sd);
276 return ret;
277}
278
279#ifdef CONFIG_NUMA
280static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
281{
282 struct file *file = vma->vm_file;
283 struct bin_buffer *bb = file->private_data;
284 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
285 int ret;
286
287 if (!bb->vm_ops || !bb->vm_ops->set_policy)
288 return 0;
289
290 if (!sysfs_get_active_two(attr_sd))
291 return -EINVAL;
292
293 ret = bb->vm_ops->set_policy(vma, new);
294
295 sysfs_put_active_two(attr_sd);
296 return ret;
297}
298
299static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
300 unsigned long addr)
301{
302 struct file *file = vma->vm_file;
303 struct bin_buffer *bb = file->private_data;
304 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
305 struct mempolicy *pol;
306
307 if (!bb->vm_ops || !bb->vm_ops->get_policy)
308 return vma->vm_policy;
309
310 if (!sysfs_get_active_two(attr_sd))
311 return vma->vm_policy;
312
313 pol = bb->vm_ops->get_policy(vma, addr);
314
315 sysfs_put_active_two(attr_sd);
316 return pol;
317}
318
319static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
320 const nodemask_t *to, unsigned long flags)
321{
322 struct file *file = vma->vm_file;
323 struct bin_buffer *bb = file->private_data;
324 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
325 int ret;
326
327 if (!bb->vm_ops || !bb->vm_ops->migrate)
328 return 0;
329
330 if (!sysfs_get_active_two(attr_sd))
331 return 0;
332
333 ret = bb->vm_ops->migrate(vma, from, to, flags);
334
335 sysfs_put_active_two(attr_sd);
336 return ret;
337}
338#endif
339
340static struct vm_operations_struct bin_vm_ops = {
341 .open = bin_vma_open,
342 .close = bin_vma_close,
343 .fault = bin_fault,
344 .page_mkwrite = bin_page_mkwrite,
345 .access = bin_access,
346#ifdef CONFIG_NUMA
347 .set_policy = bin_set_policy,
348 .get_policy = bin_get_policy,
349 .migrate = bin_migrate,
350#endif
351};
352
165static int mmap(struct file *file, struct vm_area_struct *vma) 353static int mmap(struct file *file, struct vm_area_struct *vma)
166{ 354{
167 struct bin_buffer *bb = file->private_data; 355 struct bin_buffer *bb = file->private_data;
@@ -173,18 +361,37 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
173 mutex_lock(&bb->mutex); 361 mutex_lock(&bb->mutex);
174 362
175 /* need attr_sd for attr, its parent for kobj */ 363 /* need attr_sd for attr, its parent for kobj */
364 rc = -ENODEV;
176 if (!sysfs_get_active_two(attr_sd)) 365 if (!sysfs_get_active_two(attr_sd))
177 return -ENODEV; 366 goto out_unlock;
178 367
179 rc = -EINVAL; 368 rc = -EINVAL;
180 if (attr->mmap) 369 if (!attr->mmap)
181 rc = attr->mmap(kobj, attr, vma); 370 goto out_put;
371
372 rc = attr->mmap(kobj, attr, vma);
373 if (rc)
374 goto out_put;
182 375
183 if (rc == 0 && !bb->mmapped) 376 /*
184 bb->mmapped = 1; 377 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
185 else 378 * to satisfy versions of X which crash if the mmap fails: that
186 sysfs_put_active_two(attr_sd); 379 * substitutes a new vm_file, and we don't then want bin_vm_ops.
380 */
381 if (vma->vm_file != file)
382 goto out_put;
187 383
384 rc = -EINVAL;
385 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
386 goto out_put;
387
388 rc = 0;
389 bb->mmapped = 1;
390 bb->vm_ops = vma->vm_ops;
391 vma->vm_ops = &bin_vm_ops;
392out_put:
393 sysfs_put_active_two(attr_sd);
394out_unlock:
188 mutex_unlock(&bb->mutex); 395 mutex_unlock(&bb->mutex);
189 396
190 return rc; 397 return rc;
@@ -217,8 +424,13 @@ static int open(struct inode * inode, struct file * file)
217 goto err_out; 424 goto err_out;
218 425
219 mutex_init(&bb->mutex); 426 mutex_init(&bb->mutex);
427 bb->file = file;
220 file->private_data = bb; 428 file->private_data = bb;
221 429
430 mutex_lock(&sysfs_bin_lock);
431 hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
432 mutex_unlock(&sysfs_bin_lock);
433
222 /* open succeeded, put active references */ 434 /* open succeeded, put active references */
223 sysfs_put_active_two(attr_sd); 435 sysfs_put_active_two(attr_sd);
224 return 0; 436 return 0;
@@ -231,11 +443,12 @@ static int open(struct inode * inode, struct file * file)
231 443
232static int release(struct inode * inode, struct file * file) 444static int release(struct inode * inode, struct file * file)
233{ 445{
234 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
235 struct bin_buffer *bb = file->private_data; 446 struct bin_buffer *bb = file->private_data;
236 447
237 if (bb->mmapped) 448 mutex_lock(&sysfs_bin_lock);
238 sysfs_put_active_two(attr_sd); 449 hlist_del(&bb->list);
450 mutex_unlock(&sysfs_bin_lock);
451
239 kfree(bb->buffer); 452 kfree(bb->buffer);
240 kfree(bb); 453 kfree(bb);
241 return 0; 454 return 0;
@@ -250,6 +463,26 @@ const struct file_operations bin_fops = {
250 .release = release, 463 .release = release,
251}; 464};
252 465
466
467void unmap_bin_file(struct sysfs_dirent *attr_sd)
468{
469 struct bin_buffer *bb;
470 struct hlist_node *tmp;
471
472 if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
473 return;
474
475 mutex_lock(&sysfs_bin_lock);
476
477 hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
478 struct inode *inode = bb->file->f_path.dentry->d_inode;
479
480 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
481 }
482
483 mutex_unlock(&sysfs_bin_lock);
484}
485
253/** 486/**
254 * sysfs_create_bin_file - create binary file for object. 487 * sysfs_create_bin_file - create binary file for object.
255 * @kobj: object. 488 * @kobj: object.
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 82d3b79d0e08..66aeb4fff0c3 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -434,6 +434,26 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
434} 434}
435 435
436/** 436/**
437 * sysfs_pathname - return full path to sysfs dirent
438 * @sd: sysfs_dirent whose path we want
439 * @path: caller allocated buffer
440 *
441 * Gives the name "/" to the sysfs_root entry; any path returned
442 * is relative to wherever sysfs is mounted.
443 *
444 * XXX: does no error checking on @path size
445 */
446static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
447{
448 if (sd->s_parent) {
449 sysfs_pathname(sd->s_parent, path);
450 strcat(path, "/");
451 }
452 strcat(path, sd->s_name);
453 return path;
454}
455
456/**
437 * sysfs_add_one - add sysfs_dirent to parent 457 * sysfs_add_one - add sysfs_dirent to parent
438 * @acxt: addrm context to use 458 * @acxt: addrm context to use
439 * @sd: sysfs_dirent to be added 459 * @sd: sysfs_dirent to be added
@@ -458,8 +478,16 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
458 int ret; 478 int ret;
459 479
460 ret = __sysfs_add_one(acxt, sd); 480 ret = __sysfs_add_one(acxt, sd);
461 WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' " 481 if (ret == -EEXIST) {
462 "can not be created\n", sd->s_name); 482 char *path = kzalloc(PATH_MAX, GFP_KERNEL);
483 WARN(1, KERN_WARNING
484 "sysfs: cannot create duplicate filename '%s'\n",
485 (path == NULL) ? sd->s_name :
486 strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"),
487 sd->s_name));
488 kfree(path);
489 }
490
463 return ret; 491 return ret;
464} 492}
465 493
@@ -581,6 +609,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
581 609
582 sysfs_drop_dentry(sd); 610 sysfs_drop_dentry(sd);
583 sysfs_deactivate(sd); 611 sysfs_deactivate(sd);
612 unmap_bin_file(sd);
584 sysfs_put(sd); 613 sysfs_put(sd);
585 } 614 }
586} 615}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1f4a3f877262..289c43a47263 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -659,13 +659,16 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
659EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 659EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
660 660
661struct sysfs_schedule_callback_struct { 661struct sysfs_schedule_callback_struct {
662 struct kobject *kobj; 662 struct list_head workq_list;
663 struct kobject *kobj;
663 void (*func)(void *); 664 void (*func)(void *);
664 void *data; 665 void *data;
665 struct module *owner; 666 struct module *owner;
666 struct work_struct work; 667 struct work_struct work;
667}; 668};
668 669
670static DEFINE_MUTEX(sysfs_workq_mutex);
671static LIST_HEAD(sysfs_workq);
669static void sysfs_schedule_callback_work(struct work_struct *work) 672static void sysfs_schedule_callback_work(struct work_struct *work)
670{ 673{
671 struct sysfs_schedule_callback_struct *ss = container_of(work, 674 struct sysfs_schedule_callback_struct *ss = container_of(work,
@@ -674,6 +677,9 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
674 (ss->func)(ss->data); 677 (ss->func)(ss->data);
675 kobject_put(ss->kobj); 678 kobject_put(ss->kobj);
676 module_put(ss->owner); 679 module_put(ss->owner);
680 mutex_lock(&sysfs_workq_mutex);
681 list_del(&ss->workq_list);
682 mutex_unlock(&sysfs_workq_mutex);
677 kfree(ss); 683 kfree(ss);
678} 684}
679 685
@@ -695,15 +701,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
695 * until @func returns. 701 * until @func returns.
696 * 702 *
697 * Returns 0 if the request was submitted, -ENOMEM if storage could not 703 * Returns 0 if the request was submitted, -ENOMEM if storage could not
698 * be allocated, -ENODEV if a reference to @owner isn't available. 704 * be allocated, -ENODEV if a reference to @owner isn't available,
705 * -EAGAIN if a callback has already been scheduled for @kobj.
699 */ 706 */
700int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), 707int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
701 void *data, struct module *owner) 708 void *data, struct module *owner)
702{ 709{
703 struct sysfs_schedule_callback_struct *ss; 710 struct sysfs_schedule_callback_struct *ss, *tmp;
704 711
705 if (!try_module_get(owner)) 712 if (!try_module_get(owner))
706 return -ENODEV; 713 return -ENODEV;
714
715 mutex_lock(&sysfs_workq_mutex);
716 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
717 if (ss->kobj == kobj) {
718 mutex_unlock(&sysfs_workq_mutex);
719 return -EAGAIN;
720 }
721 mutex_unlock(&sysfs_workq_mutex);
722
707 ss = kmalloc(sizeof(*ss), GFP_KERNEL); 723 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
708 if (!ss) { 724 if (!ss) {
709 module_put(owner); 725 module_put(owner);
@@ -715,6 +731,10 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
715 ss->data = data; 731 ss->data = data;
716 ss->owner = owner; 732 ss->owner = owner;
717 INIT_WORK(&ss->work, sysfs_schedule_callback_work); 733 INIT_WORK(&ss->work, sysfs_schedule_callback_work);
734 INIT_LIST_HEAD(&ss->workq_list);
735 mutex_lock(&sysfs_workq_mutex);
736 list_add_tail(&ss->workq_list, &sysfs_workq);
737 mutex_unlock(&sysfs_workq_mutex);
718 schedule_work(&ss->work); 738 schedule_work(&ss->work);
719 return 0; 739 return 0;
720} 740}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index dfa3d94cfc74..555f0ff988df 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -147,6 +147,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
147{ 147{
148 struct bin_attribute *bin_attr; 148 struct bin_attribute *bin_attr;
149 149
150 inode->i_private = sysfs_get(sd);
150 inode->i_mapping->a_ops = &sysfs_aops; 151 inode->i_mapping->a_ops = &sysfs_aops;
151 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 152 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
152 inode->i_op = &sysfs_inode_operations; 153 inode->i_op = &sysfs_inode_operations;
@@ -214,6 +215,22 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
214 return inode; 215 return inode;
215} 216}
216 217
218/*
219 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
220 * To prevent the sysfs inode numbers from being freed prematurely we take a
221 * reference to sysfs_dirent from the sysfs inode. A
222 * super_operations.delete_inode() implementation is needed to drop that
223 * reference upon inode destruction.
224 */
225void sysfs_delete_inode(struct inode *inode)
226{
227 struct sysfs_dirent *sd = inode->i_private;
228
229 truncate_inode_pages(&inode->i_data, 0);
230 clear_inode(inode);
231 sysfs_put(sd);
232}
233
217int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 234int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
218{ 235{
219 struct sysfs_addrm_cxt acxt; 236 struct sysfs_addrm_cxt acxt;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index ab343e371d64..49749955ccaf 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -17,11 +17,10 @@
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h>
20 21
21#include "sysfs.h" 22#include "sysfs.h"
22 23
23/* Random magic number */
24#define SYSFS_MAGIC 0x62656572
25 24
26static struct vfsmount *sysfs_mount; 25static struct vfsmount *sysfs_mount;
27struct super_block * sysfs_sb = NULL; 26struct super_block * sysfs_sb = NULL;
@@ -30,6 +29,7 @@ struct kmem_cache *sysfs_dir_cachep;
30static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
31 .statfs = simple_statfs, 30 .statfs = simple_statfs,
32 .drop_inode = generic_delete_inode, 31 .drop_inode = generic_delete_inode,
32 .delete_inode = sysfs_delete_inode,
33}; 33};
34 34
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
@@ -53,7 +53,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
53 sysfs_sb = sb; 53 sysfs_sb = sb;
54 54
55 /* get root inode, initialize and unlock it */ 55 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex);
56 inode = sysfs_get_inode(&sysfs_root); 57 inode = sysfs_get_inode(&sysfs_root);
58 mutex_unlock(&sysfs_mutex);
57 if (!inode) { 59 if (!inode) {
58 pr_debug("sysfs: could not get root inode\n"); 60 pr_debug("sysfs: could not get root inode\n");
59 return -ENOMEM; 61 return -ENOMEM;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 93c6d6b27c4d..3fa0d98481e2 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,6 +28,7 @@ struct sysfs_elem_attr {
28 28
29struct sysfs_elem_bin_attr { 29struct sysfs_elem_bin_attr {
30 struct bin_attribute *bin_attr; 30 struct bin_attribute *bin_attr;
31 struct hlist_head buffers;
31}; 32};
32 33
33/* 34/*
@@ -145,6 +146,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
145 * inode.c 146 * inode.c
146 */ 147 */
147struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 148struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
149void sysfs_delete_inode(struct inode *inode);
148int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 150int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
149int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 151int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
150int sysfs_inode_init(void); 152int sysfs_inode_init(void);
@@ -163,6 +165,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
163 * bin.c 165 * bin.c
164 */ 166 */
165extern const struct file_operations bin_fops; 167extern const struct file_operations bin_fops;
168void unmap_bin_file(struct sysfs_dirent *attr_sd);
166 169
167/* 170/*
168 * symlink.c 171 * symlink.c
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
new file mode 100644
index 000000000000..33aeb4b75db1
--- /dev/null
+++ b/fs/sysv/Kconfig
@@ -0,0 +1,36 @@
1config SYSV_FS
2 tristate "System V/Xenix/V7/Coherent file system support"
3 depends on BLOCK
4 help
5 SCO, Xenix and Coherent are commercial Unix systems for Intel
6 machines, and Version 7 was used on the DEC PDP-11. Saying Y
7 here would allow you to read from their floppies and hard disk
8 partitions.
9
10 If you have floppies or hard disk partitions like that, it is likely
11 that they contain binaries from those other Unix systems; in order
12 to run these binaries, you will want to install linux-abi which is
13 a set of kernel modules that lets you run SCO, Xenix, Wyse,
14 UnixWare, Dell Unix and System V programs under Linux. It is
15 available via FTP (user: ftp) from
16 <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
17 NOTE: that will work only for binaries from Intel-based systems;
18 PDP ones will have to wait until somebody ports Linux to -11 ;-)
19
20 If you only intend to mount files from some other Unix over the
21 network using NFS, you don't need the System V file system support
22 (but you need NFS file system support obviously).
23
24 Note that this option is generally not needed for floppies, since a
25 good portable way to transport files and directories between unixes
26 (and even other operating systems) is given by the tar program ("man
27 tar" or preferably "info tar"). Note also that this option has
28 nothing whatsoever to do with the option "System V IPC". Read about
29 the System V file system in
30 <file:Documentation/filesystems/sysv-fs.txt>.
31 Saying Y here will enlarge your kernel by about 27 KB.
32
33 To compile this as a module, choose M here: the module will be called
34 sysv.
35
36 If you haven't heard about all of this before, it's safe to say N.
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f5..b042bd7034b1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
186 BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); 186 BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
187 BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); 187 BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
188 188
189 if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) 189 if ((flags & ~TFD_CREATE_FLAGS) ||
190 return -EINVAL; 190 (clockid != CLOCK_MONOTONIC &&
191 if (clockid != CLOCK_MONOTONIC && 191 clockid != CLOCK_REALTIME))
192 clockid != CLOCK_REALTIME)
193 return -EINVAL; 192 return -EINVAL;
194 193
195 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 194 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
201 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 200 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
202 201
203 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 202 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
204 flags & (O_CLOEXEC | O_NONBLOCK)); 203 flags & TFD_SHARED_FCNTL_FLAGS);
205 if (ufd < 0) 204 if (ufd < 0)
206 kfree(ctx); 205 kfree(ctx);
207 206
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
219 if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) 218 if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
220 return -EFAULT; 219 return -EFAULT;
221 220
222 if (!timespec_valid(&ktmr.it_value) || 221 if ((flags & ~TFD_SETTIME_FLAGS) ||
222 !timespec_valid(&ktmr.it_value) ||
223 !timespec_valid(&ktmr.it_interval)) 223 !timespec_valid(&ktmr.it_interval))
224 return -EINVAL; 224 return -EINVAL;
225 225
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 175f9c590b77..f393620890ee 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
689} 689}
690 690
691/** 691/**
692 * ubifs_get_free_space - return amount of free space. 692 * ubifs_get_free_space_nolock - return amount of free space.
693 * @c: UBIFS file-system description object 693 * @c: UBIFS file-system description object
694 * 694 *
695 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
@@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
704 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 705 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 706 */
707long long ubifs_get_free_space(struct ubifs_info *c) 707long long ubifs_get_free_space_nolock(struct ubifs_info *c)
708{ 708{
709 int min_idx_lebs, rsvd_idx_lebs, lebs; 709 int rsvd_idx_lebs, lebs;
710 long long available, outstanding, free; 710 long long available, outstanding, free;
711 711
712 spin_lock(&c->space_lock); 712 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
713 min_idx_lebs = c->min_idx_lebs;
714 ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
715 outstanding = c->budg_data_growth + c->budg_dd_growth; 713 outstanding = c->budg_data_growth + c->budg_dd_growth;
716 available = ubifs_calc_available(c, min_idx_lebs); 714 available = ubifs_calc_available(c, c->min_idx_lebs);
717 715
718 /* 716 /*
719 * When reporting free space to user-space, UBIFS guarantees that it is 717 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
726 * Note, the calculations below are similar to what we have in 724 * Note, the calculations below are similar to what we have in
727 * 'do_budget_space()', so refer there for comments. 725 * 'do_budget_space()', so refer there for comments.
728 */ 726 */
729 if (min_idx_lebs > c->lst.idx_lebs) 727 if (c->min_idx_lebs > c->lst.idx_lebs)
730 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 728 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
731 else 729 else
732 rsvd_idx_lebs = 0; 730 rsvd_idx_lebs = 0;
733 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 731 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
734 c->lst.taken_empty_lebs; 732 c->lst.taken_empty_lebs;
735 lebs -= rsvd_idx_lebs; 733 lebs -= rsvd_idx_lebs;
736 available += lebs * (c->dark_wm - c->leb_overhead); 734 available += lebs * (c->dark_wm - c->leb_overhead);
737 spin_unlock(&c->space_lock);
738 735
739 if (available > outstanding) 736 if (available > outstanding)
740 free = ubifs_reported_space(c, available - outstanding); 737 free = ubifs_reported_space(c, available - outstanding);
@@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
742 free = 0; 739 free = 0;
743 return free; 740 return free;
744} 741}
742
743/**
744 * ubifs_get_free_space - return amount of free space.
745 * @c: UBIFS file-system description object
746 *
747 * This function calculates and retuns amount of free space to report to
748 * user-space.
749 */
750long long ubifs_get_free_space(struct ubifs_info *c)
751{
752 long long free;
753
754 spin_lock(&c->space_lock);
755 free = ubifs_get_free_space_nolock(c);
756 spin_unlock(&c->space_lock);
757
758 return free;
759}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 792c5a16c182..e975bd82f38b 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
620 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 620 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
622 c->gc_lnum, c->ihead_lnum); 622 c->gc_lnum, c->ihead_lnum);
623 for (i = 0; i < c->jhead_cnt; i++) 623 /* If we are in R/O mode, journal heads do not exist */
624 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", 624 if (c->jheads)
625 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); 625 for (i = 0; i < c->jhead_cnt; i++)
626 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
627 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
626 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 628 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
627 bud = rb_entry(rb, struct ubifs_bud, rb); 629 bud = rb_entry(rb, struct ubifs_bud, rb);
628 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 630 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c)
637 /* Print budgeting predictions */ 639 /* Print budgeting predictions */
638 available = ubifs_calc_available(c, c->min_idx_lebs); 640 available = ubifs_calc_available(c, c->min_idx_lebs);
639 outstanding = c->budg_data_growth + c->budg_dd_growth; 641 outstanding = c->budg_data_growth + c->budg_dd_growth;
640 if (available > outstanding) 642 free = ubifs_get_free_space_nolock(c);
641 free = ubifs_reported_space(c, available - outstanding);
642 else
643 free = 0;
644 printk(KERN_DEBUG "Budgeting predictions:\n"); 643 printk(KERN_DEBUG "Budgeting predictions:\n");
645 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 644 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
646 available, outstanding, free); 645 available, outstanding, free);
@@ -861,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c)
861} 860}
862 861
863/** 862/**
863 * dbg_save_space_info - save information about flash space.
864 * @c: UBIFS file-system description object
865 *
866 * This function saves information about UBIFS free space, dirty space, etc, in
867 * order to check it later.
868 */
869void dbg_save_space_info(struct ubifs_info *c)
870{
871 struct ubifs_debug_info *d = c->dbg;
872
873 ubifs_get_lp_stats(c, &d->saved_lst);
874
875 spin_lock(&c->space_lock);
876 d->saved_free = ubifs_get_free_space_nolock(c);
877 spin_unlock(&c->space_lock);
878}
879
880/**
881 * dbg_check_space_info - check flash space information.
882 * @c: UBIFS file-system description object
883 *
884 * This function compares current flash space information with the information
885 * which was saved when the 'dbg_save_space_info()' function was called.
886 * Returns zero if the information has not changed, and %-EINVAL it it has
887 * changed.
888 */
889int dbg_check_space_info(struct ubifs_info *c)
890{
891 struct ubifs_debug_info *d = c->dbg;
892 struct ubifs_lp_stats lst;
893 long long avail, free;
894
895 spin_lock(&c->space_lock);
896 avail = ubifs_calc_available(c, c->min_idx_lebs);
897 spin_unlock(&c->space_lock);
898 free = ubifs_get_free_space(c);
899
900 if (free != d->saved_free) {
901 ubifs_err("free space changed from %lld to %lld",
902 d->saved_free, free);
903 goto out;
904 }
905
906 return 0;
907
908out:
909 ubifs_msg("saved lprops statistics dump");
910 dbg_dump_lstats(&d->saved_lst);
911 ubifs_get_lp_stats(c, &lst);
912 ubifs_msg("current lprops statistics dump");
913 dbg_dump_lstats(&d->saved_lst);
914 spin_lock(&c->space_lock);
915 dbg_dump_budg(c);
916 spin_unlock(&c->space_lock);
917 dump_stack();
918 return -EINVAL;
919}
920
921/**
864 * dbg_check_synced_i_size - check synchronized inode size. 922 * dbg_check_synced_i_size - check synchronized inode size.
865 * @inode: inode to check 923 * @inode: inode to check
866 * 924 *
@@ -1349,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
1349 * @c: UBIFS file-system description object 1407 * @c: UBIFS file-system description object
1350 * @leaf_cb: called for each leaf node 1408 * @leaf_cb: called for each leaf node
1351 * @znode_cb: called for each indexing node 1409 * @znode_cb: called for each indexing node
1352 * @priv: private date which is passed to callbacks 1410 * @priv: private data which is passed to callbacks
1353 * 1411 *
1354 * This function walks the UBIFS index and calls the @leaf_cb for each leaf 1412 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
1355 * node and @znode_cb for each indexing node. Returns zero in case of success 1413 * node and @znode_cb for each indexing node. Returns zero in case of success
@@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c)
2409 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which 2467 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
2410 * contain the stuff specific to particular file-system mounts. 2468 * contain the stuff specific to particular file-system mounts.
2411 */ 2469 */
2412static struct dentry *debugfs_rootdir; 2470static struct dentry *dfs_rootdir;
2413 2471
2414/** 2472/**
2415 * dbg_debugfs_init - initialize debugfs file-system. 2473 * dbg_debugfs_init - initialize debugfs file-system.
@@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir;
2421 */ 2479 */
2422int dbg_debugfs_init(void) 2480int dbg_debugfs_init(void)
2423{ 2481{
2424 debugfs_rootdir = debugfs_create_dir("ubifs", NULL); 2482 dfs_rootdir = debugfs_create_dir("ubifs", NULL);
2425 if (IS_ERR(debugfs_rootdir)) { 2483 if (IS_ERR(dfs_rootdir)) {
2426 int err = PTR_ERR(debugfs_rootdir); 2484 int err = PTR_ERR(dfs_rootdir);
2427 ubifs_err("cannot create \"ubifs\" debugfs directory, " 2485 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2428 "error %d\n", err); 2486 "error %d\n", err);
2429 return err; 2487 return err;
@@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void)
2437 */ 2495 */
2438void dbg_debugfs_exit(void) 2496void dbg_debugfs_exit(void)
2439{ 2497{
2440 debugfs_remove(debugfs_rootdir); 2498 debugfs_remove(dfs_rootdir);
2441} 2499}
2442 2500
2443static int open_debugfs_file(struct inode *inode, struct file *file) 2501static int open_debugfs_file(struct inode *inode, struct file *file)
@@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2452 struct ubifs_info *c = file->private_data; 2510 struct ubifs_info *c = file->private_data;
2453 struct ubifs_debug_info *d = c->dbg; 2511 struct ubifs_debug_info *d = c->dbg;
2454 2512
2455 if (file->f_path.dentry == d->dump_lprops) 2513 if (file->f_path.dentry == d->dfs_dump_lprops)
2456 dbg_dump_lprops(c); 2514 dbg_dump_lprops(c);
2457 else if (file->f_path.dentry == d->dump_budg) { 2515 else if (file->f_path.dentry == d->dfs_dump_budg) {
2458 spin_lock(&c->space_lock); 2516 spin_lock(&c->space_lock);
2459 dbg_dump_budg(c); 2517 dbg_dump_budg(c);
2460 spin_unlock(&c->space_lock); 2518 spin_unlock(&c->space_lock);
2461 } else if (file->f_path.dentry == d->dump_tnc) { 2519 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2462 mutex_lock(&c->tnc_mutex); 2520 mutex_lock(&c->tnc_mutex);
2463 dbg_dump_tnc(c); 2521 dbg_dump_tnc(c);
2464 mutex_unlock(&c->tnc_mutex); 2522 mutex_unlock(&c->tnc_mutex);
@@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2469 return count; 2527 return count;
2470} 2528}
2471 2529
2472static const struct file_operations debugfs_fops = { 2530static const struct file_operations dfs_fops = {
2473 .open = open_debugfs_file, 2531 .open = open_debugfs_file,
2474 .write = write_debugfs_file, 2532 .write = write_debugfs_file,
2475 .owner = THIS_MODULE, 2533 .owner = THIS_MODULE,
@@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2494 struct dentry *dent; 2552 struct dentry *dent;
2495 struct ubifs_debug_info *d = c->dbg; 2553 struct ubifs_debug_info *d = c->dbg;
2496 2554
2497 sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); 2555 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2498 d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name, 2556 d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
2499 debugfs_rootdir); 2557 if (IS_ERR(d->dfs_dir)) {
2500 if (IS_ERR(d->debugfs_dir)) { 2558 err = PTR_ERR(d->dfs_dir);
2501 err = PTR_ERR(d->debugfs_dir);
2502 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", 2559 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2503 d->debugfs_dir_name, err); 2560 d->dfs_dir_name, err);
2504 goto out; 2561 goto out;
2505 } 2562 }
2506 2563
2507 fname = "dump_lprops"; 2564 fname = "dump_lprops";
2508 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2565 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2509 &debugfs_fops);
2510 if (IS_ERR(dent)) 2566 if (IS_ERR(dent))
2511 goto out_remove; 2567 goto out_remove;
2512 d->dump_lprops = dent; 2568 d->dfs_dump_lprops = dent;
2513 2569
2514 fname = "dump_budg"; 2570 fname = "dump_budg";
2515 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2571 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2516 &debugfs_fops);
2517 if (IS_ERR(dent)) 2572 if (IS_ERR(dent))
2518 goto out_remove; 2573 goto out_remove;
2519 d->dump_budg = dent; 2574 d->dfs_dump_budg = dent;
2520 2575
2521 fname = "dump_tnc"; 2576 fname = "dump_tnc";
2522 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2577 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2523 &debugfs_fops);
2524 if (IS_ERR(dent)) 2578 if (IS_ERR(dent))
2525 goto out_remove; 2579 goto out_remove;
2526 d->dump_tnc = dent; 2580 d->dfs_dump_tnc = dent;
2527 2581
2528 return 0; 2582 return 0;
2529 2583
@@ -2531,7 +2585,7 @@ out_remove:
2531 err = PTR_ERR(dent); 2585 err = PTR_ERR(dent);
2532 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", 2586 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2533 fname, err); 2587 fname, err);
2534 debugfs_remove_recursive(d->debugfs_dir); 2588 debugfs_remove_recursive(d->dfs_dir);
2535out: 2589out:
2536 return err; 2590 return err;
2537} 2591}
@@ -2542,7 +2596,7 @@ out:
2542 */ 2596 */
2543void dbg_debugfs_exit_fs(struct ubifs_info *c) 2597void dbg_debugfs_exit_fs(struct ubifs_info *c)
2544{ 2598{
2545 debugfs_remove_recursive(c->dbg->debugfs_dir); 2599 debugfs_remove_recursive(c->dbg->dfs_dir);
2546} 2600}
2547 2601
2548#endif /* CONFIG_UBIFS_FS_DEBUG */ 2602#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9820d6999f7e..c1cd73b2e06e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -41,15 +41,17 @@
41 * @chk_lpt_wastage: used by LPT tree size checker 41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker 42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker 43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check ihead_lnum 44 * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
45 * @new_ihead_offs: used by debugging to check ihead_offs 45 * @new_ihead_offs: used by debugging to check @c->ihead_offs
46 * 46 *
47 * debugfs_dir_name: name of debugfs directory containing this file-system's 47 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
48 * files 48 * @saved_free: saved free space (used by 'dbg_save_space_info()')
49 * debugfs_dir: direntry object of the file-system debugfs directory 49 *
50 * dump_lprops: "dump lprops" debugfs knob 50 * dfs_dir_name: name of debugfs directory containing this file-system's files
51 * dump_budg: "dump budgeting information" debugfs knob 51 * dfs_dir: direntry object of the file-system debugfs directory
52 * dump_tnc: "dump TNC" debugfs knob 52 * dfs_dump_lprops: "dump lprops" debugfs knob
53 * dfs_dump_budg: "dump budgeting information" debugfs knob
54 * dfs_dump_tnc: "dump TNC" debugfs knob
53 */ 55 */
54struct ubifs_debug_info { 56struct ubifs_debug_info {
55 void *buf; 57 void *buf;
@@ -69,11 +71,14 @@ struct ubifs_debug_info {
69 int new_ihead_lnum; 71 int new_ihead_lnum;
70 int new_ihead_offs; 72 int new_ihead_offs;
71 73
72 char debugfs_dir_name[100]; 74 struct ubifs_lp_stats saved_lst;
73 struct dentry *debugfs_dir; 75 long long saved_free;
74 struct dentry *dump_lprops; 76
75 struct dentry *dump_budg; 77 char dfs_dir_name[100];
76 struct dentry *dump_tnc; 78 struct dentry *dfs_dir;
79 struct dentry *dfs_dump_lprops;
80 struct dentry *dfs_dump_budg;
81 struct dentry *dfs_dump_tnc;
77}; 82};
78 83
79#define ubifs_assert(expr) do { \ 84#define ubifs_assert(expr) do { \
@@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
297 dbg_znode_callback znode_cb, void *priv); 302 dbg_znode_callback znode_cb, void *priv);
298 303
299/* Checking functions */ 304/* Checking functions */
300 305void dbg_save_space_info(struct ubifs_info *c);
306int dbg_check_space_info(struct ubifs_info *c);
301int dbg_check_lprops(struct ubifs_info *c); 307int dbg_check_lprops(struct ubifs_info *c);
302int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); 308int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
303int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); 309int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
439 445
440#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 446#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
441#define dbg_old_index_check_init(c, zroot) 0 447#define dbg_old_index_check_init(c, zroot) 0
448#define dbg_save_space_info(c) ({})
449#define dbg_check_space_info(c) 0
442#define dbg_check_old_index(c, zroot) 0 450#define dbg_check_old_index(c, zroot) 0
443#define dbg_check_cats(c) 0 451#define dbg_check_cats(c) 0
444#define dbg_check_ltab(c) 0 452#define dbg_check_ltab(c) 0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c38..f55d523c52bb 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
482} 482}
483 483
484/** 484/**
485 * lock_2_inodes - lock two UBIFS inodes. 485 * lock_2_inodes - a wrapper for locking two UBIFS inodes.
486 * @inode1: first inode 486 * @inode1: first inode
487 * @inode2: second inode 487 * @inode2: second inode
488 *
489 * We do not implement any tricks to guarantee strict lock ordering, because
490 * VFS has already done it for us on the @i_mutex. So this is just a simple
491 * wrapper function.
488 */ 492 */
489static void lock_2_inodes(struct inode *inode1, struct inode *inode2) 493static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
490{ 494{
491 if (inode1->i_ino < inode2->i_ino) { 495 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
492 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); 496 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
493 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
494 } else {
495 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
496 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
497 }
498} 497}
499 498
500/** 499/**
501 * unlock_2_inodes - unlock two UBIFS inodes inodes. 500 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
502 * @inode1: first inode 501 * @inode1: first inode
503 * @inode2: second inode 502 * @inode2: second inode
504 */ 503 */
505static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) 504static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
506{ 505{
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
508 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 506 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
509} 508}
510 509
511static int ubifs_link(struct dentry *old_dentry, struct inode *dir, 510static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
527 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", 526 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
528 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 527 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
529 inode->i_nlink, dir->i_ino); 528 inode->i_nlink, dir->i_ino);
529 ubifs_assert(mutex_is_locked(&dir->i_mutex));
530 ubifs_assert(mutex_is_locked(&inode->i_mutex));
530 err = dbg_check_synced_i_size(inode); 531 err = dbg_check_synced_i_size(inode);
531 if (err) 532 if (err)
532 return err; 533 return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
580 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", 581 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
581 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 582 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
582 inode->i_nlink, dir->i_ino); 583 inode->i_nlink, dir->i_ino);
584 ubifs_assert(mutex_is_locked(&dir->i_mutex));
585 ubifs_assert(mutex_is_locked(&inode->i_mutex));
583 err = dbg_check_synced_i_size(inode); 586 err = dbg_check_synced_i_size(inode);
584 if (err) 587 if (err)
585 return err; 588 return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
667 670
668 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, 671 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
669 dentry->d_name.name, inode->i_ino, dir->i_ino); 672 dentry->d_name.name, inode->i_ino, dir->i_ino);
670 673 ubifs_assert(mutex_is_locked(&dir->i_mutex));
674 ubifs_assert(mutex_is_locked(&inode->i_mutex));
671 err = check_dir_empty(c, dentry->d_inode); 675 err = check_dir_empty(c, dentry->d_inode);
672 if (err) 676 if (err)
673 return err; 677 return err;
@@ -922,59 +926,30 @@ out_budg:
922} 926}
923 927
924/** 928/**
925 * lock_3_inodes - lock three UBIFS inodes for rename. 929 * lock_3_inodes - a wrapper for locking three UBIFS inodes.
926 * @inode1: first inode 930 * @inode1: first inode
927 * @inode2: second inode 931 * @inode2: second inode
928 * @inode3: third inode 932 * @inode3: third inode
929 * 933 *
930 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may 934 * This function is used for 'ubifs_rename()' and @inode1 may be the same as
931 * be null. 935 * @inode2 whereas @inode3 may be %NULL.
936 *
937 * We do not implement any tricks to guarantee strict lock ordering, because
938 * VFS has already done it for us on the @i_mutex. So this is just a simple
939 * wrapper function.
932 */ 940 */
933static void lock_3_inodes(struct inode *inode1, struct inode *inode2, 941static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
934 struct inode *inode3) 942 struct inode *inode3)
935{ 943{
936 struct inode *i1, *i2, *i3; 944 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
937 945 if (inode2 != inode1)
938 if (!inode3) { 946 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
939 if (inode1 != inode2) { 947 if (inode3)
940 lock_2_inodes(inode1, inode2); 948 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
941 return;
942 }
943 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
944 return;
945 }
946
947 if (inode1 == inode2) {
948 lock_2_inodes(inode1, inode3);
949 return;
950 }
951
952 /* 3 different inodes */
953 if (inode1 < inode2) {
954 i3 = inode2;
955 if (inode1 < inode3) {
956 i1 = inode1;
957 i2 = inode3;
958 } else {
959 i1 = inode3;
960 i2 = inode1;
961 }
962 } else {
963 i3 = inode1;
964 if (inode2 < inode3) {
965 i1 = inode2;
966 i2 = inode3;
967 } else {
968 i1 = inode3;
969 i2 = inode2;
970 }
971 }
972 mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
973 lock_2_inodes(i2, i3);
974} 949}
975 950
976/** 951/**
977 * unlock_3_inodes - unlock three UBIFS inodes for rename. 952 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
978 * @inode1: first inode 953 * @inode1: first inode
979 * @inode2: second inode 954 * @inode2: second inode
980 * @inode3: third inode 955 * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
982static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, 957static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
983 struct inode *inode3) 958 struct inode *inode3)
984{ 959{
985 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
986 if (inode1 != inode2)
987 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
988 if (inode3) 960 if (inode3)
989 mutex_unlock(&ubifs_inode(inode3)->ui_mutex); 961 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
962 if (inode1 != inode2)
963 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
964 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
990} 965}
991 966
992static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, 967static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1020 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, 995 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
1021 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 996 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
1022 new_dentry->d_name.name, new_dir->i_ino); 997 new_dentry->d_name.name, new_dir->i_ino);
998 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
999 ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
1000 if (unlink)
1001 ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
1002
1023 1003
1024 if (unlink && is_dir) { 1004 if (unlink && is_dir) {
1025 err = check_dir_empty(c, new_inode); 1005 err = check_dir_empty(c, new_inode);
@@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1199 return 0; 1179 return 0;
1200} 1180}
1201 1181
1202struct inode_operations ubifs_dir_inode_operations = { 1182const struct inode_operations ubifs_dir_inode_operations = {
1203 .lookup = ubifs_lookup, 1183 .lookup = ubifs_lookup,
1204 .create = ubifs_create, 1184 .create = ubifs_create,
1205 .link = ubifs_link, 1185 .link = ubifs_link,
@@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = {
1219#endif 1199#endif
1220}; 1200};
1221 1201
1222struct file_operations ubifs_dir_operations = { 1202const struct file_operations ubifs_dir_operations = {
1223 .llseek = ubifs_dir_llseek, 1203 .llseek = ubifs_dir_llseek,
1224 .release = ubifs_dir_release, 1204 .release = ubifs_dir_release,
1225 .read = generic_read_dir, 1205 .read = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf37374567fa..93b6de51f261 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 struct page *page; 433 struct page *page;
434 434
435
436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
437 436
438 if (unlikely(c->ro_media)) 437 if (unlikely(c->ro_media))
@@ -1541,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1541 return 0; 1540 return 0;
1542} 1541}
1543 1542
1544struct address_space_operations ubifs_file_address_operations = { 1543const struct address_space_operations ubifs_file_address_operations = {
1545 .readpage = ubifs_readpage, 1544 .readpage = ubifs_readpage,
1546 .writepage = ubifs_writepage, 1545 .writepage = ubifs_writepage,
1547 .write_begin = ubifs_write_begin, 1546 .write_begin = ubifs_write_begin,
@@ -1551,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = {
1551 .releasepage = ubifs_releasepage, 1550 .releasepage = ubifs_releasepage,
1552}; 1551};
1553 1552
1554struct inode_operations ubifs_file_inode_operations = { 1553const struct inode_operations ubifs_file_inode_operations = {
1555 .setattr = ubifs_setattr, 1554 .setattr = ubifs_setattr,
1556 .getattr = ubifs_getattr, 1555 .getattr = ubifs_getattr,
1557#ifdef CONFIG_UBIFS_FS_XATTR 1556#ifdef CONFIG_UBIFS_FS_XATTR
@@ -1562,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = {
1562#endif 1561#endif
1563}; 1562};
1564 1563
1565struct inode_operations ubifs_symlink_inode_operations = { 1564const struct inode_operations ubifs_symlink_inode_operations = {
1566 .readlink = generic_readlink, 1565 .readlink = generic_readlink,
1567 .follow_link = ubifs_follow_link, 1566 .follow_link = ubifs_follow_link,
1568 .setattr = ubifs_setattr, 1567 .setattr = ubifs_setattr,
1569 .getattr = ubifs_getattr, 1568 .getattr = ubifs_getattr,
1570}; 1569};
1571 1570
1572struct file_operations ubifs_file_operations = { 1571const struct file_operations ubifs_file_operations = {
1573 .llseek = generic_file_llseek, 1572 .llseek = generic_file_llseek,
1574 .read = do_sync_read, 1573 .read = do_sync_read,
1575 .write = do_sync_write, 1574 .write = do_sync_write,
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9832f9abe28e..a711d33b3d3e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
31 * to be reused. Garbage collection will cause the number of dirty index nodes 31 * to be reused. Garbage collection will cause the number of dirty index nodes
32 * to grow, however sufficient space is reserved for the index to ensure the 32 * to grow, however sufficient space is reserved for the index to ensure the
33 * commit will never run out of space. 33 * commit will never run out of space.
34 *
35 * Notes about dead watermark. At current UBIFS implementation we assume that
36 * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
37 * and not worth garbage-collecting. The dead watermark is one min. I/O unit
38 * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
39 * Garbage Collector has to synchronize the GC head's write buffer before
40 * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
41 * actually reclaim even very small pieces of dirty space by garbage collecting
42 * enough dirty LEBs, but we do not bother doing this at this implementation.
43 *
44 * Notes about dark watermark. The results of GC work depends on how big are
45 * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
46 * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
53 * good, and GC takes extra care when moving them.
34 */ 54 */
35 55
36#include <linux/pagemap.h> 56#include <linux/pagemap.h>
@@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
381 401
382 /* 402 /*
383 * Don't release the LEB until after the next commit, because 403 * Don't release the LEB until after the next commit, because
384 * it may contain date which is needed for recovery. So 404 * it may contain data which is needed for recovery. So
385 * although we freed this LEB, it will become usable only after 405 * although we freed this LEB, it will become usable only after
386 * the commit. 406 * the commit.
387 */ 407 */
@@ -810,8 +830,9 @@ out:
810 * ubifs_destroy_idx_gc - destroy idx_gc list. 830 * ubifs_destroy_idx_gc - destroy idx_gc list.
811 * @c: UBIFS file-system description object 831 * @c: UBIFS file-system description object
812 * 832 *
813 * This function destroys the idx_gc list. It is called when unmounting or 833 * This function destroys the @c->idx_gc list. It is called when unmounting
814 * remounting read-only so locks are not needed. 834 * so locks are not needed. Returns zero in case of success and a negative
835 * error code in case of failure.
815 */ 836 */
816void ubifs_destroy_idx_gc(struct ubifs_info *c) 837void ubifs_destroy_idx_gc(struct ubifs_info *c)
817{ 838{
@@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c)
824 list_del(&idx_gc->list); 845 list_del(&idx_gc->list);
825 kfree(idx_gc); 846 kfree(idx_gc);
826 } 847 }
827
828} 848}
829 849
830/** 850/**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af69..e8e632a1dcdf 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
29 * would have been wasted for padding to the nearest minimal I/O unit boundary. 29 * would have been wasted for padding to the nearest minimal I/O unit boundary.
30 * Instead, data first goes to the write-buffer and is flushed when the 30 * Instead, data first goes to the write-buffer and is flushed when the
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similarto the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 35 * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
75 * @lnum: logical eraseblock number 75 * @lnum: logical eraseblock number
76 * @offs: offset within the logical eraseblock 76 * @offs: offset within the logical eraseblock
77 * @quiet: print no messages 77 * @quiet: print no messages
78 * @chk_crc: indicates whether to always check the CRC 78 * @must_chk_crc: indicates whether to always check the CRC
79 * 79 *
80 * This function checks node magic number and CRC checksum. This function also 80 * This function checks node magic number and CRC checksum. This function also
81 * validates node length to prevent UBIFS from becoming crazy when an attacker 81 * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
83 * node length in the common header could cause UBIFS to read memory outside of 83 * node length in the common header could cause UBIFS to read memory outside of
84 * allocated buffer when checking the CRC checksum. 84 * allocated buffer when checking the CRC checksum.
85 * 85 *
86 * This function returns zero in case of success %-EUCLEAN in case of bad CRC 86 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
87 * or magic. 87 * true, which is controlled by corresponding UBIFS mount option. However, if
88 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
89 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
90 * ignored and CRC is checked.
91 *
92 * This function returns zero in case of success and %-EUCLEAN in case of bad
93 * CRC or magic.
88 */ 94 */
89int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 95int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
90 int offs, int quiet, int chk_crc) 96 int offs, int quiet, int must_chk_crc)
91{ 97{
92 int err = -EINVAL, type, node_len; 98 int err = -EINVAL, type, node_len;
93 uint32_t crc, node_crc, magic; 99 uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
123 node_len > c->ranges[type].max_len) 129 node_len > c->ranges[type].max_len)
124 goto out_len; 130 goto out_len;
125 131
126 if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) 132 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
127 if (c->no_chk_data_crc) 133 c->no_chk_data_crc)
128 return 0; 134 return 0;
129 135
130 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 136 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
131 node_crc = le32_to_cpu(ch->crc); 137 node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 9b7c54e0cd2a..a11ca0958a23 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -208,7 +208,7 @@ again:
208 offs = 0; 208 offs = 0;
209 209
210out: 210out:
211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); 211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
212 if (err) 212 if (err)
213 goto out_unlock; 213 goto out_unlock;
214 214
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index dfd2bcece27a..4cdd284dea56 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
635 * @c: UBIFS file-system description object 635 * @c: UBIFS file-system description object
636 * @st: return statistics 636 * @st: return statistics
637 */ 637 */
638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) 638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
639{ 639{
640 spin_lock(&c->space_lock); 640 spin_lock(&c->space_lock);
641 memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); 641 memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
642 spin_unlock(&c->space_lock); 642 spin_unlock(&c->space_lock);
643} 643}
644 644
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
678 678
679out: 679out:
680 ubifs_release_lprops(c); 680 ubifs_release_lprops(c);
681 if (err)
682 ubifs_err("cannot change properties of LEB %d, error %d",
683 lnum, err);
681 return err; 684 return err;
682} 685}
683 686
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
714 717
715out: 718out:
716 ubifs_release_lprops(c); 719 ubifs_release_lprops(c);
720 if (err)
721 ubifs_err("cannot update properties of LEB %d, error %d",
722 lnum, err);
717 return err; 723 return err;
718} 724}
719 725
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
737 lpp = ubifs_lpt_lookup(c, lnum); 743 lpp = ubifs_lpt_lookup(c, lnum);
738 if (IS_ERR(lpp)) { 744 if (IS_ERR(lpp)) {
739 err = PTR_ERR(lpp); 745 err = PTR_ERR(lpp);
746 ubifs_err("cannot read properties of LEB %d, error %d",
747 lnum, err);
740 goto out; 748 goto out;
741 } 749 }
742 750
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 96ca95707175..3216a1f277f8 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -556,23 +556,23 @@ no_space:
556} 556}
557 557
558/** 558/**
559 * next_pnode - find next pnode. 559 * next_pnode_to_dirty - find next pnode to dirty.
560 * @c: UBIFS file-system description object 560 * @c: UBIFS file-system description object
561 * @pnode: pnode 561 * @pnode: pnode
562 * 562 *
563 * This function returns the next pnode or %NULL if there are no more pnodes. 563 * This function returns the next pnode to dirty or %NULL if there are no more
564 * pnodes. Note that pnodes that have never been written (lnum == 0) are
565 * skipped.
564 */ 566 */
565static struct ubifs_pnode *next_pnode(struct ubifs_info *c, 567static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
566 struct ubifs_pnode *pnode) 568 struct ubifs_pnode *pnode)
567{ 569{
568 struct ubifs_nnode *nnode; 570 struct ubifs_nnode *nnode;
569 int iip; 571 int iip;
570 572
571 /* Try to go right */ 573 /* Try to go right */
572 nnode = pnode->parent; 574 nnode = pnode->parent;
573 iip = pnode->iip + 1; 575 for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
574 if (iip < UBIFS_LPT_FANOUT) {
575 /* We assume here that LEB zero is never an LPT LEB */
576 if (nnode->nbranch[iip].lnum) 576 if (nnode->nbranch[iip].lnum)
577 return ubifs_get_pnode(c, nnode, iip); 577 return ubifs_get_pnode(c, nnode, iip);
578 } 578 }
@@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
583 nnode = nnode->parent; 583 nnode = nnode->parent;
584 if (!nnode) 584 if (!nnode)
585 return NULL; 585 return NULL;
586 /* We assume here that LEB zero is never an LPT LEB */ 586 for (; iip < UBIFS_LPT_FANOUT; iip++) {
587 } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); 587 if (nnode->nbranch[iip].lnum)
588 break;
589 }
590 } while (iip >= UBIFS_LPT_FANOUT);
588 591
589 /* Go right */ 592 /* Go right */
590 nnode = ubifs_get_nnode(c, nnode, iip); 593 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
593 596
594 /* Go down to level 1 */ 597 /* Go down to level 1 */
595 while (nnode->level > 1) { 598 while (nnode->level > 1) {
596 nnode = ubifs_get_nnode(c, nnode, 0); 599 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
600 if (nnode->nbranch[iip].lnum)
601 break;
602 }
603 if (iip >= UBIFS_LPT_FANOUT) {
604 /*
605 * Should not happen, but we need to keep going
606 * if it does.
607 */
608 iip = 0;
609 }
610 nnode = ubifs_get_nnode(c, nnode, iip);
597 if (IS_ERR(nnode)) 611 if (IS_ERR(nnode))
598 return (void *)nnode; 612 return (void *)nnode;
599 } 613 }
600 614
601 return ubifs_get_pnode(c, nnode, 0); 615 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
616 if (nnode->nbranch[iip].lnum)
617 break;
618 if (iip >= UBIFS_LPT_FANOUT)
619 /* Should not happen, but we need to keep going if it does */
620 iip = 0;
621 return ubifs_get_pnode(c, nnode, iip);
602} 622}
603 623
604/** 624/**
@@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
688 pnode = pnode_lookup(c, 0); 708 pnode = pnode_lookup(c, 0);
689 while (pnode) { 709 while (pnode) {
690 do_make_pnode_dirty(c, pnode); 710 do_make_pnode_dirty(c, pnode);
691 pnode = next_pnode(c, pnode); 711 pnode = next_pnode_to_dirty(c, pnode);
692 if (IS_ERR(pnode)) 712 if (IS_ERR(pnode))
693 return PTR_ERR(pnode); 713 return PTR_ERR(pnode);
694 } 714 }
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf565..a88f33801b98 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
354 int err, lnum, offs, len; 354 int err, lnum, offs, len;
355 355
356 if (c->ro_media) 356 if (c->ro_media)
357 return -EINVAL; 357 return -EROFS;
358 358
359 lnum = UBIFS_MST_LNUM; 359 lnum = UBIFS_MST_LNUM;
360 offs = c->mst_offs + c->mst_node_alsz; 360 offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9e6f403f170e..152a7b34a141 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to 46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
47 * zero, the inode number is added to the rb-tree. It is removed from the tree 47 * zero, the inode number is added to the rb-tree. It is removed from the tree
48 * when the inode is deleted. Any new orphans that are in the orphan tree when 48 * when the inode is deleted. Any new orphans that are in the orphan tree when
49 * the commit is run, are written to the orphan area in 1 or more orph nodes. 49 * the commit is run, are written to the orphan area in 1 or more orphan nodes.
50 * If the orphan area is full, it is consolidated to make space. There is 50 * If the orphan area is full, it is consolidated to make space. There is
51 * always enough space because validation prevents the user from creating more 51 * always enough space because validation prevents the user from creating more
52 * than the maximum number of orphans allowed. 52 * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
231} 231}
232 232
233/** 233/**
234 * do_write_orph_node - write a node 234 * do_write_orph_node - write a node to the orphan head.
235 * @c: UBIFS file-system description object 235 * @c: UBIFS file-system description object
236 * @len: length of node 236 * @len: length of node
237 * @atomic: write atomically 237 * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
264} 264}
265 265
266/** 266/**
267 * write_orph_node - write an orph node 267 * write_orph_node - write an orphan node.
268 * @c: UBIFS file-system description object 268 * @c: UBIFS file-system description object
269 * @atomic: write atomically 269 * @atomic: write atomically
270 * 270 *
271 * This function builds an orph node from the cnext list and writes it to the 271 * This function builds an orphan node from the cnext list and writes it to the
272 * orphan head. On success, %0 is returned, otherwise a negative error code 272 * orphan head. On success, %0 is returned, otherwise a negative error code
273 * is returned. 273 * is returned.
274 */ 274 */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
326} 326}
327 327
328/** 328/**
329 * write_orph_nodes - write orph nodes until there are no more to commit 329 * write_orph_nodes - write orphan nodes until there are no more to commit.
330 * @c: UBIFS file-system description object 330 * @c: UBIFS file-system description object
331 * @atomic: write atomically 331 * @atomic: write atomically
332 * 332 *
333 * This function writes orph nodes for all the orphans to commit. On success, 333 * This function writes orphan nodes for all the orphans to commit. On success,
334 * %0 is returned, otherwise a negative error code is returned. 334 * %0 is returned, otherwise a negative error code is returned.
335 */ 335 */
336static int write_orph_nodes(struct ubifs_info *c, int atomic) 336static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
478} 478}
479 479
480/** 480/**
481 * clear_orphans - erase all LEBs used for orphans. 481 * ubifs_clear_orphans - erase all LEBs used for orphans.
482 * @c: UBIFS file-system description object 482 * @c: UBIFS file-system description object
483 * 483 *
484 * If recovery is not required, then the orphans from the previous session 484 * If recovery is not required, then the orphans from the previous session
485 * are not needed. This function locates the LEBs used to record 485 * are not needed. This function locates the LEBs used to record
486 * orphans, and un-maps them. 486 * orphans, and un-maps them.
487 */ 487 */
488static int clear_orphans(struct ubifs_info *c) 488int ubifs_clear_orphans(struct ubifs_info *c)
489{ 489{
490 int lnum, err; 490 int lnum, err;
491 491
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
547 * do_kill_orphans - remove orphan inodes from the index. 547 * do_kill_orphans - remove orphan inodes from the index.
548 * @c: UBIFS file-system description object 548 * @c: UBIFS file-system description object
549 * @sleb: scanned LEB 549 * @sleb: scanned LEB
550 * @last_cmt_no: cmt_no of last orph node read is passed and returned here 550 * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
551 * @outofdate: whether the LEB is out of date is returned here 551 * @outofdate: whether the LEB is out of date is returned here
552 * @last_flagged: whether the end orph node is encountered 552 * @last_flagged: whether the end orphan node is encountered
553 * 553 *
554 * This function is a helper to the 'kill_orphans()' function. It goes through 554 * This function is a helper to the 'kill_orphans()' function. It goes through
555 * every orphan node in a LEB and for every inode number recorded, removes 555 * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
580 /* 580 /*
581 * The commit number on the master node may be less, because 581 * The commit number on the master node may be less, because
582 * of a failed commit. If there are several failed commits in a 582 * of a failed commit. If there are several failed commits in a
583 * row, the commit number written on orph nodes will continue to 583 * row, the commit number written on orphan nodes will continue
584 * increase (because the commit number is adjusted here) even 584 * to increase (because the commit number is adjusted here) even
585 * though the commit number on the master node stays the same 585 * though the commit number on the master node stays the same
586 * because the master node has not been re-written. 586 * because the master node has not been re-written.
587 */ 587 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
589 c->cmt_no = cmt_no; 589 c->cmt_no = cmt_no;
590 if (cmt_no < *last_cmt_no && *last_flagged) { 590 if (cmt_no < *last_cmt_no && *last_flagged) {
591 /* 591 /*
592 * The last orph node had a higher commit number and was 592 * The last orphan node had a higher commit number and
593 * flagged as the last written for that commit number. 593 * was flagged as the last written for that commit
594 * That makes this orph node, out of date. 594 * number. That makes this orphan node, out of date.
595 */ 595 */
596 if (!first) { 596 if (!first) {
597 ubifs_err("out of order commit number %llu in " 597 ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
658 /* 658 /*
659 * Orph nodes always start at c->orph_first and are written to each 659 * Orph nodes always start at c->orph_first and are written to each
660 * successive LEB in turn. Generally unused LEBs will have been unmapped 660 * successive LEB in turn. Generally unused LEBs will have been unmapped
661 * but may contain out of date orph nodes if the unmap didn't go 661 * but may contain out of date orphan nodes if the unmap didn't go
662 * through. In addition, the last orph node written for each commit is 662 * through. In addition, the last orphan node written for each commit is
663 * marked (top bit of orph->cmt_no is set to 1). It is possible that 663 * marked (top bit of orph->cmt_no is set to 1). It is possible that
664 * there are orph nodes from the next commit (i.e. the commit did not 664 * there are orphan nodes from the next commit (i.e. the commit did not
665 * complete successfully). In that case, no orphans will have been lost 665 * complete successfully). In that case, no orphans will have been lost
666 * due to the way that orphans are written, and any orphans added will 666 * due to the way that orphans are written, and any orphans added will
667 * be valid orphans anyway and so can be deleted. 667 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
718 if (unclean) 718 if (unclean)
719 err = kill_orphans(c); 719 err = kill_orphans(c);
720 else if (!read_only) 720 else if (!read_only)
721 err = clear_orphans(c); 721 err = ubifs_clear_orphans(c);
722 722
723 return err; 723 return err;
724} 724}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 89556ee72518..1182b66a5491 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
397 buf->f_namelen = UBIFS_MAX_NLEN; 397 buf->f_namelen = UBIFS_MAX_NLEN;
398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); 398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); 399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
400 ubifs_assert(buf->f_bfree <= c->block_cnt);
400 return 0; 401 return 0;
401} 402}
402 403
@@ -432,33 +433,24 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
432 int i, err; 433 int i, err;
433 struct ubifs_info *c = sb->s_fs_info; 434 struct ubifs_info *c = sb->s_fs_info;
434 struct writeback_control wbc = { 435 struct writeback_control wbc = {
435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 436 .sync_mode = WB_SYNC_ALL,
436 .range_start = 0, 437 .range_start = 0,
437 .range_end = LLONG_MAX, 438 .range_end = LLONG_MAX,
438 .nr_to_write = LONG_MAX, 439 .nr_to_write = LONG_MAX,
439 }; 440 };
440 441
441 /* 442 /*
442 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an 443 * Zero @wait is just an advisory thing to help the file system shove
443 * advisory thing to help the file system shove lots of data into the 444 * lots of data into the queues, and there will be the second
444 * queues. If some gets missed then it'll be picked up on the second
445 * '->sync_fs()' call, with non-zero @wait. 445 * '->sync_fs()' call, with non-zero @wait.
446 */ 446 */
447 if (!wait)
448 return 0;
447 449
448 if (sb->s_flags & MS_RDONLY) 450 if (sb->s_flags & MS_RDONLY)
449 return 0; 451 return 0;
450 452
451 /* 453 /*
452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
453 * do this if it waits for an already running commit.
454 */
455 for (i = 0; i < c->jhead_cnt; i++) {
456 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
457 if (err)
458 return err;
459 }
460
461 /*
462 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and 454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
463 * pages, so synchronize them first, then commit the journal. Strictly 455 * pages, so synchronize them first, then commit the journal. Strictly
464 * speaking, it is not necessary to commit the journal here, 456 * speaking, it is not necessary to commit the journal here,
@@ -469,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
469 */ 461 */
470 generic_sync_sb_inodes(sb, &wbc); 462 generic_sync_sb_inodes(sb, &wbc);
471 463
464 /*
465 * Synchronize write buffers, because 'ubifs_run_commit()' does not
466 * do this if it waits for an already running commit.
467 */
468 for (i = 0; i < c->jhead_cnt; i++) {
469 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
470 if (err)
471 return err;
472 }
473
472 err = ubifs_run_commit(c); 474 err = ubifs_run_commit(c);
473 if (err) 475 if (err)
474 return err; 476 return err;
@@ -572,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c)
572 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; 574 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
573 575
574 /* 576 /*
575 * Initialize dead and dark LEB space watermarks. 577 * Initialize dead and dark LEB space watermarks. See gc.c for comments
576 * 578 * about these values.
577 * Dead space is the space which cannot be used. Its watermark is
578 * equivalent to min. I/O unit or minimum node size if it is greater
579 * then min. I/O unit.
580 *
581 * Dark space is the space which might be used, or might not, depending
582 * on which node should be written to the LEB. Its watermark is
583 * equivalent to maximum UBIFS node size.
584 */ 579 */
585 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 580 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
586 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 581 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
@@ -741,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c)
741 * take_gc_lnum - reserve GC LEB. 736 * take_gc_lnum - reserve GC LEB.
742 * @c: UBIFS file-system description object 737 * @c: UBIFS file-system description object
743 * 738 *
744 * This function ensures that the LEB reserved for garbage collection is 739 * This function ensures that the LEB reserved for garbage collection is marked
745 * unmapped and is marked as "taken" in lprops. We also have to set free space 740 * as "taken" in lprops. We also have to set free space to LEB size and dirty
746 * to LEB size and dirty space to zero, because lprops may contain out-of-date 741 * space to zero, because lprops may contain out-of-date information if the
747 * information if the file-system was un-mounted before it has been committed. 742 * file-system was un-mounted before it has been committed. This function
748 * This function returns zero in case of success and a negative error code in 743 * returns zero in case of success and a negative error code in case of
749 * case of failure. 744 * failure.
750 */ 745 */
751static int take_gc_lnum(struct ubifs_info *c) 746static int take_gc_lnum(struct ubifs_info *c)
752{ 747{
@@ -757,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
757 return -EINVAL; 752 return -EINVAL;
758 } 753 }
759 754
760 err = ubifs_leb_unmap(c, c->gc_lnum);
761 if (err)
762 return err;
763
764 /* And we have to tell lprops that this LEB is taken */ 755 /* And we have to tell lprops that this LEB is taken */
765 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, 756 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
766 LPROPS_TAKEN, 0, 0); 757 LPROPS_TAKEN, 0, 0);
@@ -966,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
966 957
967 token = match_token(p, tokens, args); 958 token = match_token(p, tokens, args);
968 switch (token) { 959 switch (token) {
960 /*
961 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
962 * We accepte them in order to be backware-compatible. But this
963 * should be removed at some point.
964 */
969 case Opt_fast_unmount: 965 case Opt_fast_unmount:
970 c->mount_opts.unmount_mode = 2; 966 c->mount_opts.unmount_mode = 2;
971 c->fast_unmount = 1;
972 break; 967 break;
973 case Opt_norm_unmount: 968 case Opt_norm_unmount:
974 c->mount_opts.unmount_mode = 1; 969 c->mount_opts.unmount_mode = 1;
975 c->fast_unmount = 0;
976 break; 970 break;
977 case Opt_bulk_read: 971 case Opt_bulk_read:
978 c->mount_opts.bulk_read = 2; 972 c->mount_opts.bulk_read = 2;
@@ -1094,12 +1088,7 @@ static int check_free_space(struct ubifs_info *c)
1094 ubifs_err("insufficient free space to mount in read/write mode"); 1088 ubifs_err("insufficient free space to mount in read/write mode");
1095 dbg_dump_budg(c); 1089 dbg_dump_budg(c);
1096 dbg_dump_lprops(c); 1090 dbg_dump_lprops(c);
1097 /* 1091 return -ENOSPC;
1098 * We return %-EINVAL instead of %-ENOSPC because it seems to
1099 * be the closest error code mentioned in the mount function
1100 * documentation.
1101 */
1102 return -EINVAL;
1103 } 1092 }
1104 return 0; 1093 return 0;
1105} 1094}
@@ -1286,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c)
1286 if (err) 1275 if (err)
1287 goto out_orphans; 1276 goto out_orphans;
1288 err = ubifs_rcvry_gc_commit(c); 1277 err = ubifs_rcvry_gc_commit(c);
1289 } else 1278 } else {
1290 err = take_gc_lnum(c); 1279 err = take_gc_lnum(c);
1291 if (err) 1280 if (err)
1292 goto out_orphans; 1281 goto out_orphans;
1282
1283 /*
1284 * GC LEB may contain garbage if there was an unclean
1285 * reboot, and it should be un-mapped.
1286 */
1287 err = ubifs_leb_unmap(c, c->gc_lnum);
1288 if (err)
1289 return err;
1290 }
1293 1291
1294 err = dbg_check_lprops(c); 1292 err = dbg_check_lprops(c);
1295 if (err) 1293 if (err)
@@ -1298,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c)
1298 err = ubifs_recover_size(c); 1296 err = ubifs_recover_size(c);
1299 if (err) 1297 if (err)
1300 goto out_orphans; 1298 goto out_orphans;
1299 } else {
1300 /*
1301 * Even if we mount read-only, we have to set space in GC LEB
1302 * to proper value because this affects UBIFS free space
1303 * reporting. We do not want to have a situation when
1304 * re-mounting from R/O to R/W changes amount of free space.
1305 */
1306 err = take_gc_lnum(c);
1307 if (err)
1308 goto out_orphans;
1301 } 1309 }
1302 1310
1303 spin_lock(&ubifs_infos_lock); 1311 spin_lock(&ubifs_infos_lock);
@@ -1310,14 +1318,17 @@ static int mount_ubifs(struct ubifs_info *c)
1310 else { 1318 else {
1311 c->need_recovery = 0; 1319 c->need_recovery = 0;
1312 ubifs_msg("recovery completed"); 1320 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */
1322 ubifs_assert(c->lst.taken_empty_lebs == 1);
1313 } 1323 }
1314 } 1324 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1);
1315 1326
1316 err = dbg_debugfs_init_fs(c); 1327 err = dbg_check_filesystem(c);
1317 if (err) 1328 if (err)
1318 goto out_infos; 1329 goto out_infos;
1319 1330
1320 err = dbg_check_filesystem(c); 1331 err = dbg_debugfs_init_fs(c);
1321 if (err) 1332 if (err)
1322 goto out_infos; 1333 goto out_infos;
1323 1334
@@ -1351,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
1351 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], 1362 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1352 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], 1363 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1353 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); 1364 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1354 dbg_msg("fast unmount: %d", c->fast_unmount);
1355 dbg_msg("big_lpt %d", c->big_lpt); 1365 dbg_msg("big_lpt %d", c->big_lpt);
1356 dbg_msg("log LEBs: %d (%d - %d)", 1366 dbg_msg("log LEBs: %d (%d - %d)",
1357 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1367 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1475,10 +1485,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1475{ 1485{
1476 int err, lnum; 1486 int err, lnum;
1477 1487
1478 if (c->ro_media)
1479 return -EINVAL;
1480
1481 mutex_lock(&c->umount_mutex); 1488 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c);
1482 c->remounting_rw = 1; 1490 c->remounting_rw = 1;
1483 c->always_chk_crc = 1; 1491 c->always_chk_crc = 1;
1484 1492
@@ -1514,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1514 err = ubifs_recover_inl_heads(c, c->sbuf); 1522 err = ubifs_recover_inl_heads(c, c->sbuf);
1515 if (err) 1523 if (err)
1516 goto out; 1524 goto out;
1525 } else {
1526 /* A readonly mount is not allowed to have orphans */
1527 ubifs_assert(c->tot_orphans == 0);
1528 err = ubifs_clear_orphans(c);
1529 if (err)
1530 goto out;
1517 } 1531 }
1518 1532
1519 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { 1533 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
@@ -1569,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1569 if (c->need_recovery) 1583 if (c->need_recovery)
1570 err = ubifs_rcvry_gc_commit(c); 1584 err = ubifs_rcvry_gc_commit(c);
1571 else 1585 else
1572 err = take_gc_lnum(c); 1586 err = ubifs_leb_unmap(c, c->gc_lnum);
1573 if (err) 1587 if (err)
1574 goto out; 1588 goto out;
1575 1589
@@ -1582,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1582 c->vfs_sb->s_flags &= ~MS_RDONLY; 1596 c->vfs_sb->s_flags &= ~MS_RDONLY;
1583 c->remounting_rw = 0; 1597 c->remounting_rw = 0;
1584 c->always_chk_crc = 0; 1598 c->always_chk_crc = 0;
1599 err = dbg_check_space_info(c);
1585 mutex_unlock(&c->umount_mutex); 1600 mutex_unlock(&c->umount_mutex);
1586 return 0; 1601 return err;
1587 1602
1588out: 1603out:
1589 vfree(c->orph_buf); 1604 vfree(c->orph_buf);
@@ -1603,43 +1618,18 @@ out:
1603} 1618}
1604 1619
1605/** 1620/**
1606 * commit_on_unmount - commit the journal when un-mounting.
1607 * @c: UBIFS file-system description object
1608 *
1609 * This function is called during un-mounting and re-mounting, and it commits
1610 * the journal unless the "fast unmount" mode is enabled.
1611 */
1612static void commit_on_unmount(struct ubifs_info *c)
1613{
1614 struct super_block *sb = c->vfs_sb;
1615 long long bud_bytes;
1616
1617 /*
1618 * This function is called before the background thread is stopped, so
1619 * we may race with ongoing commit, which means we have to take
1620 * @c->bud_lock to access @c->bud_bytes.
1621 */
1622 spin_lock(&c->buds_lock);
1623 bud_bytes = c->bud_bytes;
1624 spin_unlock(&c->buds_lock);
1625
1626 if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
1627 ubifs_run_commit(c);
1628}
1629
1630/**
1631 * ubifs_remount_ro - re-mount in read-only mode. 1621 * ubifs_remount_ro - re-mount in read-only mode.
1632 * @c: UBIFS file-system description object 1622 * @c: UBIFS file-system description object
1633 * 1623 *
1634 * We rely on VFS to have stopped writing. Possibly the background thread could 1624 * We assume VFS has stopped writing. Possibly the background thread could be
1635 * be running a commit, however kthread_stop will wait in that case. 1625 * running a commit, however kthread_stop will wait in that case.
1636 */ 1626 */
1637static void ubifs_remount_ro(struct ubifs_info *c) 1627static void ubifs_remount_ro(struct ubifs_info *c)
1638{ 1628{
1639 int i, err; 1629 int i, err;
1640 1630
1641 ubifs_assert(!c->need_recovery); 1631 ubifs_assert(!c->need_recovery);
1642 commit_on_unmount(c); 1632 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
1643 1633
1644 mutex_lock(&c->umount_mutex); 1634 mutex_lock(&c->umount_mutex);
1645 if (c->bgt) { 1635 if (c->bgt) {
@@ -1647,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1647 c->bgt = NULL; 1637 c->bgt = NULL;
1648 } 1638 }
1649 1639
1640 dbg_save_space_info(c);
1641
1650 for (i = 0; i < c->jhead_cnt; i++) { 1642 for (i = 0; i < c->jhead_cnt; i++) {
1651 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1643 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1652 del_timer_sync(&c->jheads[i].wbuf.timer); 1644 del_timer_sync(&c->jheads[i].wbuf.timer);
1653 } 1645 }
1654 1646
1655 if (!c->ro_media) { 1647 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1656 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1648 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1657 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1649 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1658 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1650 err = ubifs_write_master(c);
1659 err = ubifs_write_master(c); 1651 if (err)
1660 if (err) 1652 ubifs_ro_mode(c, err);
1661 ubifs_ro_mode(c, err);
1662 }
1663 1653
1664 ubifs_destroy_idx_gc(c);
1665 free_wbufs(c); 1654 free_wbufs(c);
1666 vfree(c->orph_buf); 1655 vfree(c->orph_buf);
1667 c->orph_buf = NULL; 1656 c->orph_buf = NULL;
1668 vfree(c->ileb_buf); 1657 vfree(c->ileb_buf);
1669 c->ileb_buf = NULL; 1658 c->ileb_buf = NULL;
1670 ubifs_lpt_free(c, 1); 1659 ubifs_lpt_free(c, 1);
1660 err = dbg_check_space_info(c);
1661 if (err)
1662 ubifs_ro_mode(c, err);
1671 mutex_unlock(&c->umount_mutex); 1663 mutex_unlock(&c->umount_mutex);
1672} 1664}
1673 1665
@@ -1760,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1760 } 1752 }
1761 1753
1762 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1754 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1755 if (c->ro_media) {
1756 ubifs_msg("cannot re-mount due to prior errors");
1757 return -EROFS;
1758 }
1763 err = ubifs_remount_rw(c); 1759 err = ubifs_remount_rw(c);
1764 if (err) 1760 if (err)
1765 return err; 1761 return err;
1766 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) 1762 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1763 if (c->ro_media) {
1764 ubifs_msg("cannot re-mount due to prior errors");
1765 return -EROFS;
1766 }
1767 ubifs_remount_ro(c); 1767 ubifs_remount_ro(c);
1768 }
1768 1769
1769 if (c->bulk_read == 1) 1770 if (c->bulk_read == 1)
1770 bu_init(c); 1771 bu_init(c);
@@ -1774,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1774 c->bu.buf = NULL; 1775 c->bu.buf = NULL;
1775 } 1776 }
1776 1777
1778 ubifs_assert(c->lst.taken_empty_lebs == 1);
1777 return 0; 1779 return 0;
1778} 1780}
1779 1781
1780struct super_operations ubifs_super_operations = { 1782const struct super_operations ubifs_super_operations = {
1781 .alloc_inode = ubifs_alloc_inode, 1783 .alloc_inode = ubifs_alloc_inode,
1782 .destroy_inode = ubifs_destroy_inode, 1784 .destroy_inode = ubifs_destroy_inode,
1783 .put_super = ubifs_put_super, 1785 .put_super = ubifs_put_super,
@@ -2044,15 +2046,6 @@ out_close:
2044 2046
2045static void ubifs_kill_sb(struct super_block *sb) 2047static void ubifs_kill_sb(struct super_block *sb)
2046{ 2048{
2047 struct ubifs_info *c = sb->s_fs_info;
2048
2049 /*
2050 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
2051 * in order to be outside BKL.
2052 */
2053 if (sb->s_root)
2054 commit_on_unmount(c);
2055 /* The un-mount routine is actually done in put_super() */
2056 generic_shutdown_super(sb); 2049 generic_shutdown_super(sb);
2057} 2050}
2058 2051
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f7e36f545527..fa28a84c6a1b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
443 * This function performs that same function as ubifs_read_node except that 443 * This function performs that same function as ubifs_read_node except that
444 * it does not require that there is actually a node present and instead 444 * it does not require that there is actually a node present and instead
445 * the return code indicates if a node was read. 445 * the return code indicates if a node was read.
446 *
447 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
448 * is true (it is controlled by corresponding mount option). However, if
449 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
450 * checked.
446 */ 451 */
447static int try_read_node(const struct ubifs_info *c, void *buf, int type, 452static int try_read_node(const struct ubifs_info *c, void *buf, int type,
448 int len, int lnum, int offs) 453 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
470 if (node_len != len) 475 if (node_len != len)
471 return 0; 476 return 0;
472 477
473 if (type == UBIFS_DATA_NODE && !c->always_chk_crc) 478 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
474 if (c->no_chk_data_crc) 479 return 1;
475 return 0;
476 480
477 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 481 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
478 node_crc = le32_to_cpu(ch->crc); 482 node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
1506 * 1510 *
1507 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function 1511 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
1508 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares 1512 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
1509 * maxumum possible amount of nodes for bulk-read. 1513 * maximum possible amount of nodes for bulk-read.
1510 */ 1514 */
1511int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) 1515int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
1512{ 1516{
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2a4cc66d03..039a68bee29a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -426,9 +426,9 @@ struct ubifs_unclean_leb {
426 * LEB properties flags. 426 * LEB properties flags.
427 * 427 *
428 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
429 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
431 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
433 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken 434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
@@ -961,7 +961,6 @@ struct ubifs_debug_info;
961 * @cs_lock: commit state lock 961 * @cs_lock: commit state lock
962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
963 * 963 *
964 * @fast_unmount: do not run journal commit before un-mounting
965 * @big_lpt: flag that LPT is too big to write whole during commit 964 * @big_lpt: flag that LPT is too big to write whole during commit
966 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 965 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
967 * recovery) 966 * recovery)
@@ -1202,7 +1201,6 @@ struct ubifs_info {
1202 spinlock_t cs_lock; 1201 spinlock_t cs_lock;
1203 wait_queue_head_t cmt_wq; 1202 wait_queue_head_t cmt_wq;
1204 1203
1205 unsigned int fast_unmount:1;
1206 unsigned int big_lpt:1; 1204 unsigned int big_lpt:1;
1207 unsigned int no_chk_data_crc:1; 1205 unsigned int no_chk_data_crc:1;
1208 unsigned int bulk_read:1; 1206 unsigned int bulk_read:1;
@@ -1405,13 +1403,13 @@ extern struct list_head ubifs_infos;
1405extern spinlock_t ubifs_infos_lock; 1403extern spinlock_t ubifs_infos_lock;
1406extern atomic_long_t ubifs_clean_zn_cnt; 1404extern atomic_long_t ubifs_clean_zn_cnt;
1407extern struct kmem_cache *ubifs_inode_slab; 1405extern struct kmem_cache *ubifs_inode_slab;
1408extern struct super_operations ubifs_super_operations; 1406extern const struct super_operations ubifs_super_operations;
1409extern struct address_space_operations ubifs_file_address_operations; 1407extern const struct address_space_operations ubifs_file_address_operations;
1410extern struct file_operations ubifs_file_operations; 1408extern const struct file_operations ubifs_file_operations;
1411extern struct inode_operations ubifs_file_inode_operations; 1409extern const struct inode_operations ubifs_file_inode_operations;
1412extern struct file_operations ubifs_dir_operations; 1410extern const struct file_operations ubifs_dir_operations;
1413extern struct inode_operations ubifs_dir_inode_operations; 1411extern const struct inode_operations ubifs_dir_inode_operations;
1414extern struct inode_operations ubifs_symlink_inode_operations; 1412extern const struct inode_operations ubifs_symlink_inode_operations;
1415extern struct backing_dev_info ubifs_backing_dev_info; 1413extern struct backing_dev_info ubifs_backing_dev_info;
1416extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; 1414extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1417 1415
@@ -1428,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
1428int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, 1426int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
1429 int offs, int dtype); 1427 int offs, int dtype);
1430int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 1428int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
1431 int offs, int quiet, int chk_crc); 1429 int offs, int quiet, int must_chk_crc);
1432void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); 1430void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
1433void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); 1431void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
1434int ubifs_io_init(struct ubifs_info *c); 1432int ubifs_io_init(struct ubifs_info *c);
@@ -1495,6 +1493,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1495void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1493void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1496 struct ubifs_budget_req *req); 1494 struct ubifs_budget_req *req);
1497long long ubifs_get_free_space(struct ubifs_info *c); 1495long long ubifs_get_free_space(struct ubifs_info *c);
1496long long ubifs_get_free_space_nolock(struct ubifs_info *c);
1498int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1497int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1499void ubifs_convert_page_budget(struct ubifs_info *c); 1498void ubifs_convert_page_budget(struct ubifs_info *c);
1500long long ubifs_reported_space(const struct ubifs_info *c, long long free); 1499long long ubifs_reported_space(const struct ubifs_info *c, long long free);
@@ -1603,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
1603int ubifs_orphan_start_commit(struct ubifs_info *c); 1602int ubifs_orphan_start_commit(struct ubifs_info *c);
1604int ubifs_orphan_end_commit(struct ubifs_info *c); 1603int ubifs_orphan_end_commit(struct ubifs_info *c);
1605int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); 1604int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
1605int ubifs_clear_orphans(struct ubifs_info *c);
1606 1606
1607/* lpt.c */ 1607/* lpt.c */
1608int ubifs_calc_lpt_geom(struct ubifs_info *c); 1608int ubifs_calc_lpt_geom(struct ubifs_info *c);
@@ -1646,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
1646 const struct ubifs_lprops *lp, 1646 const struct ubifs_lprops *lp,
1647 int free, int dirty, int flags, 1647 int free, int dirty, int flags,
1648 int idx_gc_cnt); 1648 int idx_gc_cnt);
1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); 1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, 1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
1651 int cat); 1651 int cat);
1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, 1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
new file mode 100644
index 000000000000..0e0e99bd6bce
--- /dev/null
+++ b/fs/udf/Kconfig
@@ -0,0 +1,18 @@
1config UDF_FS
2 tristate "UDF file system support"
3 select CRC_ITU_T
4 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if
6 you intend to mount DVD discs or CDRW's written in packet mode, or
7 if written to by other UDF utilities, such as DirectCD.
8 Please read <file:Documentation/filesystems/udf.txt>.
9
10 To compile this file system support as a module, choose M here: the
11 module will be called udf.
12
13 If unsure, say N.
14
15config UDF_NLS
16 bool
17 default y
18 depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
new file mode 100644
index 000000000000..e4f10a40768a
--- /dev/null
+++ b/fs/ufs/Kconfig
@@ -0,0 +1,43 @@
1config UFS_FS
2 tristate "UFS file system support (read only)"
3 depends on BLOCK
4 help
5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
7 Unixes can create and mount hard disk partitions and diskettes using
8 this file system as well. Saying Y here will allow you to read from
9 these partitions; if you also want to write to them, say Y to the
10 experimental "UFS file system write support", below. Please read the
11 file <file:Documentation/filesystems/ufs.txt> for more information.
12
13 The recently released UFS2 variant (used in FreeBSD 5.x) is
14 READ-ONLY supported.
15
16 Note that this option is generally not needed for floppies, since a
17 good portable way to transport files and directories between unixes
18 (and even other operating systems) is given by the tar program ("man
19 tar" or preferably "info tar").
20
21 When accessing NeXTstep files, you may need to convert them from the
22 NeXT character set to the Latin1 character set; use the program
23 recode ("info recode") for this purpose.
24
25 To compile the UFS file system support as a module, choose M here: the
26 module will be called ufs.
27
28 If you haven't heard about all of this before, it's safe to say N.
29
30config UFS_FS_WRITE
31 bool "UFS file system write support (DANGEROUS)"
32 depends on UFS_FS && EXPERIMENTAL
33 help
34 Say Y here if you want to try writing to UFS partitions. This is
35 experimental, so you should back up your UFS partitions beforehand.
36
37config UFS_DEBUG
38 bool "UFS debugging"
39 depends on UFS_FS
40 help
41 If you are experiencing any problems with the UFS filesystem, say
42 Y here. This will result in _many_ additional debugging messages to be
43 written to the system log.
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e65212dfb60e..261a1c2f22dd 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -41,7 +41,7 @@
41 * Stefan Reinauer <stepan@home.culture.mipt.ru> 41 * Stefan Reinauer <stepan@home.culture.mipt.ru>
42 * 42 *
43 * Module usage counts added on 96/04/29 by 43 * Module usage counts added on 96/04/29 by
44 * Gertjan van Wingerde <gertjan@cs.vu.nl> 44 * Gertjan van Wingerde <gwingerde@gmail.com>
45 * 45 *
46 * Clean swab support on 19970406 by 46 * Clean swab support on 19970406 by
47 * Francois-Rene Rideau <fare@tunes.org> 47 * Francois-Rene Rideau <fare@tunes.org>
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 3f53dd101f99..29228f5899cd 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
1config XFS_FS 1config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select EXPORTFS
4 help 5 help
5 XFS is a high performance journaling filesystem which originated 6 XFS is a high performance journaling filesystem which originated
6 on the SGI IRIX platform. It is completely multi-threaded, can 7 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44e21ed..aa1016bb9134 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -34,6 +34,12 @@
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_sb.h"
38#include "xfs_inum.h"
39#include "xfs_ag.h"
40#include "xfs_dmapi.h"
41#include "xfs_mount.h"
42
37static kmem_zone_t *xfs_buf_zone; 43static kmem_zone_t *xfs_buf_zone;
38STATIC int xfsbufd(void *); 44STATIC int xfsbufd(void *);
39STATIC int xfsbufd_wakeup(int, gfp_t); 45STATIC int xfsbufd_wakeup(int, gfp_t);
@@ -166,6 +172,75 @@ test_page_region(
166} 172}
167 173
168/* 174/*
175 * Mapping of multi-page buffers into contiguous virtual space
176 */
177
178typedef struct a_list {
179 void *vm_addr;
180 struct a_list *next;
181} a_list_t;
182
183static a_list_t *as_free_head;
184static int as_list_len;
185static DEFINE_SPINLOCK(as_lock);
186
187/*
188 * Try to batch vunmaps because they are costly.
189 */
190STATIC void
191free_address(
192 void *addr)
193{
194 a_list_t *aentry;
195
196#ifdef CONFIG_XEN
197 /*
198 * Xen needs to be able to make sure it can get an exclusive
199 * RO mapping of pages it wants to turn into a pagetable. If
200 * a newly allocated page is also still being vmap()ed by xfs,
201 * it will cause pagetable construction to fail. This is a
202 * quick workaround to always eagerly unmap pages so that Xen
203 * is happy.
204 */
205 vunmap(addr);
206 return;
207#endif
208
209 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
210 if (likely(aentry)) {
211 spin_lock(&as_lock);
212 aentry->next = as_free_head;
213 aentry->vm_addr = addr;
214 as_free_head = aentry;
215 as_list_len++;
216 spin_unlock(&as_lock);
217 } else {
218 vunmap(addr);
219 }
220}
221
222STATIC void
223purge_addresses(void)
224{
225 a_list_t *aentry, *old;
226
227 if (as_free_head == NULL)
228 return;
229
230 spin_lock(&as_lock);
231 aentry = as_free_head;
232 as_free_head = NULL;
233 as_list_len = 0;
234 spin_unlock(&as_lock);
235
236 while ((old = aentry) != NULL) {
237 vunmap(aentry->vm_addr);
238 aentry = aentry->next;
239 kfree(old);
240 }
241}
242
243/*
169 * Internal xfs_buf_t object manipulation 244 * Internal xfs_buf_t object manipulation
170 */ 245 */
171 246
@@ -264,7 +339,7 @@ xfs_buf_free(
264 uint i; 339 uint i;
265 340
266 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 341 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
267 vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); 342 free_address(bp->b_addr - bp->b_offset);
268 343
269 for (i = 0; i < bp->b_page_count; i++) { 344 for (i = 0; i < bp->b_page_count; i++) {
270 struct page *page = bp->b_pages[i]; 345 struct page *page = bp->b_pages[i];
@@ -386,8 +461,10 @@ _xfs_buf_map_pages(
386 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 461 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
387 bp->b_flags |= XBF_MAPPED; 462 bp->b_flags |= XBF_MAPPED;
388 } else if (flags & XBF_MAPPED) { 463 } else if (flags & XBF_MAPPED) {
389 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 464 if (as_list_len > 64)
390 -1, PAGE_KERNEL); 465 purge_addresses();
466 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
467 VM_MAP, PAGE_KERNEL);
391 if (unlikely(bp->b_addr == NULL)) 468 if (unlikely(bp->b_addr == NULL))
392 return -ENOMEM; 469 return -ENOMEM;
393 bp->b_addr += bp->b_offset; 470 bp->b_addr += bp->b_offset;
@@ -1364,10 +1441,12 @@ xfs_unregister_buftarg(
1364 1441
1365void 1442void
1366xfs_free_buftarg( 1443xfs_free_buftarg(
1367 xfs_buftarg_t *btp) 1444 struct xfs_mount *mp,
1445 struct xfs_buftarg *btp)
1368{ 1446{
1369 xfs_flush_buftarg(btp, 1); 1447 xfs_flush_buftarg(btp, 1);
1370 xfs_blkdev_issue_flush(btp); 1448 if (mp->m_flags & XFS_MOUNT_BARRIER)
1449 xfs_blkdev_issue_flush(btp);
1371 xfs_free_bufhash(btp); 1450 xfs_free_bufhash(btp);
1372 iput(btp->bt_mapping->host); 1451 iput(btp->bt_mapping->host);
1373 1452
@@ -1672,6 +1751,8 @@ xfsbufd(
1672 count++; 1751 count++;
1673 } 1752 }
1674 1753
1754 if (as_list_len > 0)
1755 purge_addresses();
1675 if (count) 1756 if (count)
1676 blk_run_address_space(target->bt_mapping); 1757 blk_run_address_space(target->bt_mapping);
1677 1758
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 288ae7c4c800..9b4d666ad31f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
413 * Handling of buftargs. 413 * Handling of buftargs.
414 */ 414 */
415extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 415extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
416extern void xfs_free_buftarg(xfs_buftarg_t *); 416extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
417extern void xfs_wait_buftarg(xfs_buftarg_t *); 417extern void xfs_wait_buftarg(xfs_buftarg_t *);
418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
419extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 419extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index e5be1e0be802..4bd112313f33 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -50,12 +50,14 @@
50#include "xfs_vnodeops.h" 50#include "xfs_vnodeops.h"
51#include "xfs_quota.h" 51#include "xfs_quota.h"
52#include "xfs_inode_item.h" 52#include "xfs_inode_item.h"
53#include "xfs_export.h"
53 54
54#include <linux/capability.h> 55#include <linux/capability.h>
55#include <linux/dcache.h> 56#include <linux/dcache.h>
56#include <linux/mount.h> 57#include <linux/mount.h>
57#include <linux/namei.h> 58#include <linux/namei.h>
58#include <linux/pagemap.h> 59#include <linux/pagemap.h>
60#include <linux/exportfs.h>
59 61
60/* 62/*
61 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to 63 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -164,97 +166,69 @@ xfs_find_handle(
164 return 0; 166 return 0;
165} 167}
166 168
167
168/* 169/*
169 * Convert userspace handle data into inode. 170 * No need to do permission checks on the various pathname components
170 * 171 * as the handle operations are privileged.
171 * We use the fact that all the fsop_handlereq ioctl calls have a data
172 * structure argument whose first component is always a xfs_fsop_handlereq_t,
173 * so we can pass that sub structure into this handy, shared routine.
174 *
175 * If no error, caller must always iput the returned inode.
176 */ 172 */
177STATIC int 173STATIC int
178xfs_vget_fsop_handlereq( 174xfs_handle_acceptable(
179 xfs_mount_t *mp, 175 void *context,
180 struct inode *parinode, /* parent inode pointer */ 176 struct dentry *dentry)
181 xfs_fsop_handlereq_t *hreq, 177{
182 struct inode **inode) 178 return 1;
179}
180
181/*
182 * Convert userspace handle data into a dentry.
183 */
184struct dentry *
185xfs_handle_to_dentry(
186 struct file *parfilp,
187 void __user *uhandle,
188 u32 hlen)
183{ 189{
184 void __user *hanp;
185 size_t hlen;
186 xfs_fid_t *xfid;
187 xfs_handle_t *handlep;
188 xfs_handle_t handle; 190 xfs_handle_t handle;
189 xfs_inode_t *ip; 191 struct xfs_fid64 fid;
190 xfs_ino_t ino;
191 __u32 igen;
192 int error;
193 192
194 /* 193 /*
195 * Only allow handle opens under a directory. 194 * Only allow handle opens under a directory.
196 */ 195 */
197 if (!S_ISDIR(parinode->i_mode)) 196 if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
198 return XFS_ERROR(ENOTDIR); 197 return ERR_PTR(-ENOTDIR);
199 198
200 hanp = hreq->ihandle; 199 if (hlen != sizeof(xfs_handle_t))
201 hlen = hreq->ihandlen; 200 return ERR_PTR(-EINVAL);
202 handlep = &handle; 201 if (copy_from_user(&handle, uhandle, hlen))
203 202 return ERR_PTR(-EFAULT);
204 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) 203 if (handle.ha_fid.fid_len !=
205 return XFS_ERROR(EINVAL); 204 sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
206 if (copy_from_user(handlep, hanp, hlen)) 205 return ERR_PTR(-EINVAL);
207 return XFS_ERROR(EFAULT); 206
208 if (hlen < sizeof(*handlep)) 207 memset(&fid, 0, sizeof(struct fid));
209 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); 208 fid.ino = handle.ha_fid.fid_ino;
210 if (hlen > sizeof(handlep->ha_fsid)) { 209 fid.gen = handle.ha_fid.fid_gen;
211 if (handlep->ha_fid.fid_len != 210
212 (hlen - sizeof(handlep->ha_fsid) - 211 return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
213 sizeof(handlep->ha_fid.fid_len)) || 212 FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
214 handlep->ha_fid.fid_pad) 213 xfs_handle_acceptable, NULL);
215 return XFS_ERROR(EINVAL); 214}
216 }
217
218 /*
219 * Crack the handle, obtain the inode # & generation #
220 */
221 xfid = (struct xfs_fid *)&handlep->ha_fid;
222 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
223 ino = xfid->fid_ino;
224 igen = xfid->fid_gen;
225 } else {
226 return XFS_ERROR(EINVAL);
227 }
228
229 /*
230 * Get the XFS inode, building a Linux inode to go with it.
231 */
232 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
233 if (error)
234 return error;
235 if (ip == NULL)
236 return XFS_ERROR(EIO);
237 if (ip->i_d.di_gen != igen) {
238 xfs_iput_new(ip, XFS_ILOCK_SHARED);
239 return XFS_ERROR(ENOENT);
240 }
241
242 xfs_iunlock(ip, XFS_ILOCK_SHARED);
243 215
244 *inode = VFS_I(ip); 216STATIC struct dentry *
245 return 0; 217xfs_handlereq_to_dentry(
218 struct file *parfilp,
219 xfs_fsop_handlereq_t *hreq)
220{
221 return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
246} 222}
247 223
248int 224int
249xfs_open_by_handle( 225xfs_open_by_handle(
250 xfs_mount_t *mp,
251 xfs_fsop_handlereq_t *hreq,
252 struct file *parfilp, 226 struct file *parfilp,
253 struct inode *parinode) 227 xfs_fsop_handlereq_t *hreq)
254{ 228{
255 const struct cred *cred = current_cred(); 229 const struct cred *cred = current_cred();
256 int error; 230 int error;
257 int new_fd; 231 int fd;
258 int permflag; 232 int permflag;
259 struct file *filp; 233 struct file *filp;
260 struct inode *inode; 234 struct inode *inode;
@@ -263,19 +237,21 @@ xfs_open_by_handle(
263 if (!capable(CAP_SYS_ADMIN)) 237 if (!capable(CAP_SYS_ADMIN))
264 return -XFS_ERROR(EPERM); 238 return -XFS_ERROR(EPERM);
265 239
266 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); 240 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
267 if (error) 241 if (IS_ERR(dentry))
268 return -error; 242 return PTR_ERR(dentry);
243 inode = dentry->d_inode;
269 244
270 /* Restrict xfs_open_by_handle to directories & regular files. */ 245 /* Restrict xfs_open_by_handle to directories & regular files. */
271 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { 246 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
272 iput(inode); 247 error = -XFS_ERROR(EPERM);
273 return -XFS_ERROR(EINVAL); 248 goto out_dput;
274 } 249 }
275 250
276#if BITS_PER_LONG != 32 251#if BITS_PER_LONG != 32
277 hreq->oflags |= O_LARGEFILE; 252 hreq->oflags |= O_LARGEFILE;
278#endif 253#endif
254
279 /* Put open permission in namei format. */ 255 /* Put open permission in namei format. */
280 permflag = hreq->oflags; 256 permflag = hreq->oflags;
281 if ((permflag+1) & O_ACCMODE) 257 if ((permflag+1) & O_ACCMODE)
@@ -285,50 +261,45 @@ xfs_open_by_handle(
285 261
286 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && 262 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
287 (permflag & FMODE_WRITE) && IS_APPEND(inode)) { 263 (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
288 iput(inode); 264 error = -XFS_ERROR(EPERM);
289 return -XFS_ERROR(EPERM); 265 goto out_dput;
290 } 266 }
291 267
292 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { 268 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
293 iput(inode); 269 error = -XFS_ERROR(EACCES);
294 return -XFS_ERROR(EACCES); 270 goto out_dput;
295 } 271 }
296 272
297 /* Can't write directories. */ 273 /* Can't write directories. */
298 if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { 274 if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
299 iput(inode); 275 error = -XFS_ERROR(EISDIR);
300 return -XFS_ERROR(EISDIR); 276 goto out_dput;
301 } 277 }
302 278
303 if ((new_fd = get_unused_fd()) < 0) { 279 fd = get_unused_fd();
304 iput(inode); 280 if (fd < 0) {
305 return new_fd; 281 error = fd;
282 goto out_dput;
306 } 283 }
307 284
308 dentry = d_obtain_alias(inode); 285 filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
309 if (IS_ERR(dentry)) { 286 hreq->oflags, cred);
310 put_unused_fd(new_fd);
311 return PTR_ERR(dentry);
312 }
313
314 /* Ensure umount returns EBUSY on umounts while this file is open. */
315 mntget(parfilp->f_path.mnt);
316
317 /* Create file pointer. */
318 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
319 if (IS_ERR(filp)) { 287 if (IS_ERR(filp)) {
320 put_unused_fd(new_fd); 288 put_unused_fd(fd);
321 return -XFS_ERROR(-PTR_ERR(filp)); 289 return PTR_ERR(filp);
322 } 290 }
323 291
324 if (inode->i_mode & S_IFREG) { 292 if (inode->i_mode & S_IFREG) {
325 /* invisible operation should not change atime */
326 filp->f_flags |= O_NOATIME; 293 filp->f_flags |= O_NOATIME;
327 filp->f_mode |= FMODE_NOCMTIME; 294 filp->f_mode |= FMODE_NOCMTIME;
328 } 295 }
329 296
330 fd_install(new_fd, filp); 297 fd_install(fd, filp);
331 return new_fd; 298 return fd;
299
300 out_dput:
301 dput(dentry);
302 return error;
332} 303}
333 304
334/* 305/*
@@ -359,11 +330,10 @@ do_readlink(
359 330
360int 331int
361xfs_readlink_by_handle( 332xfs_readlink_by_handle(
362 xfs_mount_t *mp, 333 struct file *parfilp,
363 xfs_fsop_handlereq_t *hreq, 334 xfs_fsop_handlereq_t *hreq)
364 struct inode *parinode)
365{ 335{
366 struct inode *inode; 336 struct dentry *dentry;
367 __u32 olen; 337 __u32 olen;
368 void *link; 338 void *link;
369 int error; 339 int error;
@@ -371,26 +341,28 @@ xfs_readlink_by_handle(
371 if (!capable(CAP_SYS_ADMIN)) 341 if (!capable(CAP_SYS_ADMIN))
372 return -XFS_ERROR(EPERM); 342 return -XFS_ERROR(EPERM);
373 343
374 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); 344 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
375 if (error) 345 if (IS_ERR(dentry))
376 return -error; 346 return PTR_ERR(dentry);
377 347
378 /* Restrict this handle operation to symlinks only. */ 348 /* Restrict this handle operation to symlinks only. */
379 if (!S_ISLNK(inode->i_mode)) { 349 if (!S_ISLNK(dentry->d_inode->i_mode)) {
380 error = -XFS_ERROR(EINVAL); 350 error = -XFS_ERROR(EINVAL);
381 goto out_iput; 351 goto out_dput;
382 } 352 }
383 353
384 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { 354 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
385 error = -XFS_ERROR(EFAULT); 355 error = -XFS_ERROR(EFAULT);
386 goto out_iput; 356 goto out_dput;
387 } 357 }
388 358
389 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); 359 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
390 if (!link) 360 if (!link) {
391 goto out_iput; 361 error = -XFS_ERROR(ENOMEM);
362 goto out_dput;
363 }
392 364
393 error = -xfs_readlink(XFS_I(inode), link); 365 error = -xfs_readlink(XFS_I(dentry->d_inode), link);
394 if (error) 366 if (error)
395 goto out_kfree; 367 goto out_kfree;
396 error = do_readlink(hreq->ohandle, olen, link); 368 error = do_readlink(hreq->ohandle, olen, link);
@@ -399,32 +371,31 @@ xfs_readlink_by_handle(
399 371
400 out_kfree: 372 out_kfree:
401 kfree(link); 373 kfree(link);
402 out_iput: 374 out_dput:
403 iput(inode); 375 dput(dentry);
404 return error; 376 return error;
405} 377}
406 378
407STATIC int 379STATIC int
408xfs_fssetdm_by_handle( 380xfs_fssetdm_by_handle(
409 xfs_mount_t *mp, 381 struct file *parfilp,
410 void __user *arg, 382 void __user *arg)
411 struct inode *parinode)
412{ 383{
413 int error; 384 int error;
414 struct fsdmidata fsd; 385 struct fsdmidata fsd;
415 xfs_fsop_setdm_handlereq_t dmhreq; 386 xfs_fsop_setdm_handlereq_t dmhreq;
416 struct inode *inode; 387 struct dentry *dentry;
417 388
418 if (!capable(CAP_MKNOD)) 389 if (!capable(CAP_MKNOD))
419 return -XFS_ERROR(EPERM); 390 return -XFS_ERROR(EPERM);
420 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 391 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
421 return -XFS_ERROR(EFAULT); 392 return -XFS_ERROR(EFAULT);
422 393
423 error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode); 394 dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
424 if (error) 395 if (IS_ERR(dentry))
425 return -error; 396 return PTR_ERR(dentry);
426 397
427 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { 398 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
428 error = -XFS_ERROR(EPERM); 399 error = -XFS_ERROR(EPERM);
429 goto out; 400 goto out;
430 } 401 }
@@ -434,24 +405,23 @@ xfs_fssetdm_by_handle(
434 goto out; 405 goto out;
435 } 406 }
436 407
437 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, 408 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
438 fsd.fsd_dmstate); 409 fsd.fsd_dmstate);
439 410
440 out: 411 out:
441 iput(inode); 412 dput(dentry);
442 return error; 413 return error;
443} 414}
444 415
445STATIC int 416STATIC int
446xfs_attrlist_by_handle( 417xfs_attrlist_by_handle(
447 xfs_mount_t *mp, 418 struct file *parfilp,
448 void __user *arg, 419 void __user *arg)
449 struct inode *parinode)
450{ 420{
451 int error; 421 int error = -ENOMEM;
452 attrlist_cursor_kern_t *cursor; 422 attrlist_cursor_kern_t *cursor;
453 xfs_fsop_attrlist_handlereq_t al_hreq; 423 xfs_fsop_attrlist_handlereq_t al_hreq;
454 struct inode *inode; 424 struct dentry *dentry;
455 char *kbuf; 425 char *kbuf;
456 426
457 if (!capable(CAP_SYS_ADMIN)) 427 if (!capable(CAP_SYS_ADMIN))
@@ -467,16 +437,16 @@ xfs_attrlist_by_handle(
467 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 437 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
468 return -XFS_ERROR(EINVAL); 438 return -XFS_ERROR(EINVAL);
469 439
470 error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); 440 dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
471 if (error) 441 if (IS_ERR(dentry))
472 goto out; 442 return PTR_ERR(dentry);
473 443
474 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 444 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
475 if (!kbuf) 445 if (!kbuf)
476 goto out_vn_rele; 446 goto out_dput;
477 447
478 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 448 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
479 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, 449 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
480 al_hreq.flags, cursor); 450 al_hreq.flags, cursor);
481 if (error) 451 if (error)
482 goto out_kfree; 452 goto out_kfree;
@@ -486,10 +456,9 @@ xfs_attrlist_by_handle(
486 456
487 out_kfree: 457 out_kfree:
488 kfree(kbuf); 458 kfree(kbuf);
489 out_vn_rele: 459 out_dput:
490 iput(inode); 460 dput(dentry);
491 out: 461 return error;
492 return -error;
493} 462}
494 463
495int 464int
@@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove(
564 533
565STATIC int 534STATIC int
566xfs_attrmulti_by_handle( 535xfs_attrmulti_by_handle(
567 xfs_mount_t *mp,
568 void __user *arg,
569 struct file *parfilp, 536 struct file *parfilp,
570 struct inode *parinode) 537 void __user *arg)
571{ 538{
572 int error; 539 int error;
573 xfs_attr_multiop_t *ops; 540 xfs_attr_multiop_t *ops;
574 xfs_fsop_attrmulti_handlereq_t am_hreq; 541 xfs_fsop_attrmulti_handlereq_t am_hreq;
575 struct inode *inode; 542 struct dentry *dentry;
576 unsigned int i, size; 543 unsigned int i, size;
577 char *attr_name; 544 char *attr_name;
578 545
@@ -581,19 +548,19 @@ xfs_attrmulti_by_handle(
581 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 548 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
582 return -XFS_ERROR(EFAULT); 549 return -XFS_ERROR(EFAULT);
583 550
584 error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode); 551 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
585 if (error) 552 if (IS_ERR(dentry))
586 goto out; 553 return PTR_ERR(dentry);
587 554
588 error = E2BIG; 555 error = E2BIG;
589 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); 556 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
590 if (!size || size > 16 * PAGE_SIZE) 557 if (!size || size > 16 * PAGE_SIZE)
591 goto out_vn_rele; 558 goto out_dput;
592 559
593 error = ENOMEM; 560 error = ENOMEM;
594 ops = kmalloc(size, GFP_KERNEL); 561 ops = kmalloc(size, GFP_KERNEL);
595 if (!ops) 562 if (!ops)
596 goto out_vn_rele; 563 goto out_dput;
597 564
598 error = EFAULT; 565 error = EFAULT;
599 if (copy_from_user(ops, am_hreq.ops, size)) 566 if (copy_from_user(ops, am_hreq.ops, size))
@@ -615,25 +582,28 @@ xfs_attrmulti_by_handle(
615 582
616 switch (ops[i].am_opcode) { 583 switch (ops[i].am_opcode) {
617 case ATTR_OP_GET: 584 case ATTR_OP_GET:
618 ops[i].am_error = xfs_attrmulti_attr_get(inode, 585 ops[i].am_error = xfs_attrmulti_attr_get(
619 attr_name, ops[i].am_attrvalue, 586 dentry->d_inode, attr_name,
620 &ops[i].am_length, ops[i].am_flags); 587 ops[i].am_attrvalue, &ops[i].am_length,
588 ops[i].am_flags);
621 break; 589 break;
622 case ATTR_OP_SET: 590 case ATTR_OP_SET:
623 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); 591 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
624 if (ops[i].am_error) 592 if (ops[i].am_error)
625 break; 593 break;
626 ops[i].am_error = xfs_attrmulti_attr_set(inode, 594 ops[i].am_error = xfs_attrmulti_attr_set(
627 attr_name, ops[i].am_attrvalue, 595 dentry->d_inode, attr_name,
628 ops[i].am_length, ops[i].am_flags); 596 ops[i].am_attrvalue, ops[i].am_length,
597 ops[i].am_flags);
629 mnt_drop_write(parfilp->f_path.mnt); 598 mnt_drop_write(parfilp->f_path.mnt);
630 break; 599 break;
631 case ATTR_OP_REMOVE: 600 case ATTR_OP_REMOVE:
632 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); 601 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
633 if (ops[i].am_error) 602 if (ops[i].am_error)
634 break; 603 break;
635 ops[i].am_error = xfs_attrmulti_attr_remove(inode, 604 ops[i].am_error = xfs_attrmulti_attr_remove(
636 attr_name, ops[i].am_flags); 605 dentry->d_inode, attr_name,
606 ops[i].am_flags);
637 mnt_drop_write(parfilp->f_path.mnt); 607 mnt_drop_write(parfilp->f_path.mnt);
638 break; 608 break;
639 default: 609 default:
@@ -647,9 +617,8 @@ xfs_attrmulti_by_handle(
647 kfree(attr_name); 617 kfree(attr_name);
648 out_kfree_ops: 618 out_kfree_ops:
649 kfree(ops); 619 kfree(ops);
650 out_vn_rele: 620 out_dput:
651 iput(inode); 621 dput(dentry);
652 out:
653 return -error; 622 return -error;
654} 623}
655 624
@@ -1440,23 +1409,23 @@ xfs_file_ioctl(
1440 1409
1441 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) 1410 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1442 return -XFS_ERROR(EFAULT); 1411 return -XFS_ERROR(EFAULT);
1443 return xfs_open_by_handle(mp, &hreq, filp, inode); 1412 return xfs_open_by_handle(filp, &hreq);
1444 } 1413 }
1445 case XFS_IOC_FSSETDM_BY_HANDLE: 1414 case XFS_IOC_FSSETDM_BY_HANDLE:
1446 return xfs_fssetdm_by_handle(mp, arg, inode); 1415 return xfs_fssetdm_by_handle(filp, arg);
1447 1416
1448 case XFS_IOC_READLINK_BY_HANDLE: { 1417 case XFS_IOC_READLINK_BY_HANDLE: {
1449 xfs_fsop_handlereq_t hreq; 1418 xfs_fsop_handlereq_t hreq;
1450 1419
1451 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) 1420 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1452 return -XFS_ERROR(EFAULT); 1421 return -XFS_ERROR(EFAULT);
1453 return xfs_readlink_by_handle(mp, &hreq, inode); 1422 return xfs_readlink_by_handle(filp, &hreq);
1454 } 1423 }
1455 case XFS_IOC_ATTRLIST_BY_HANDLE: 1424 case XFS_IOC_ATTRLIST_BY_HANDLE:
1456 return xfs_attrlist_by_handle(mp, arg, inode); 1425 return xfs_attrlist_by_handle(filp, arg);
1457 1426
1458 case XFS_IOC_ATTRMULTI_BY_HANDLE: 1427 case XFS_IOC_ATTRMULTI_BY_HANDLE:
1459 return xfs_attrmulti_by_handle(mp, arg, filp, inode); 1428 return xfs_attrmulti_by_handle(filp, arg);
1460 1429
1461 case XFS_IOC_SWAPEXT: { 1430 case XFS_IOC_SWAPEXT: {
1462 struct xfs_swapext sxp; 1431 struct xfs_swapext sxp;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 8c16bf2d7e03..7bd7c6afc1eb 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -34,16 +34,13 @@ xfs_find_handle(
34 34
35extern int 35extern int
36xfs_open_by_handle( 36xfs_open_by_handle(
37 xfs_mount_t *mp,
38 xfs_fsop_handlereq_t *hreq,
39 struct file *parfilp, 37 struct file *parfilp,
40 struct inode *parinode); 38 xfs_fsop_handlereq_t *hreq);
41 39
42extern int 40extern int
43xfs_readlink_by_handle( 41xfs_readlink_by_handle(
44 xfs_mount_t *mp, 42 struct file *parfilp,
45 xfs_fsop_handlereq_t *hreq, 43 xfs_fsop_handlereq_t *hreq);
46 struct inode *parinode);
47 44
48extern int 45extern int
49xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
@@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove(
67 char *name, 64 char *name,
68 __uint32_t flags); 65 __uint32_t flags);
69 66
67extern struct dentry *
68xfs_handle_to_dentry(
69 struct file *parfilp,
70 void __user *uhandle,
71 u32 hlen);
72
70extern long 73extern long
71xfs_file_ioctl( 74xfs_file_ioctl(
72 struct file *filp, 75 struct file *filp,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 50903ad3182e..c70c4e3db790 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "xfs.h" 22#include "xfs.h"
22#include "xfs_fs.h" 23#include "xfs_fs.h"
@@ -340,96 +341,24 @@ xfs_compat_handlereq_copyin(
340 return 0; 341 return 0;
341} 342}
342 343
343/* 344STATIC struct dentry *
344 * Convert userspace handle data into inode. 345xfs_compat_handlereq_to_dentry(
345 * 346 struct file *parfilp,
346 * We use the fact that all the fsop_handlereq ioctl calls have a data 347 compat_xfs_fsop_handlereq_t *hreq)
347 * structure argument whose first component is always a xfs_fsop_handlereq_t,
348 * so we can pass that sub structure into this handy, shared routine.
349 *
350 * If no error, caller must always iput the returned inode.
351 */
352STATIC int
353xfs_vget_fsop_handlereq_compat(
354 xfs_mount_t *mp,
355 struct inode *parinode, /* parent inode pointer */
356 compat_xfs_fsop_handlereq_t *hreq,
357 struct inode **inode)
358{ 348{
359 void __user *hanp; 349 return xfs_handle_to_dentry(parfilp,
360 size_t hlen; 350 compat_ptr(hreq->ihandle), hreq->ihandlen);
361 xfs_fid_t *xfid;
362 xfs_handle_t *handlep;
363 xfs_handle_t handle;
364 xfs_inode_t *ip;
365 xfs_ino_t ino;
366 __u32 igen;
367 int error;
368
369 /*
370 * Only allow handle opens under a directory.
371 */
372 if (!S_ISDIR(parinode->i_mode))
373 return XFS_ERROR(ENOTDIR);
374
375 hanp = compat_ptr(hreq->ihandle);
376 hlen = hreq->ihandlen;
377 handlep = &handle;
378
379 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
380 return XFS_ERROR(EINVAL);
381 if (copy_from_user(handlep, hanp, hlen))
382 return XFS_ERROR(EFAULT);
383 if (hlen < sizeof(*handlep))
384 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
385 if (hlen > sizeof(handlep->ha_fsid)) {
386 if (handlep->ha_fid.fid_len !=
387 (hlen - sizeof(handlep->ha_fsid) -
388 sizeof(handlep->ha_fid.fid_len)) ||
389 handlep->ha_fid.fid_pad)
390 return XFS_ERROR(EINVAL);
391 }
392
393 /*
394 * Crack the handle, obtain the inode # & generation #
395 */
396 xfid = (struct xfs_fid *)&handlep->ha_fid;
397 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
398 ino = xfid->fid_ino;
399 igen = xfid->fid_gen;
400 } else {
401 return XFS_ERROR(EINVAL);
402 }
403
404 /*
405 * Get the XFS inode, building a Linux inode to go with it.
406 */
407 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
408 if (error)
409 return error;
410 if (ip == NULL)
411 return XFS_ERROR(EIO);
412 if (ip->i_d.di_gen != igen) {
413 xfs_iput_new(ip, XFS_ILOCK_SHARED);
414 return XFS_ERROR(ENOENT);
415 }
416
417 xfs_iunlock(ip, XFS_ILOCK_SHARED);
418
419 *inode = VFS_I(ip);
420 return 0;
421} 351}
422 352
423STATIC int 353STATIC int
424xfs_compat_attrlist_by_handle( 354xfs_compat_attrlist_by_handle(
425 xfs_mount_t *mp, 355 struct file *parfilp,
426 void __user *arg, 356 void __user *arg)
427 struct inode *parinode)
428{ 357{
429 int error; 358 int error;
430 attrlist_cursor_kern_t *cursor; 359 attrlist_cursor_kern_t *cursor;
431 compat_xfs_fsop_attrlist_handlereq_t al_hreq; 360 compat_xfs_fsop_attrlist_handlereq_t al_hreq;
432 struct inode *inode; 361 struct dentry *dentry;
433 char *kbuf; 362 char *kbuf;
434 363
435 if (!capable(CAP_SYS_ADMIN)) 364 if (!capable(CAP_SYS_ADMIN))
@@ -446,17 +375,17 @@ xfs_compat_attrlist_by_handle(
446 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 375 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
447 return -XFS_ERROR(EINVAL); 376 return -XFS_ERROR(EINVAL);
448 377
449 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq, 378 dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
450 &inode); 379 if (IS_ERR(dentry))
451 if (error) 380 return PTR_ERR(dentry);
452 goto out;
453 381
382 error = -ENOMEM;
454 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 383 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
455 if (!kbuf) 384 if (!kbuf)
456 goto out_vn_rele; 385 goto out_dput;
457 386
458 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 387 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
459 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, 388 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
460 al_hreq.flags, cursor); 389 al_hreq.flags, cursor);
461 if (error) 390 if (error)
462 goto out_kfree; 391 goto out_kfree;
@@ -466,22 +395,20 @@ xfs_compat_attrlist_by_handle(
466 395
467 out_kfree: 396 out_kfree:
468 kfree(kbuf); 397 kfree(kbuf);
469 out_vn_rele: 398 out_dput:
470 iput(inode); 399 dput(dentry);
471 out: 400 return error;
472 return -error;
473} 401}
474 402
475STATIC int 403STATIC int
476xfs_compat_attrmulti_by_handle( 404xfs_compat_attrmulti_by_handle(
477 xfs_mount_t *mp, 405 struct file *parfilp,
478 void __user *arg, 406 void __user *arg)
479 struct inode *parinode)
480{ 407{
481 int error; 408 int error;
482 compat_xfs_attr_multiop_t *ops; 409 compat_xfs_attr_multiop_t *ops;
483 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 410 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
484 struct inode *inode; 411 struct dentry *dentry;
485 unsigned int i, size; 412 unsigned int i, size;
486 char *attr_name; 413 char *attr_name;
487 414
@@ -491,20 +418,19 @@ xfs_compat_attrmulti_by_handle(
491 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 418 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
492 return -XFS_ERROR(EFAULT); 419 return -XFS_ERROR(EFAULT);
493 420
494 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq, 421 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
495 &inode); 422 if (IS_ERR(dentry))
496 if (error) 423 return PTR_ERR(dentry);
497 goto out;
498 424
499 error = E2BIG; 425 error = E2BIG;
500 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); 426 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
501 if (!size || size > 16 * PAGE_SIZE) 427 if (!size || size > 16 * PAGE_SIZE)
502 goto out_vn_rele; 428 goto out_dput;
503 429
504 error = ENOMEM; 430 error = ENOMEM;
505 ops = kmalloc(size, GFP_KERNEL); 431 ops = kmalloc(size, GFP_KERNEL);
506 if (!ops) 432 if (!ops)
507 goto out_vn_rele; 433 goto out_dput;
508 434
509 error = EFAULT; 435 error = EFAULT;
510 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) 436 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
@@ -527,20 +453,29 @@ xfs_compat_attrmulti_by_handle(
527 453
528 switch (ops[i].am_opcode) { 454 switch (ops[i].am_opcode) {
529 case ATTR_OP_GET: 455 case ATTR_OP_GET:
530 ops[i].am_error = xfs_attrmulti_attr_get(inode, 456 ops[i].am_error = xfs_attrmulti_attr_get(
531 attr_name, 457 dentry->d_inode, attr_name,
532 compat_ptr(ops[i].am_attrvalue), 458 compat_ptr(ops[i].am_attrvalue),
533 &ops[i].am_length, ops[i].am_flags); 459 &ops[i].am_length, ops[i].am_flags);
534 break; 460 break;
535 case ATTR_OP_SET: 461 case ATTR_OP_SET:
536 ops[i].am_error = xfs_attrmulti_attr_set(inode, 462 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
537 attr_name, 463 if (ops[i].am_error)
464 break;
465 ops[i].am_error = xfs_attrmulti_attr_set(
466 dentry->d_inode, attr_name,
538 compat_ptr(ops[i].am_attrvalue), 467 compat_ptr(ops[i].am_attrvalue),
539 ops[i].am_length, ops[i].am_flags); 468 ops[i].am_length, ops[i].am_flags);
469 mnt_drop_write(parfilp->f_path.mnt);
540 break; 470 break;
541 case ATTR_OP_REMOVE: 471 case ATTR_OP_REMOVE:
542 ops[i].am_error = xfs_attrmulti_attr_remove(inode, 472 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
543 attr_name, ops[i].am_flags); 473 if (ops[i].am_error)
474 break;
475 ops[i].am_error = xfs_attrmulti_attr_remove(
476 dentry->d_inode, attr_name,
477 ops[i].am_flags);
478 mnt_drop_write(parfilp->f_path.mnt);
544 break; 479 break;
545 default: 480 default:
546 ops[i].am_error = EINVAL; 481 ops[i].am_error = EINVAL;
@@ -553,22 +488,20 @@ xfs_compat_attrmulti_by_handle(
553 kfree(attr_name); 488 kfree(attr_name);
554 out_kfree_ops: 489 out_kfree_ops:
555 kfree(ops); 490 kfree(ops);
556 out_vn_rele: 491 out_dput:
557 iput(inode); 492 dput(dentry);
558 out:
559 return -error; 493 return -error;
560} 494}
561 495
562STATIC int 496STATIC int
563xfs_compat_fssetdm_by_handle( 497xfs_compat_fssetdm_by_handle(
564 xfs_mount_t *mp, 498 struct file *parfilp,
565 void __user *arg, 499 void __user *arg)
566 struct inode *parinode)
567{ 500{
568 int error; 501 int error;
569 struct fsdmidata fsd; 502 struct fsdmidata fsd;
570 compat_xfs_fsop_setdm_handlereq_t dmhreq; 503 compat_xfs_fsop_setdm_handlereq_t dmhreq;
571 struct inode *inode; 504 struct dentry *dentry;
572 505
573 if (!capable(CAP_MKNOD)) 506 if (!capable(CAP_MKNOD))
574 return -XFS_ERROR(EPERM); 507 return -XFS_ERROR(EPERM);
@@ -576,12 +509,11 @@ xfs_compat_fssetdm_by_handle(
576 sizeof(compat_xfs_fsop_setdm_handlereq_t))) 509 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
577 return -XFS_ERROR(EFAULT); 510 return -XFS_ERROR(EFAULT);
578 511
579 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq, 512 dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
580 &inode); 513 if (IS_ERR(dentry))
581 if (error) 514 return PTR_ERR(dentry);
582 return -error;
583 515
584 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { 516 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
585 error = -XFS_ERROR(EPERM); 517 error = -XFS_ERROR(EPERM);
586 goto out; 518 goto out;
587 } 519 }
@@ -591,11 +523,11 @@ xfs_compat_fssetdm_by_handle(
591 goto out; 523 goto out;
592 } 524 }
593 525
594 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, 526 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
595 fsd.fsd_dmstate); 527 fsd.fsd_dmstate);
596 528
597out: 529out:
598 iput(inode); 530 dput(dentry);
599 return error; 531 return error;
600} 532}
601 533
@@ -722,21 +654,21 @@ xfs_file_compat_ioctl(
722 654
723 if (xfs_compat_handlereq_copyin(&hreq, arg)) 655 if (xfs_compat_handlereq_copyin(&hreq, arg))
724 return -XFS_ERROR(EFAULT); 656 return -XFS_ERROR(EFAULT);
725 return xfs_open_by_handle(mp, &hreq, filp, inode); 657 return xfs_open_by_handle(filp, &hreq);
726 } 658 }
727 case XFS_IOC_READLINK_BY_HANDLE_32: { 659 case XFS_IOC_READLINK_BY_HANDLE_32: {
728 struct xfs_fsop_handlereq hreq; 660 struct xfs_fsop_handlereq hreq;
729 661
730 if (xfs_compat_handlereq_copyin(&hreq, arg)) 662 if (xfs_compat_handlereq_copyin(&hreq, arg))
731 return -XFS_ERROR(EFAULT); 663 return -XFS_ERROR(EFAULT);
732 return xfs_readlink_by_handle(mp, &hreq, inode); 664 return xfs_readlink_by_handle(filp, &hreq);
733 } 665 }
734 case XFS_IOC_ATTRLIST_BY_HANDLE_32: 666 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
735 return xfs_compat_attrlist_by_handle(mp, arg, inode); 667 return xfs_compat_attrlist_by_handle(filp, arg);
736 case XFS_IOC_ATTRMULTI_BY_HANDLE_32: 668 case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
737 return xfs_compat_attrmulti_by_handle(mp, arg, inode); 669 return xfs_compat_attrmulti_by_handle(filp, arg);
738 case XFS_IOC_FSSETDM_BY_HANDLE_32: 670 case XFS_IOC_FSSETDM_BY_HANDLE_32:
739 return xfs_compat_fssetdm_by_handle(mp, arg, inode); 671 return xfs_compat_fssetdm_by_handle(filp, arg);
740 default: 672 default:
741 return -XFS_ERROR(ENOIOCTLCMD); 673 return -XFS_ERROR(ENOIOCTLCMD);
742 } 674 }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 95a971080368..32ae5028e96b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -734,15 +734,15 @@ xfs_close_devices(
734{ 734{
735 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { 735 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
736 struct block_device *logdev = mp->m_logdev_targp->bt_bdev; 736 struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
737 xfs_free_buftarg(mp->m_logdev_targp); 737 xfs_free_buftarg(mp, mp->m_logdev_targp);
738 xfs_blkdev_put(logdev); 738 xfs_blkdev_put(logdev);
739 } 739 }
740 if (mp->m_rtdev_targp) { 740 if (mp->m_rtdev_targp) {
741 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; 741 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
742 xfs_free_buftarg(mp->m_rtdev_targp); 742 xfs_free_buftarg(mp, mp->m_rtdev_targp);
743 xfs_blkdev_put(rtdev); 743 xfs_blkdev_put(rtdev);
744 } 744 }
745 xfs_free_buftarg(mp->m_ddev_targp); 745 xfs_free_buftarg(mp, mp->m_ddev_targp);
746} 746}
747 747
748/* 748/*
@@ -811,9 +811,9 @@ xfs_open_devices(
811 811
812 out_free_rtdev_targ: 812 out_free_rtdev_targ:
813 if (mp->m_rtdev_targp) 813 if (mp->m_rtdev_targp)
814 xfs_free_buftarg(mp->m_rtdev_targp); 814 xfs_free_buftarg(mp, mp->m_rtdev_targp);
815 out_free_ddev_targ: 815 out_free_ddev_targ:
816 xfs_free_buftarg(mp->m_ddev_targp); 816 xfs_free_buftarg(mp, mp->m_ddev_targp);
817 out_close_rtdev: 817 out_close_rtdev:
818 if (rtdev) 818 if (rtdev)
819 xfs_blkdev_put(rtdev); 819 xfs_blkdev_put(rtdev);
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
1197 struct xfs_mount *mp = XFS_M(sb); 1197 struct xfs_mount *mp = XFS_M(sb);
1198 substring_t args[MAX_OPT_ARGS]; 1198 substring_t args[MAX_OPT_ARGS];
1199 char *p; 1199 char *p;
1200 int error;
1200 1201
1201 while ((p = strsep(&options, ",")) != NULL) { 1202 while ((p = strsep(&options, ",")) != NULL) {
1202 int token; 1203 int token;
@@ -1247,11 +1248,25 @@ xfs_fs_remount(
1247 } 1248 }
1248 } 1249 }
1249 1250
1250 /* rw/ro -> rw */ 1251 /* ro -> rw */
1251 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1252 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1252 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1253 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1253 if (mp->m_flags & XFS_MOUNT_BARRIER) 1254 if (mp->m_flags & XFS_MOUNT_BARRIER)
1254 xfs_mountfs_check_barriers(mp); 1255 xfs_mountfs_check_barriers(mp);
1256
1257 /*
1258 * If this is the first remount to writeable state we
1259 * might have some superblock changes to update.
1260 */
1261 if (mp->m_update_flags) {
1262 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1263 if (error) {
1264 cmn_err(CE_WARN,
1265 "XFS: failed to write sb changes");
1266 return error;
1267 }
1268 mp->m_update_flags = 0;
1269 }
1255 } 1270 }
1256 1271
1257 /* rw -> ro */ 1272 /* rw -> ro */
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c26..a608e72fa405 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
371 /* flush inodes and push all remaining buffers out to disk */ 371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp); 372 xfs_quiesce_fs(mp);
373 373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); 374 /*
375 * Just warn here till VFS can correctly support
376 * read-only remount without racing.
377 */
378 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
375 379
376 /* Push the superblock and write an unmount record */ 380 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1); 381 error = xfs_log_sbcount(mp, 1);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 591ca6602bfb..6543c0b29753 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -73,6 +73,8 @@ int xfs_dqreq_num;
73int xfs_dqerror_mod = 33; 73int xfs_dqerror_mod = 33;
74#endif 74#endif
75 75
76static struct lock_class_key xfs_dquot_other_class;
77
76/* 78/*
77 * Allocate and initialize a dquot. We don't always allocate fresh memory; 79 * Allocate and initialize a dquot. We don't always allocate fresh memory;
78 * we try to reclaim a free dquot if the number of incore dquots are above 80 * we try to reclaim a free dquot if the number of incore dquots are above
@@ -139,7 +141,15 @@ xfs_qm_dqinit(
139 ASSERT(dqp->q_trace); 141 ASSERT(dqp->q_trace);
140 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); 142 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
141#endif 143#endif
142 } 144 }
145
146 /*
147 * In either case we need to make sure group quotas have a different
148 * lock class than user quotas, to make sure lockdep knows we can
149 * locks of one of each at the same time.
150 */
151 if (!(type & XFS_DQ_USER))
152 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
143 153
144 /* 154 /*
145 * log item gets initialized later 155 * log item gets initialized later
@@ -421,7 +431,7 @@ xfs_qm_dqalloc(
421 /* 431 /*
422 * Initialize the bmap freelist prior to calling bmapi code. 432 * Initialize the bmap freelist prior to calling bmapi code.
423 */ 433 */
424 XFS_BMAP_INIT(&flist, &firstblock); 434 xfs_bmap_init(&flist, &firstblock);
425 xfs_ilock(quotip, XFS_ILOCK_EXCL); 435 xfs_ilock(quotip, XFS_ILOCK_EXCL);
426 /* 436 /*
427 * Return if this type of quotas is turned off while we didn't 437 * Return if this type of quotas is turned off while we didn't
@@ -1383,6 +1393,12 @@ xfs_dqunlock_nonotify(
1383 mutex_unlock(&(dqp->q_qlock)); 1393 mutex_unlock(&(dqp->q_qlock));
1384} 1394}
1385 1395
1396/*
1397 * Lock two xfs_dquot structures.
1398 *
1399 * To avoid deadlocks we always lock the quota structure with
1400 * the lowerd id first.
1401 */
1386void 1402void
1387xfs_dqlock2( 1403xfs_dqlock2(
1388 xfs_dquot_t *d1, 1404 xfs_dquot_t *d1,
@@ -1392,18 +1408,16 @@ xfs_dqlock2(
1392 ASSERT(d1 != d2); 1408 ASSERT(d1 != d2);
1393 if (be32_to_cpu(d1->q_core.d_id) > 1409 if (be32_to_cpu(d1->q_core.d_id) >
1394 be32_to_cpu(d2->q_core.d_id)) { 1410 be32_to_cpu(d2->q_core.d_id)) {
1395 xfs_dqlock(d2); 1411 mutex_lock(&d2->q_qlock);
1396 xfs_dqlock(d1); 1412 mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
1397 } else { 1413 } else {
1398 xfs_dqlock(d1); 1414 mutex_lock(&d1->q_qlock);
1399 xfs_dqlock(d2); 1415 mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
1400 }
1401 } else {
1402 if (d1) {
1403 xfs_dqlock(d1);
1404 } else if (d2) {
1405 xfs_dqlock(d2);
1406 } 1416 }
1417 } else if (d1) {
1418 mutex_lock(&d1->q_qlock);
1419 } else if (d2) {
1420 mutex_lock(&d2->q_qlock);
1407 } 1421 }
1408} 1422}
1409 1423
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 7e455337e2ba..d443e93b4331 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -97,6 +97,16 @@ typedef struct xfs_dquot {
97#define dq_hashlist q_lists.dqm_hashlist 97#define dq_hashlist q_lists.dqm_hashlist
98#define dq_flags q_lists.dqm_flags 98#define dq_flags q_lists.dqm_flags
99 99
100/*
101 * Lock hierachy for q_qlock:
102 * XFS_QLOCK_NORMAL is the implicit default,
103 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
104 */
105enum {
106 XFS_QLOCK_NORMAL = 0,
107 XFS_QLOCK_NESTED,
108};
109
100#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) 110#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
101 111
102#ifdef DEBUG 112#ifdef DEBUG
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6b13960cf318..7a2beb64314f 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1070,6 +1070,13 @@ xfs_qm_sync(
1070 return 0; 1070 return 0;
1071} 1071}
1072 1072
1073/*
1074 * The hash chains and the mplist use the same xfs_dqhash structure as
1075 * their list head, but we can take the mplist qh_lock and one of the
1076 * hash qh_locks at the same time without any problem as they aren't
1077 * related.
1078 */
1079static struct lock_class_key xfs_quota_mplist_class;
1073 1080
1074/* 1081/*
1075 * This initializes all the quota information that's kept in the 1082 * This initializes all the quota information that's kept in the
@@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo(
1105 } 1112 }
1106 1113
1107 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1114 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1115 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
1116
1108 qinf->qi_dqreclaims = 0; 1117 qinf->qi_dqreclaims = 0;
1109 1118
1110 /* mutex used to serialize quotaoffs */ 1119 /* mutex used to serialize quotaoffs */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index d3b3cf742999..143d63ecb20a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -244,8 +244,8 @@ typedef struct xfs_perag
244#define XFS_AG_CHECK_DADDR(mp,d,len) \ 244#define XFS_AG_CHECK_DADDR(mp,d,len) \
245 ((len) == 1 ? \ 245 ((len) == 1 ? \
246 ASSERT((d) == XFS_SB_DADDR || \ 246 ASSERT((d) == XFS_SB_DADDR || \
247 XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ 247 xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
248 ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ 248 ASSERT(xfs_daddr_to_agno(mp, d) == \
249 XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) 249 xfs_daddr_to_agno(mp, (d) + (len) - 1)))
250 250
251#endif /* __XFS_AG_H__ */ 251#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 733cb75a8c5d..c10c3a292d30 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -115,7 +115,7 @@ xfs_allocbt_free_block(
115 xfs_agblock_t bno; 115 xfs_agblock_t bno;
116 int error; 116 int error;
117 117
118 bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp)); 118 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
120 if (error) 120 if (error)
121 return error; 121 return error;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index f7cdc28aff41..5fde1654b430 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
374 * It won't fit in the shortform, transform to a leaf block. 374 * It won't fit in the shortform, transform to a leaf block.
375 * GROT: another possible req'mt for a double-split btree op. 375 * GROT: another possible req'mt for a double-split btree op.
376 */ 376 */
377 XFS_BMAP_INIT(args.flist, args.firstblock); 377 xfs_bmap_init(args.flist, args.firstblock);
378 error = xfs_attr_shortform_to_leaf(&args); 378 error = xfs_attr_shortform_to_leaf(&args);
379 if (!error) { 379 if (!error) {
380 error = xfs_bmap_finish(&args.trans, args.flist, 380 error = xfs_bmap_finish(&args.trans, args.flist,
@@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
956 * Commit that transaction so that the node_addname() call 956 * Commit that transaction so that the node_addname() call
957 * can manage its own transactions. 957 * can manage its own transactions.
958 */ 958 */
959 XFS_BMAP_INIT(args->flist, args->firstblock); 959 xfs_bmap_init(args->flist, args->firstblock);
960 error = xfs_attr_leaf_to_node(args); 960 error = xfs_attr_leaf_to_node(args);
961 if (!error) { 961 if (!error) {
962 error = xfs_bmap_finish(&args->trans, args->flist, 962 error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1057 * If the result is small enough, shrink it all into the inode. 1057 * If the result is small enough, shrink it all into the inode.
1058 */ 1058 */
1059 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1059 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1060 XFS_BMAP_INIT(args->flist, args->firstblock); 1060 xfs_bmap_init(args->flist, args->firstblock);
1061 error = xfs_attr_leaf_to_shortform(bp, args, forkoff); 1061 error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
1062 /* bp is gone due to xfs_da_shrink_inode */ 1062 /* bp is gone due to xfs_da_shrink_inode */
1063 if (!error) { 1063 if (!error) {
@@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1135 * If the result is small enough, shrink it all into the inode. 1135 * If the result is small enough, shrink it all into the inode.
1136 */ 1136 */
1137 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1137 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1138 XFS_BMAP_INIT(args->flist, args->firstblock); 1138 xfs_bmap_init(args->flist, args->firstblock);
1139 error = xfs_attr_leaf_to_shortform(bp, args, forkoff); 1139 error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
1140 /* bp is gone due to xfs_da_shrink_inode */ 1140 /* bp is gone due to xfs_da_shrink_inode */
1141 if (!error) { 1141 if (!error) {
@@ -1290,7 +1290,7 @@ restart:
1290 * have been a b-tree. 1290 * have been a b-tree.
1291 */ 1291 */
1292 xfs_da_state_free(state); 1292 xfs_da_state_free(state);
1293 XFS_BMAP_INIT(args->flist, args->firstblock); 1293 xfs_bmap_init(args->flist, args->firstblock);
1294 error = xfs_attr_leaf_to_node(args); 1294 error = xfs_attr_leaf_to_node(args);
1295 if (!error) { 1295 if (!error) {
1296 error = xfs_bmap_finish(&args->trans, 1296 error = xfs_bmap_finish(&args->trans,
@@ -1331,7 +1331,7 @@ restart:
1331 * in the index/blkno/rmtblkno/rmtblkcnt fields and 1331 * in the index/blkno/rmtblkno/rmtblkcnt fields and
1332 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. 1332 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
1333 */ 1333 */
1334 XFS_BMAP_INIT(args->flist, args->firstblock); 1334 xfs_bmap_init(args->flist, args->firstblock);
1335 error = xfs_da_split(state); 1335 error = xfs_da_split(state);
1336 if (!error) { 1336 if (!error) {
1337 error = xfs_bmap_finish(&args->trans, args->flist, 1337 error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1443,7 +1443,7 @@ restart:
1443 * Check to see if the tree needs to be collapsed. 1443 * Check to see if the tree needs to be collapsed.
1444 */ 1444 */
1445 if (retval && (state->path.active > 1)) { 1445 if (retval && (state->path.active > 1)) {
1446 XFS_BMAP_INIT(args->flist, args->firstblock); 1446 xfs_bmap_init(args->flist, args->firstblock);
1447 error = xfs_da_join(state); 1447 error = xfs_da_join(state);
1448 if (!error) { 1448 if (!error) {
1449 error = xfs_bmap_finish(&args->trans, 1449 error = xfs_bmap_finish(&args->trans,
@@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1579 * Check to see if the tree needs to be collapsed. 1579 * Check to see if the tree needs to be collapsed.
1580 */ 1580 */
1581 if (retval && (state->path.active > 1)) { 1581 if (retval && (state->path.active > 1)) {
1582 XFS_BMAP_INIT(args->flist, args->firstblock); 1582 xfs_bmap_init(args->flist, args->firstblock);
1583 error = xfs_da_join(state); 1583 error = xfs_da_join(state);
1584 if (!error) { 1584 if (!error) {
1585 error = xfs_bmap_finish(&args->trans, args->flist, 1585 error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1630 == XFS_ATTR_LEAF_MAGIC); 1630 == XFS_ATTR_LEAF_MAGIC);
1631 1631
1632 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1632 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1633 XFS_BMAP_INIT(args->flist, args->firstblock); 1633 xfs_bmap_init(args->flist, args->firstblock);
1634 error = xfs_attr_leaf_to_shortform(bp, args, forkoff); 1634 error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
1635 /* bp is gone due to xfs_da_shrink_inode */ 1635 /* bp is gone due to xfs_da_shrink_inode */
1636 if (!error) { 1636 if (!error) {
@@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2069 /* 2069 /*
2070 * Allocate a single extent, up to the size of the value. 2070 * Allocate a single extent, up to the size of the value.
2071 */ 2071 */
2072 XFS_BMAP_INIT(args->flist, args->firstblock); 2072 xfs_bmap_init(args->flist, args->firstblock);
2073 nmap = 1; 2073 nmap = 1;
2074 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, 2074 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
2075 blkcnt, 2075 blkcnt,
@@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2123 /* 2123 /*
2124 * Try to remember where we decided to put the value. 2124 * Try to remember where we decided to put the value.
2125 */ 2125 */
2126 XFS_BMAP_INIT(args->flist, args->firstblock); 2126 xfs_bmap_init(args->flist, args->firstblock);
2127 nmap = 1; 2127 nmap = 1;
2128 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, 2128 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
2129 args->rmtblkcnt, 2129 args->rmtblkcnt,
@@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2188 /* 2188 /*
2189 * Try to remember where we decided to put the value. 2189 * Try to remember where we decided to put the value.
2190 */ 2190 */
2191 XFS_BMAP_INIT(args->flist, args->firstblock); 2191 xfs_bmap_init(args->flist, args->firstblock);
2192 nmap = 1; 2192 nmap = 1;
2193 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, 2193 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
2194 args->rmtblkcnt, 2194 args->rmtblkcnt,
@@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2229 blkcnt = args->rmtblkcnt; 2229 blkcnt = args->rmtblkcnt;
2230 done = 0; 2230 done = 0;
2231 while (!done) { 2231 while (!done) {
2232 XFS_BMAP_INIT(args->flist, args->firstblock); 2232 xfs_bmap_init(args->flist, args->firstblock);
2233 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, 2233 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
2234 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2234 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2235 1, args->firstblock, args->flist, 2235 1, args->firstblock, args->flist,
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 138308e70d14..c852cd65aaea 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -595,9 +595,9 @@ xfs_bmap_add_extent(
595 xfs_iext_insert(ifp, 0, 1, new); 595 xfs_iext_insert(ifp, 0, 1, new);
596 ASSERT(cur == NULL); 596 ASSERT(cur == NULL);
597 ifp->if_lastex = 0; 597 ifp->if_lastex = 0;
598 if (!ISNULLSTARTBLOCK(new->br_startblock)) { 598 if (!isnullstartblock(new->br_startblock)) {
599 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 599 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
600 logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 600 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
601 } else 601 } else
602 logflags = 0; 602 logflags = 0;
603 /* DELTA: single new extent */ 603 /* DELTA: single new extent */
@@ -613,7 +613,7 @@ xfs_bmap_add_extent(
613 /* 613 /*
614 * Any kind of new delayed allocation goes here. 614 * Any kind of new delayed allocation goes here.
615 */ 615 */
616 else if (ISNULLSTARTBLOCK(new->br_startblock)) { 616 else if (isnullstartblock(new->br_startblock)) {
617 if (cur) 617 if (cur)
618 ASSERT((cur->bc_private.b.flags & 618 ASSERT((cur->bc_private.b.flags &
619 XFS_BTCUR_BPRV_WASDEL) == 0); 619 XFS_BTCUR_BPRV_WASDEL) == 0);
@@ -644,11 +644,11 @@ xfs_bmap_add_extent(
644 * in a delayed or unwritten allocation with a real one, or 644 * in a delayed or unwritten allocation with a real one, or
645 * converting real back to unwritten. 645 * converting real back to unwritten.
646 */ 646 */
647 if (!ISNULLSTARTBLOCK(new->br_startblock) && 647 if (!isnullstartblock(new->br_startblock) &&
648 new->br_startoff + new->br_blockcount > prev.br_startoff) { 648 new->br_startoff + new->br_blockcount > prev.br_startoff) {
649 if (prev.br_state != XFS_EXT_UNWRITTEN && 649 if (prev.br_state != XFS_EXT_UNWRITTEN &&
650 ISNULLSTARTBLOCK(prev.br_startblock)) { 650 isnullstartblock(prev.br_startblock)) {
651 da_old = STARTBLOCKVAL(prev.br_startblock); 651 da_old = startblockval(prev.br_startblock);
652 if (cur) 652 if (cur)
653 ASSERT(cur->bc_private.b.flags & 653 ASSERT(cur->bc_private.b.flags &
654 XFS_BTCUR_BPRV_WASDEL); 654 XFS_BTCUR_BPRV_WASDEL);
@@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real(
803 */ 803 */
804 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 804 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
805 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 805 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
806 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); 806 STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
807 } 807 }
808 STATE_SET(LEFT_CONTIG, 808 STATE_SET(LEFT_CONTIG,
809 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && 809 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real(
820 idx < 820 idx <
821 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { 821 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
822 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 822 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
823 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); 823 STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
824 } 824 }
825 STATE_SET(RIGHT_CONTIG, 825 STATE_SET(RIGHT_CONTIG,
826 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && 826 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real(
1019 goto done; 1019 goto done;
1020 } 1020 }
1021 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1021 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1022 STARTBLOCKVAL(PREV.br_startblock)); 1022 startblockval(PREV.br_startblock));
1023 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1023 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1024 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); 1024 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
1025 *dnew = temp; 1025 *dnew = temp;
1026 /* DELTA: The boundary between two in-core extents moved. */ 1026 /* DELTA: The boundary between two in-core extents moved. */
@@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real(
1067 goto done; 1067 goto done;
1068 } 1068 }
1069 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1069 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1070 STARTBLOCKVAL(PREV.br_startblock) - 1070 startblockval(PREV.br_startblock) -
1071 (cur ? cur->bc_private.b.allocated : 0)); 1071 (cur ? cur->bc_private.b.allocated : 0));
1072 ep = xfs_iext_get_ext(ifp, idx + 1); 1072 ep = xfs_iext_get_ext(ifp, idx + 1);
1073 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1073 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1074 XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); 1074 XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
1075 *dnew = temp; 1075 *dnew = temp;
1076 /* DELTA: One in-core extent is split in two. */ 1076 /* DELTA: One in-core extent is split in two. */
@@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real(
1110 goto done; 1110 goto done;
1111 } 1111 }
1112 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1112 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1113 STARTBLOCKVAL(PREV.br_startblock)); 1113 startblockval(PREV.br_startblock));
1114 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1114 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1115 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); 1115 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
1116 *dnew = temp; 1116 *dnew = temp;
1117 /* DELTA: The boundary between two in-core extents moved. */ 1117 /* DELTA: The boundary between two in-core extents moved. */
@@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real(
1157 goto done; 1157 goto done;
1158 } 1158 }
1159 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1159 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1160 STARTBLOCKVAL(PREV.br_startblock) - 1160 startblockval(PREV.br_startblock) -
1161 (cur ? cur->bc_private.b.allocated : 0)); 1161 (cur ? cur->bc_private.b.allocated : 0));
1162 ep = xfs_iext_get_ext(ifp, idx); 1162 ep = xfs_iext_get_ext(ifp, idx);
1163 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1163 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1164 XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); 1164 XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
1165 *dnew = temp; 1165 *dnew = temp;
1166 /* DELTA: One in-core extent is split in two. */ 1166 /* DELTA: One in-core extent is split in two. */
@@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real(
1213 } 1213 }
1214 temp = xfs_bmap_worst_indlen(ip, temp); 1214 temp = xfs_bmap_worst_indlen(ip, temp);
1215 temp2 = xfs_bmap_worst_indlen(ip, temp2); 1215 temp2 = xfs_bmap_worst_indlen(ip, temp2);
1216 diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - 1216 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1217 (cur ? cur->bc_private.b.allocated : 0)); 1217 (cur ? cur->bc_private.b.allocated : 0));
1218 if (diff > 0 && 1218 if (diff > 0 &&
1219 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1219 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
@@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real(
1241 } 1241 }
1242 } 1242 }
1243 ep = xfs_iext_get_ext(ifp, idx); 1243 ep = xfs_iext_get_ext(ifp, idx);
1244 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1244 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1245 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); 1245 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
1246 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); 1246 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
1247 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1247 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
1248 NULLSTARTBLOCK((int)temp2)); 1248 nullstartblock((int)temp2));
1249 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); 1249 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
1250 *dnew = temp + temp2; 1250 *dnew = temp + temp2;
1251 /* DELTA: One in-core extent is split in three. */ 1251 /* DELTA: One in-core extent is split in three. */
@@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real(
1365 */ 1365 */
1366 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 1366 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1367 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 1367 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
1368 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); 1368 STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
1369 } 1369 }
1370 STATE_SET(LEFT_CONTIG, 1370 STATE_SET(LEFT_CONTIG,
1371 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && 1371 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real(
1382 idx < 1382 idx <
1383 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { 1383 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
1384 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 1384 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
1385 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); 1385 STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
1386 } 1386 }
1387 STATE_SET(RIGHT_CONTIG, 1387 STATE_SET(RIGHT_CONTIG,
1388 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && 1388 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay(
1889 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1889 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1890 ep = xfs_iext_get_ext(ifp, idx); 1890 ep = xfs_iext_get_ext(ifp, idx);
1891 state = 0; 1891 state = 0;
1892 ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); 1892 ASSERT(isnullstartblock(new->br_startblock));
1893 /* 1893 /*
1894 * Check and set flags if this segment has a left neighbor 1894 * Check and set flags if this segment has a left neighbor
1895 */ 1895 */
1896 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 1896 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1897 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1897 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
1898 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); 1898 STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
1899 } 1899 }
1900 /* 1900 /*
1901 * Check and set flags if the current (right) segment exists. 1901 * Check and set flags if the current (right) segment exists.
@@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay(
1905 idx < 1905 idx <
1906 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { 1906 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1907 xfs_bmbt_get_all(ep, &right); 1907 xfs_bmbt_get_all(ep, &right);
1908 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); 1908 STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
1909 } 1909 }
1910 /* 1910 /*
1911 * Set contiguity flags on the left and right neighbors. 1911 * Set contiguity flags on the left and right neighbors.
@@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay(
1938 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, 1938 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
1939 XFS_DATA_FORK); 1939 XFS_DATA_FORK);
1940 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1940 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
1941 oldlen = STARTBLOCKVAL(left.br_startblock) + 1941 oldlen = startblockval(left.br_startblock) +
1942 STARTBLOCKVAL(new->br_startblock) + 1942 startblockval(new->br_startblock) +
1943 STARTBLOCKVAL(right.br_startblock); 1943 startblockval(right.br_startblock);
1944 newlen = xfs_bmap_worst_indlen(ip, temp); 1944 newlen = xfs_bmap_worst_indlen(ip, temp);
1945 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1945 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
1946 NULLSTARTBLOCK((int)newlen)); 1946 nullstartblock((int)newlen));
1947 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, 1947 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
1948 XFS_DATA_FORK); 1948 XFS_DATA_FORK);
1949 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); 1949 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
@@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay(
1964 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, 1964 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
1965 XFS_DATA_FORK); 1965 XFS_DATA_FORK);
1966 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1966 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
1967 oldlen = STARTBLOCKVAL(left.br_startblock) + 1967 oldlen = startblockval(left.br_startblock) +
1968 STARTBLOCKVAL(new->br_startblock); 1968 startblockval(new->br_startblock);
1969 newlen = xfs_bmap_worst_indlen(ip, temp); 1969 newlen = xfs_bmap_worst_indlen(ip, temp);
1970 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1970 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
1971 NULLSTARTBLOCK((int)newlen)); 1971 nullstartblock((int)newlen));
1972 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, 1972 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
1973 XFS_DATA_FORK); 1973 XFS_DATA_FORK);
1974 ip->i_df.if_lastex = idx - 1; 1974 ip->i_df.if_lastex = idx - 1;
@@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay(
1985 */ 1985 */
1986 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); 1986 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
1987 temp = new->br_blockcount + right.br_blockcount; 1987 temp = new->br_blockcount + right.br_blockcount;
1988 oldlen = STARTBLOCKVAL(new->br_startblock) + 1988 oldlen = startblockval(new->br_startblock) +
1989 STARTBLOCKVAL(right.br_startblock); 1989 startblockval(right.br_startblock);
1990 newlen = xfs_bmap_worst_indlen(ip, temp); 1990 newlen = xfs_bmap_worst_indlen(ip, temp);
1991 xfs_bmbt_set_allf(ep, new->br_startoff, 1991 xfs_bmbt_set_allf(ep, new->br_startoff,
1992 NULLSTARTBLOCK((int)newlen), temp, right.br_state); 1992 nullstartblock((int)newlen), temp, right.br_state);
1993 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); 1993 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
1994 ip->i_df.if_lastex = idx; 1994 ip->i_df.if_lastex = idx;
1995 /* DELTA: One in-core extent grew into a hole. */ 1995 /* DELTA: One in-core extent grew into a hole. */
@@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real(
2085 */ 2085 */
2086 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 2086 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
2087 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 2087 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
2088 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); 2088 STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
2089 } 2089 }
2090 /* 2090 /*
2091 * Check and set flags if this segment has a current value. 2091 * Check and set flags if this segment has a current value.
@@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real(
2095 idx < 2095 idx <
2096 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { 2096 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
2097 xfs_bmbt_get_all(ep, &right); 2097 xfs_bmbt_get_all(ep, &right);
2098 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); 2098 STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
2099 } 2099 }
2100 /* 2100 /*
2101 * We're inserting a real allocation between "left" and "right". 2101 * We're inserting a real allocation between "left" and "right".
@@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real(
2143 XFS_IFORK_NEXT_SET(ip, whichfork, 2143 XFS_IFORK_NEXT_SET(ip, whichfork,
2144 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 2144 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2145 if (cur == NULL) { 2145 if (cur == NULL) {
2146 rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 2146 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
2147 } else { 2147 } else {
2148 rval = XFS_ILOG_CORE; 2148 rval = XFS_ILOG_CORE;
2149 if ((error = xfs_bmbt_lookup_eq(cur, 2149 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real(
2185 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); 2185 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
2186 ifp->if_lastex = idx - 1; 2186 ifp->if_lastex = idx - 1;
2187 if (cur == NULL) { 2187 if (cur == NULL) {
2188 rval = XFS_ILOG_FEXT(whichfork); 2188 rval = xfs_ilog_fext(whichfork);
2189 } else { 2189 } else {
2190 rval = 0; 2190 rval = 0;
2191 if ((error = xfs_bmbt_lookup_eq(cur, 2191 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real(
2220 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); 2220 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
2221 ifp->if_lastex = idx; 2221 ifp->if_lastex = idx;
2222 if (cur == NULL) { 2222 if (cur == NULL) {
2223 rval = XFS_ILOG_FEXT(whichfork); 2223 rval = xfs_ilog_fext(whichfork);
2224 } else { 2224 } else {
2225 rval = 0; 2225 rval = 0;
2226 if ((error = xfs_bmbt_lookup_eq(cur, 2226 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real(
2254 XFS_IFORK_NEXT_SET(ip, whichfork, 2254 XFS_IFORK_NEXT_SET(ip, whichfork,
2255 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 2255 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
2256 if (cur == NULL) { 2256 if (cur == NULL) {
2257 rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 2257 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
2258 } else { 2258 } else {
2259 rval = XFS_ILOG_CORE; 2259 rval = XFS_ILOG_CORE;
2260 if ((error = xfs_bmbt_lookup_eq(cur, 2260 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2482,7 +2482,7 @@ xfs_bmap_adjacent(
2482 * try to use it's last block as our starting point. 2482 * try to use it's last block as our starting point.
2483 */ 2483 */
2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2485 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && 2485 !isnullstartblock(ap->prevp->br_startblock) &&
2486 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, 2486 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
2487 ap->prevp->br_startblock)) { 2487 ap->prevp->br_startblock)) {
2488 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; 2488 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
@@ -2511,7 +2511,7 @@ xfs_bmap_adjacent(
2511 * start block based on it. 2511 * start block based on it.
2512 */ 2512 */
2513 if (ap->prevp->br_startoff != NULLFILEOFF && 2513 if (ap->prevp->br_startoff != NULLFILEOFF &&
2514 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && 2514 !isnullstartblock(ap->prevp->br_startblock) &&
2515 (prevbno = ap->prevp->br_startblock + 2515 (prevbno = ap->prevp->br_startblock +
2516 ap->prevp->br_blockcount) && 2516 ap->prevp->br_blockcount) &&
2517 ISVALID(prevbno, ap->prevp->br_startblock)) { 2517 ISVALID(prevbno, ap->prevp->br_startblock)) {
@@ -2552,7 +2552,7 @@ xfs_bmap_adjacent(
2552 * If there's a following (right) block, select a requested 2552 * If there's a following (right) block, select a requested
2553 * start block based on it. 2553 * start block based on it.
2554 */ 2554 */
2555 if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { 2555 if (!isnullstartblock(ap->gotp->br_startblock)) {
2556 /* 2556 /*
2557 * Calculate gap to start of next block. 2557 * Calculate gap to start of next block.
2558 */ 2558 */
@@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents(
3082 ASSERT(ifp->if_broot == NULL); 3082 ASSERT(ifp->if_broot == NULL);
3083 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); 3083 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); 3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
3085 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 3085 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
3086 return 0; 3086 return 0;
3087} 3087}
3088 3088
@@ -3136,8 +3136,8 @@ xfs_bmap_del_extent(
3136 del_endoff = del->br_startoff + del->br_blockcount; 3136 del_endoff = del->br_startoff + del->br_blockcount;
3137 got_endoff = got.br_startoff + got.br_blockcount; 3137 got_endoff = got.br_startoff + got.br_blockcount;
3138 ASSERT(got_endoff >= del_endoff); 3138 ASSERT(got_endoff >= del_endoff);
3139 delay = ISNULLSTARTBLOCK(got.br_startblock); 3139 delay = isnullstartblock(got.br_startblock);
3140 ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); 3140 ASSERT(isnullstartblock(del->br_startblock) == delay);
3141 flags = 0; 3141 flags = 0;
3142 qfield = 0; 3142 qfield = 0;
3143 error = 0; 3143 error = 0;
@@ -3189,7 +3189,7 @@ xfs_bmap_del_extent(
3189 } 3189 }
3190 da_old = da_new = 0; 3190 da_old = da_new = 0;
3191 } else { 3191 } else {
3192 da_old = STARTBLOCKVAL(got.br_startblock); 3192 da_old = startblockval(got.br_startblock);
3193 da_new = 0; 3193 da_new = 0;
3194 nblks = 0; 3194 nblks = 0;
3195 do_fx = 0; 3195 do_fx = 0;
@@ -3213,7 +3213,7 @@ xfs_bmap_del_extent(
3213 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 3213 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
3214 flags |= XFS_ILOG_CORE; 3214 flags |= XFS_ILOG_CORE;
3215 if (!cur) { 3215 if (!cur) {
3216 flags |= XFS_ILOG_FEXT(whichfork); 3216 flags |= xfs_ilog_fext(whichfork);
3217 break; 3217 break;
3218 } 3218 }
3219 if ((error = xfs_btree_delete(cur, &i))) 3219 if ((error = xfs_btree_delete(cur, &i)))
@@ -3233,7 +3233,7 @@ xfs_bmap_del_extent(
3233 if (delay) { 3233 if (delay) {
3234 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3234 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3235 da_old); 3235 da_old);
3236 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3236 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3237 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, 3237 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
3238 whichfork); 3238 whichfork);
3239 da_new = temp; 3239 da_new = temp;
@@ -3242,7 +3242,7 @@ xfs_bmap_del_extent(
3242 xfs_bmbt_set_startblock(ep, del_endblock); 3242 xfs_bmbt_set_startblock(ep, del_endblock);
3243 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); 3243 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
3244 if (!cur) { 3244 if (!cur) {
3245 flags |= XFS_ILOG_FEXT(whichfork); 3245 flags |= xfs_ilog_fext(whichfork);
3246 break; 3246 break;
3247 } 3247 }
3248 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, 3248 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
@@ -3262,7 +3262,7 @@ xfs_bmap_del_extent(
3262 if (delay) { 3262 if (delay) {
3263 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3263 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3264 da_old); 3264 da_old);
3265 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3265 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3266 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, 3266 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
3267 whichfork); 3267 whichfork);
3268 da_new = temp; 3268 da_new = temp;
@@ -3270,7 +3270,7 @@ xfs_bmap_del_extent(
3270 } 3270 }
3271 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); 3271 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
3272 if (!cur) { 3272 if (!cur) {
3273 flags |= XFS_ILOG_FEXT(whichfork); 3273 flags |= xfs_ilog_fext(whichfork);
3274 break; 3274 break;
3275 } 3275 }
3276 if ((error = xfs_bmbt_update(cur, got.br_startoff, 3276 if ((error = xfs_bmbt_update(cur, got.br_startoff,
@@ -3345,22 +3345,22 @@ xfs_bmap_del_extent(
3345 } 3345 }
3346 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3346 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3347 } else 3347 } else
3348 flags |= XFS_ILOG_FEXT(whichfork); 3348 flags |= xfs_ilog_fext(whichfork);
3349 XFS_IFORK_NEXT_SET(ip, whichfork, 3349 XFS_IFORK_NEXT_SET(ip, whichfork,
3350 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 3350 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
3351 } else { 3351 } else {
3352 ASSERT(whichfork == XFS_DATA_FORK); 3352 ASSERT(whichfork == XFS_DATA_FORK);
3353 temp = xfs_bmap_worst_indlen(ip, temp); 3353 temp = xfs_bmap_worst_indlen(ip, temp);
3354 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3354 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3355 temp2 = xfs_bmap_worst_indlen(ip, temp2); 3355 temp2 = xfs_bmap_worst_indlen(ip, temp2);
3356 new.br_startblock = NULLSTARTBLOCK((int)temp2); 3356 new.br_startblock = nullstartblock((int)temp2);
3357 da_new = temp + temp2; 3357 da_new = temp + temp2;
3358 while (da_new > da_old) { 3358 while (da_new > da_old) {
3359 if (temp) { 3359 if (temp) {
3360 temp--; 3360 temp--;
3361 da_new--; 3361 da_new--;
3362 xfs_bmbt_set_startblock(ep, 3362 xfs_bmbt_set_startblock(ep,
3363 NULLSTARTBLOCK((int)temp)); 3363 nullstartblock((int)temp));
3364 } 3364 }
3365 if (da_new == da_old) 3365 if (da_new == da_old)
3366 break; 3366 break;
@@ -3368,7 +3368,7 @@ xfs_bmap_del_extent(
3368 temp2--; 3368 temp2--;
3369 da_new--; 3369 da_new--;
3370 new.br_startblock = 3370 new.br_startblock =
3371 NULLSTARTBLOCK((int)temp2); 3371 nullstartblock((int)temp2);
3372 } 3372 }
3373 } 3373 }
3374 } 3374 }
@@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree(
3545 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3545 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3546 for (cnt = i = 0; i < nextents; i++) { 3546 for (cnt = i = 0; i < nextents; i++) {
3547 ep = xfs_iext_get_ext(ifp, i); 3547 ep = xfs_iext_get_ext(ifp, i);
3548 if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { 3548 if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
3549 arp->l0 = cpu_to_be64(ep->l0); 3549 arp->l0 = cpu_to_be64(ep->l0);
3550 arp->l1 = cpu_to_be64(ep->l1); 3550 arp->l1 = cpu_to_be64(ep->l1);
3551 arp++; cnt++; 3551 arp++; cnt++;
@@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree(
3572 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); 3572 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
3573 ASSERT(*curp == NULL); 3573 ASSERT(*curp == NULL);
3574 *curp = cur; 3574 *curp = cur;
3575 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); 3575 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
3576 return 0; 3576 return 0;
3577} 3577}
3578 3578
@@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents(
3676 ip->i_d.di_nblocks = 1; 3676 ip->i_d.di_nblocks = 1;
3677 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, 3677 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
3678 XFS_TRANS_DQ_BCOUNT, 1L); 3678 XFS_TRANS_DQ_BCOUNT, 1L);
3679 flags |= XFS_ILOG_FEXT(whichfork); 3679 flags |= xfs_ilog_fext(whichfork);
3680 } else { 3680 } else {
3681 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); 3681 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
3682 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); 3682 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
@@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork(
4082 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 4082 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
4083 ip->i_afp->if_flags = XFS_IFEXTENTS; 4083 ip->i_afp->if_flags = XFS_IFEXTENTS;
4084 logflags = 0; 4084 logflags = 0;
4085 XFS_BMAP_INIT(&flist, &firstblock); 4085 xfs_bmap_init(&flist, &firstblock);
4086 switch (ip->i_d.di_format) { 4086 switch (ip->i_d.di_format) {
4087 case XFS_DINODE_FMT_LOCAL: 4087 case XFS_DINODE_FMT_LOCAL:
4088 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, 4088 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
@@ -4162,7 +4162,7 @@ xfs_bmap_add_free(
4162 ASSERT(bno != NULLFSBLOCK); 4162 ASSERT(bno != NULLFSBLOCK);
4163 ASSERT(len > 0); 4163 ASSERT(len > 0);
4164 ASSERT(len <= MAXEXTLEN); 4164 ASSERT(len <= MAXEXTLEN);
4165 ASSERT(!ISNULLSTARTBLOCK(bno)); 4165 ASSERT(!isnullstartblock(bno));
4166 agno = XFS_FSB_TO_AGNO(mp, bno); 4166 agno = XFS_FSB_TO_AGNO(mp, bno);
4167 agbno = XFS_FSB_TO_AGBNO(mp, bno); 4167 agbno = XFS_FSB_TO_AGBNO(mp, bno);
4168 ASSERT(agno < mp->m_sb.sb_agcount); 4168 ASSERT(agno < mp->m_sb.sb_agcount);
@@ -4909,7 +4909,7 @@ xfs_bmapi(
4909 got.br_startoff = end; 4909 got.br_startoff = end;
4910 inhole = eof || got.br_startoff > bno; 4910 inhole = eof || got.br_startoff > bno;
4911 wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && 4911 wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
4912 ISNULLSTARTBLOCK(got.br_startblock); 4912 isnullstartblock(got.br_startblock);
4913 /* 4913 /*
4914 * First, deal with the hole before the allocated space 4914 * First, deal with the hole before the allocated space
4915 * that we found, if any. 4915 * that we found, if any.
@@ -5028,7 +5028,7 @@ xfs_bmapi(
5028 } 5028 }
5029 5029
5030 ip->i_delayed_blks += alen; 5030 ip->i_delayed_blks += alen;
5031 abno = NULLSTARTBLOCK(indlen); 5031 abno = nullstartblock(indlen);
5032 } else { 5032 } else {
5033 /* 5033 /*
5034 * If first time, allocate and fill in 5034 * If first time, allocate and fill in
@@ -5144,8 +5144,8 @@ xfs_bmapi(
5144 aoff + alen); 5144 aoff + alen);
5145#ifdef DEBUG 5145#ifdef DEBUG
5146 if (flags & XFS_BMAPI_DELAY) { 5146 if (flags & XFS_BMAPI_DELAY) {
5147 ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); 5147 ASSERT(isnullstartblock(got.br_startblock));
5148 ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); 5148 ASSERT(startblockval(got.br_startblock) > 0);
5149 } 5149 }
5150 ASSERT(got.br_state == XFS_EXT_NORM || 5150 ASSERT(got.br_state == XFS_EXT_NORM ||
5151 got.br_state == XFS_EXT_UNWRITTEN); 5151 got.br_state == XFS_EXT_UNWRITTEN);
@@ -5179,7 +5179,7 @@ xfs_bmapi(
5179 ASSERT((bno >= obno) || (n == 0)); 5179 ASSERT((bno >= obno) || (n == 0));
5180 ASSERT(bno < end); 5180 ASSERT(bno < end);
5181 mval->br_startoff = bno; 5181 mval->br_startoff = bno;
5182 if (ISNULLSTARTBLOCK(got.br_startblock)) { 5182 if (isnullstartblock(got.br_startblock)) {
5183 ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); 5183 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
5184 mval->br_startblock = DELAYSTARTBLOCK; 5184 mval->br_startblock = DELAYSTARTBLOCK;
5185 } else 5185 } else
@@ -5201,7 +5201,7 @@ xfs_bmapi(
5201 ASSERT(mval->br_blockcount <= len); 5201 ASSERT(mval->br_blockcount <= len);
5202 } else { 5202 } else {
5203 *mval = got; 5203 *mval = got;
5204 if (ISNULLSTARTBLOCK(mval->br_startblock)) { 5204 if (isnullstartblock(mval->br_startblock)) {
5205 ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); 5205 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
5206 mval->br_startblock = DELAYSTARTBLOCK; 5206 mval->br_startblock = DELAYSTARTBLOCK;
5207 } 5207 }
@@ -5329,12 +5329,12 @@ error0:
5329 * Log everything. Do this after conversion, there's no point in 5329 * Log everything. Do this after conversion, there's no point in
5330 * logging the extent records if we've converted to btree format. 5330 * logging the extent records if we've converted to btree format.
5331 */ 5331 */
5332 if ((logflags & XFS_ILOG_FEXT(whichfork)) && 5332 if ((logflags & xfs_ilog_fext(whichfork)) &&
5333 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 5333 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5334 logflags &= ~XFS_ILOG_FEXT(whichfork); 5334 logflags &= ~xfs_ilog_fext(whichfork);
5335 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && 5335 else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
5336 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) 5336 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5337 logflags &= ~XFS_ILOG_FBROOT(whichfork); 5337 logflags &= ~xfs_ilog_fbroot(whichfork);
5338 /* 5338 /*
5339 * Log whatever the flags say, even if error. Otherwise we might miss 5339 * Log whatever the flags say, even if error. Otherwise we might miss
5340 * detecting a case where the data is changed, there's an error, 5340 * detecting a case where the data is changed, there's an error,
@@ -5411,7 +5411,7 @@ xfs_bmapi_single(
5411 *fsb = NULLFSBLOCK; 5411 *fsb = NULLFSBLOCK;
5412 return 0; 5412 return 0;
5413 } 5413 }
5414 ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); 5414 ASSERT(!isnullstartblock(got.br_startblock));
5415 ASSERT(bno < got.br_startoff + got.br_blockcount); 5415 ASSERT(bno < got.br_startoff + got.br_blockcount);
5416 *fsb = got.br_startblock + (bno - got.br_startoff); 5416 *fsb = got.br_startblock + (bno - got.br_startoff);
5417 ifp->if_lastex = lastx; 5417 ifp->if_lastex = lastx;
@@ -5543,7 +5543,7 @@ xfs_bunmapi(
5543 */ 5543 */
5544 ASSERT(ep != NULL); 5544 ASSERT(ep != NULL);
5545 del = got; 5545 del = got;
5546 wasdel = ISNULLSTARTBLOCK(del.br_startblock); 5546 wasdel = isnullstartblock(del.br_startblock);
5547 if (got.br_startoff < start) { 5547 if (got.br_startoff < start) {
5548 del.br_startoff = start; 5548 del.br_startoff = start;
5549 del.br_blockcount -= start - got.br_startoff; 5549 del.br_blockcount -= start - got.br_startoff;
@@ -5638,7 +5638,7 @@ xfs_bunmapi(
5638 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, 5638 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5639 lastx - 1), &prev); 5639 lastx - 1), &prev);
5640 ASSERT(prev.br_state == XFS_EXT_NORM); 5640 ASSERT(prev.br_state == XFS_EXT_NORM);
5641 ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); 5641 ASSERT(!isnullstartblock(prev.br_startblock));
5642 ASSERT(del.br_startblock == 5642 ASSERT(del.br_startblock ==
5643 prev.br_startblock + prev.br_blockcount); 5643 prev.br_startblock + prev.br_blockcount);
5644 if (prev.br_startoff < start) { 5644 if (prev.br_startoff < start) {
@@ -5666,7 +5666,7 @@ xfs_bunmapi(
5666 } 5666 }
5667 } 5667 }
5668 if (wasdel) { 5668 if (wasdel) {
5669 ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); 5669 ASSERT(startblockval(del.br_startblock) > 0);
5670 /* Update realtime/data freespace, unreserve quota */ 5670 /* Update realtime/data freespace, unreserve quota */
5671 if (isrt) { 5671 if (isrt) {
5672 xfs_filblks_t rtexts; 5672 xfs_filblks_t rtexts;
@@ -5782,12 +5782,12 @@ error0:
5782 * Log everything. Do this after conversion, there's no point in 5782 * Log everything. Do this after conversion, there's no point in
5783 * logging the extent records if we've converted to btree format. 5783 * logging the extent records if we've converted to btree format.
5784 */ 5784 */
5785 if ((logflags & XFS_ILOG_FEXT(whichfork)) && 5785 if ((logflags & xfs_ilog_fext(whichfork)) &&
5786 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 5786 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5787 logflags &= ~XFS_ILOG_FEXT(whichfork); 5787 logflags &= ~xfs_ilog_fext(whichfork);
5788 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && 5788 else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
5789 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) 5789 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5790 logflags &= ~XFS_ILOG_FBROOT(whichfork); 5790 logflags &= ~xfs_ilog_fbroot(whichfork);
5791 /* 5791 /*
5792 * Log inode even in the error case, if the transaction 5792 * Log inode even in the error case, if the transaction
5793 * is dirty we'll need to shut down the filesystem. 5793 * is dirty we'll need to shut down the filesystem.
@@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole(
5838 if (startblock == DELAYSTARTBLOCK) 5838 if (startblock == DELAYSTARTBLOCK)
5839 out->bmv_block = -2; 5839 out->bmv_block = -2;
5840 else 5840 else
5841 out->bmv_block = XFS_FSB_TO_DB(ip, startblock); 5841 out->bmv_block = xfs_fsb_to_db(ip, startblock);
5842 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); 5842 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
5843 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 5843 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
5844 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && 5844 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
@@ -5979,7 +5979,7 @@ xfs_getbmap(
5979 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) 5979 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; 5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5981 5981
5982 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | 5982 bmapi_flags = xfs_bmapi_aflag(whichfork) |
5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); 5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
5984 5984
5985 /* 5985 /*
@@ -6098,7 +6098,7 @@ xfs_bmap_isaeof(
6098 */ 6098 */
6099 *aeof = (off >= s.br_startoff && 6099 *aeof = (off >= s.br_startoff &&
6100 off < s.br_startoff + s.br_blockcount && 6100 off < s.br_startoff + s.br_blockcount &&
6101 ISNULLSTARTBLOCK(s.br_startblock)) || 6101 isnullstartblock(s.br_startblock)) ||
6102 off >= s.br_startoff + s.br_blockcount; 6102 off >= s.br_startoff + s.br_blockcount;
6103 return 0; 6103 return 0;
6104} 6104}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 284571c05ed0..be2979d88d32 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,7 +95,6 @@ typedef struct xfs_bmap_free
95 /* need write cache flushing and no */ 95 /* need write cache flushing and no */
96 /* additional allocation alignments */ 96 /* additional allocation alignments */
97 97
98#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w)
99static inline int xfs_bmapi_aflag(int w) 98static inline int xfs_bmapi_aflag(int w)
100{ 99{
101 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); 100 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w)
107#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) 106#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
108#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) 107#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
109 108
110#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp)
111static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) 109static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
112{ 110{
113 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ 111 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index ba6b08c2fb02..0760d352586f 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -121,7 +121,7 @@ __xfs_bmbt_get_all(
121 121
122 b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | 122 b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
123 (((xfs_dfsbno_t)l1) >> 21); 123 (((xfs_dfsbno_t)l1) >> 21);
124 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); 124 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
125 s->br_startblock = (xfs_fsblock_t)b; 125 s->br_startblock = (xfs_fsblock_t)b;
126 } 126 }
127#else /* !DEBUG */ 127#else /* !DEBUG */
@@ -172,7 +172,7 @@ xfs_bmbt_get_startblock(
172 172
173 b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | 173 b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
174 (((xfs_dfsbno_t)r->l1) >> 21); 174 (((xfs_dfsbno_t)r->l1) >> 21);
175 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); 175 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
176 return (xfs_fsblock_t)b; 176 return (xfs_fsblock_t)b;
177#else /* !DEBUG */ 177#else /* !DEBUG */
178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); 178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
@@ -261,7 +261,7 @@ xfs_bmbt_set_allf(
261 ((xfs_bmbt_rec_base_t)blockcount & 261 ((xfs_bmbt_rec_base_t)blockcount &
262 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); 262 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
263#else /* !XFS_BIG_BLKNOS */ 263#else /* !XFS_BIG_BLKNOS */
264 if (ISNULLSTARTBLOCK(startblock)) { 264 if (isnullstartblock(startblock)) {
265 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | 265 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
266 ((xfs_bmbt_rec_base_t)startoff << 9) | 266 ((xfs_bmbt_rec_base_t)startoff << 9) |
267 (xfs_bmbt_rec_base_t)xfs_mask64lo(9); 267 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
@@ -321,7 +321,7 @@ xfs_bmbt_disk_set_allf(
321 ((xfs_bmbt_rec_base_t)blockcount & 321 ((xfs_bmbt_rec_base_t)blockcount &
322 (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); 322 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
323#else /* !XFS_BIG_BLKNOS */ 323#else /* !XFS_BIG_BLKNOS */
324 if (ISNULLSTARTBLOCK(startblock)) { 324 if (isnullstartblock(startblock)) {
325 r->l0 = cpu_to_be64( 325 r->l0 = cpu_to_be64(
326 ((xfs_bmbt_rec_base_t)extent_flag << 63) | 326 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
327 ((xfs_bmbt_rec_base_t)startoff << 9) | 327 ((xfs_bmbt_rec_base_t)startoff << 9) |
@@ -382,7 +382,7 @@ xfs_bmbt_set_startblock(
382 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | 382 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
383 (xfs_bmbt_rec_base_t)(v << 21); 383 (xfs_bmbt_rec_base_t)(v << 21);
384#else /* !XFS_BIG_BLKNOS */ 384#else /* !XFS_BIG_BLKNOS */
385 if (ISNULLSTARTBLOCK(v)) { 385 if (isnullstartblock(v)) {
386 r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); 386 r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
387 r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | 387 r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
388 ((xfs_bmbt_rec_base_t)v << 21) | 388 ((xfs_bmbt_rec_base_t)v << 21) |
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index a4555abb6622..0e8df007615e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host {
76#define DSTARTBLOCKMASK \ 76#define DSTARTBLOCKMASK \
77 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) 77 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
78 78
79#define ISNULLSTARTBLOCK(x) isnullstartblock(x)
80static inline int isnullstartblock(xfs_fsblock_t x) 79static inline int isnullstartblock(xfs_fsblock_t x)
81{ 80{
82 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; 81 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
83} 82}
84 83
85#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x)
86static inline int isnulldstartblock(xfs_dfsbno_t x) 84static inline int isnulldstartblock(xfs_dfsbno_t x)
87{ 85{
88 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; 86 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
89} 87}
90 88
91#define NULLSTARTBLOCK(k) nullstartblock(k)
92static inline xfs_fsblock_t nullstartblock(int k) 89static inline xfs_fsblock_t nullstartblock(int k)
93{ 90{
94 ASSERT(k < (1 << STARTBLOCKVALBITS)); 91 ASSERT(k < (1 << STARTBLOCKVALBITS));
95 return STARTBLOCKMASK | (k); 92 return STARTBLOCKMASK | (k);
96} 93}
97 94
98#define STARTBLOCKVAL(x) startblockval(x)
99static inline xfs_filblks_t startblockval(xfs_fsblock_t x) 95static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
100{ 96{
101 return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); 97 return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 2c3ef20f8842..e73c332eb23f 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -843,7 +843,7 @@ xfs_btree_ptr_is_null(
843 union xfs_btree_ptr *ptr) 843 union xfs_btree_ptr *ptr)
844{ 844{
845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
846 return be64_to_cpu(ptr->l) == NULLFSBLOCK; 846 return be64_to_cpu(ptr->l) == NULLDFSBNO;
847 else 847 else
848 return be32_to_cpu(ptr->s) == NULLAGBLOCK; 848 return be32_to_cpu(ptr->s) == NULLAGBLOCK;
849} 849}
@@ -854,7 +854,7 @@ xfs_btree_set_ptr_null(
854 union xfs_btree_ptr *ptr) 854 union xfs_btree_ptr *ptr)
855{ 855{
856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
857 ptr->l = cpu_to_be64(NULLFSBLOCK); 857 ptr->l = cpu_to_be64(NULLDFSBNO);
858 else 858 else
859 ptr->s = cpu_to_be32(NULLAGBLOCK); 859 ptr->s = cpu_to_be32(NULLAGBLOCK);
860} 860}
@@ -918,8 +918,8 @@ xfs_btree_init_block(
918 new->bb_numrecs = cpu_to_be16(numrecs); 918 new->bb_numrecs = cpu_to_be16(numrecs);
919 919
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
923 } else { 923 } else {
924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
@@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr(
960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
961 XFS_BUF_ADDR(bp))); 961 XFS_BUF_ADDR(bp)));
962 else { 962 else {
963 ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp, 963 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
964 XFS_BUF_ADDR(bp))); 964 XFS_BUF_ADDR(bp)));
965 } 965 }
966} 966}
@@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr(
971 union xfs_btree_ptr *ptr) 971 union xfs_btree_ptr *ptr)
972{ 972{
973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
974 ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK); 974 ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO);
975 975
976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
977 } else { 977 } else {
@@ -2454,7 +2454,7 @@ xfs_btree_new_iroot(
2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2455 2455
2456 *logflags |= 2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); 2457 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2458 *stat = 1; 2458 *stat = 1;
2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2460 return 0; 2460 return 0;
@@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot(
3048 cur->bc_bufs[level - 1] = NULL; 3048 cur->bc_bufs[level - 1] = NULL;
3049 be16_add_cpu(&block->bb_level, -1); 3049 be16_add_cpu(&block->bb_level, -1);
3050 xfs_trans_log_inode(cur->bc_tp, ip, 3050 xfs_trans_log_inode(cur->bc_tp, ip,
3051 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); 3051 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3052 cur->bc_nlevels--; 3052 cur->bc_nlevels--;
3053out0: 3053out0:
3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index a11a8390bf6c..c45f74ff1a5b 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1597 nmap = 1; 1597 nmap = 1;
1598 ASSERT(args->firstblock != NULL); 1598 ASSERT(args->firstblock != NULL);
1599 if ((error = xfs_bmapi(tp, dp, bno, count, 1599 if ((error = xfs_bmapi(tp, dp, bno, count,
1600 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1600 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1601 XFS_BMAPI_CONTIG, 1601 XFS_BMAPI_CONTIG,
1602 args->firstblock, args->total, &map, &nmap, 1602 args->firstblock, args->total, &map, &nmap,
1603 args->flist, NULL))) { 1603 args->flist, NULL))) {
@@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1618 nmap = MIN(XFS_BMAP_MAX_NMAP, count); 1618 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1619 c = (int)(bno + count - b); 1619 c = (int)(bno + count - b);
1620 if ((error = xfs_bmapi(tp, dp, b, c, 1620 if ((error = xfs_bmapi(tp, dp, b, c,
1621 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE| 1621 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
1622 XFS_BMAPI_METADATA, 1622 XFS_BMAPI_METADATA,
1623 args->firstblock, args->total, 1623 args->firstblock, args->total,
1624 &mapp[mapi], &nmap, args->flist, 1624 &mapp[mapi], &nmap, args->flist,
@@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1882 * the last block to the place we want to kill. 1882 * the last block to the place we want to kill.
1883 */ 1883 */
1884 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, 1884 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
1885 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA, 1885 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1886 0, args->firstblock, args->flist, NULL, 1886 0, args->firstblock, args->flist, NULL,
1887 &done)) == ENOSPC) { 1887 &done)) == ENOSPC) {
1888 if (w != XFS_DATA_FORK) 1888 if (w != XFS_DATA_FORK)
@@ -1987,7 +1987,7 @@ xfs_da_do_buf(
1987 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, 1987 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
1988 nfsb, 1988 nfsb,
1989 XFS_BMAPI_METADATA | 1989 XFS_BMAPI_METADATA |
1990 XFS_BMAPI_AFLAG(whichfork), 1990 xfs_bmapi_aflag(whichfork),
1991 NULL, 0, mapp, &nmap, NULL, NULL))) 1991 NULL, 0, mapp, &nmap, NULL, NULL)))
1992 goto exit0; 1992 goto exit0;
1993 } 1993 }
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee713492..f8278cfcc1d3 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
55 struct file *file, *target_file; 55 struct file *file, *target_file;
56 int error = 0; 56 int error = 0;
57 57
58 sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
59 if (!sxp) {
60 error = XFS_ERROR(ENOMEM);
61 goto out;
62 }
63
64 /* Pull information for the target fd */ 58 /* Pull information for the target fd */
65 file = fget((int)sxp->sx_fdtarget); 59 file = fget((int)sxp->sx_fdtarget);
66 if (!file) { 60 if (!file) {
67 error = XFS_ERROR(EINVAL); 61 error = XFS_ERROR(EINVAL);
68 goto out_free_sxp; 62 goto out;
69 } 63 }
70 64
71 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { 65 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
109 fput(target_file); 103 fput(target_file);
110 out_put_file: 104 out_put_file:
111 fput(file); 105 fput(file);
112 out_free_sxp:
113 kmem_free(sxp);
114 out: 106 out:
115 return error; 107 return error;
116} 108}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e6ebbaeb4dc6..ab016e5ae7be 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc(
357 int ioffset = i << args.mp->m_sb.sb_inodelog; 357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode); 358 uint isize = sizeof(struct xfs_dinode);
359 359
360 free = XFS_MAKE_IPTR(args.mp, fbuf, i); 360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version; 362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen); 363 free->di_gen = cpu_to_be32(gen);
@@ -937,7 +937,7 @@ nextag:
937 } 937 }
938 } 938 }
939 } 939 }
940 offset = XFS_IALLOC_FIND_FREE(&rec.ir_free); 940 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 941 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 942 ASSERT(offset < XFS_INODES_PER_CHUNK);
943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1279,7 +1279,7 @@ xfs_imap(
1279 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1280 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1281 1281
1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno); 1282 cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1284 1284
1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); 1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 50f558a4e0a8..aeee8278f92c 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -39,7 +39,6 @@ struct xfs_trans;
39/* 39/*
40 * Make an inode pointer out of the buffer/offset. 40 * Make an inode pointer out of the buffer/offset.
41 */ 41 */
42#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o)
43static inline struct xfs_dinode * 42static inline struct xfs_dinode *
44xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) 43xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
45{ 44{
@@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
50/* 49/*
51 * Find a free (set) bit in the inode bitmask. 50 * Find a free (set) bit in the inode bitmask.
52 */ 51 */
53#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp)
54static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) 52static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
55{ 53{
56 return xfs_lowbit64(*fp); 54 return xfs_lowbit64(*fp);
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 37e5dd01a577..5580e255ff06 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -36,7 +36,6 @@ typedef __uint64_t xfs_inofree_t;
36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) 36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
38 38
39#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n)
40static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 39static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
41{ 40{
42 return (((n) >= XFS_INODES_PER_CHUNK ? \ 41 return (((n) >= XFS_INODES_PER_CHUNK ? \
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e2fb6210d4c5..478e587087fe 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -246,9 +246,6 @@ xfs_iget_cache_miss(
246 goto out_destroy; 246 goto out_destroy;
247 } 247 }
248 248
249 if (lock_flags)
250 xfs_ilock(ip, lock_flags);
251
252 /* 249 /*
253 * Preload the radix tree so we can insert safely under the 250 * Preload the radix tree so we can insert safely under the
254 * write spinlock. Note that we cannot sleep inside the preload 251 * write spinlock. Note that we cannot sleep inside the preload
@@ -256,7 +253,16 @@ xfs_iget_cache_miss(
256 */ 253 */
257 if (radix_tree_preload(GFP_KERNEL)) { 254 if (radix_tree_preload(GFP_KERNEL)) {
258 error = EAGAIN; 255 error = EAGAIN;
259 goto out_unlock; 256 goto out_destroy;
257 }
258
259 /*
260 * Because the inode hasn't been added to the radix-tree yet it can't
261 * be found by another thread, so we can do the non-sleeping lock here.
262 */
263 if (lock_flags) {
264 if (!xfs_ilock_nowait(ip, lock_flags))
265 BUG();
260 } 266 }
261 267
262 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 268 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
@@ -284,7 +290,6 @@ xfs_iget_cache_miss(
284out_preload_end: 290out_preload_end:
285 write_unlock(&pag->pag_ici_lock); 291 write_unlock(&pag->pag_ici_lock);
286 radix_tree_preload_end(); 292 radix_tree_preload_end();
287out_unlock:
288 if (lock_flags) 293 if (lock_flags)
289 xfs_iunlock(ip, lock_flags); 294 xfs_iunlock(ip, lock_flags);
290out_destroy: 295out_destroy:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5a5e035e5d38..e7ae08d1df48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -424,6 +424,19 @@ xfs_iformat(
424 case XFS_DINODE_FMT_LOCAL: 424 case XFS_DINODE_FMT_LOCAL:
425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
426 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
427
428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
429 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
430 "corrupt inode %Lu "
431 "(bad attr fork size %Ld).",
432 (unsigned long long) ip->i_ino,
433 (long long) size);
434 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
435 XFS_ERRLEVEL_LOW,
436 ip->i_mount, dip);
437 return XFS_ERROR(EFSCORRUPTED);
438 }
439
427 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 440 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
428 break; 441 break;
429 case XFS_DINODE_FMT_EXTENTS: 442 case XFS_DINODE_FMT_EXTENTS:
@@ -1601,10 +1614,10 @@ xfs_itruncate_finish(
1601 * in this file with garbage in them once recovery 1614 * in this file with garbage in them once recovery
1602 * runs. 1615 * runs.
1603 */ 1616 */
1604 XFS_BMAP_INIT(&free_list, &first_block); 1617 xfs_bmap_init(&free_list, &first_block);
1605 error = xfs_bunmapi(ntp, ip, 1618 error = xfs_bunmapi(ntp, ip,
1606 first_unmap_block, unmap_len, 1619 first_unmap_block, unmap_len,
1607 XFS_BMAPI_AFLAG(fork) | 1620 xfs_bmapi_aflag(fork) |
1608 (sync ? 0 : XFS_BMAPI_ASYNC), 1621 (sync ? 0 : XFS_BMAPI_ASYNC),
1609 XFS_ITRUNC_MAX_EXTENTS, 1622 XFS_ITRUNC_MAX_EXTENTS,
1610 &first_block, &free_list, 1623 &first_block, &free_list,
@@ -2557,7 +2570,7 @@ xfs_iextents_copy(
2557 for (i = 0; i < nrecs; i++) { 2570 for (i = 0; i < nrecs; i++) {
2558 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2571 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2559 start_block = xfs_bmbt_get_startblock(ep); 2572 start_block = xfs_bmbt_get_startblock(ep);
2560 if (ISNULLSTARTBLOCK(start_block)) { 2573 if (isnullstartblock(start_block)) {
2561 /* 2574 /*
2562 * It's a delayed allocation extent, so skip it. 2575 * It's a delayed allocation extent, so skip it.
2563 */ 2576 */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 1ff04cc323ad..9957d0602d54 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 {
111 111
112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) 112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
113 113
114
115#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
116static inline int xfs_ilog_fbroot(int w) 114static inline int xfs_ilog_fbroot(int w)
117{ 115{
118 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); 116 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
119} 117}
120 118
121#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
122static inline int xfs_ilog_fext(int w) 119static inline int xfs_ilog_fext(int w)
123{ 120{
124 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); 121 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
125} 122}
126 123
127#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
128static inline int xfs_ilog_fdata(int w) 124static inline int xfs_ilog_fdata(int w)
129{ 125{
130 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); 126 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 911062cf73a6..08ce72316bfe 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -155,7 +155,7 @@ xfs_imap_to_bmap(
155 iomapp->iomap_bn = IOMAP_DADDR_NULL; 155 iomapp->iomap_bn = IOMAP_DADDR_NULL;
156 iomapp->iomap_flags |= IOMAP_DELAY; 156 iomapp->iomap_flags |= IOMAP_DELAY;
157 } else { 157 } else {
158 iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block); 158 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
159 if (ISUNWRITTEN(imap)) 159 if (ISUNWRITTEN(imap))
160 iomapp->iomap_flags |= IOMAP_UNWRITTEN; 160 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
161 } 161 }
@@ -261,7 +261,7 @@ xfs_iomap(
261 xfs_iunlock(ip, lockmode); 261 xfs_iunlock(ip, lockmode);
262 lockmode = 0; 262 lockmode = 0;
263 263
264 if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { 264 if (nimaps && !isnullstartblock(imap.br_startblock)) {
265 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, 265 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
266 offset, count, iomapp, &imap, flags); 266 offset, count, iomapp, &imap, flags);
267 break; 267 break;
@@ -491,7 +491,7 @@ xfs_iomap_write_direct(
491 /* 491 /*
492 * Issue the xfs_bmapi() call to allocate the blocks 492 * Issue the xfs_bmapi() call to allocate the blocks
493 */ 493 */
494 XFS_BMAP_INIT(&free_list, &firstfsb); 494 xfs_bmap_init(&free_list, &firstfsb);
495 nimaps = 1; 495 nimaps = 1;
496 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, 496 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
497 &firstfsb, 0, &imap, &nimaps, &free_list, NULL); 497 &firstfsb, 0, &imap, &nimaps, &free_list, NULL);
@@ -751,7 +751,7 @@ xfs_iomap_write_allocate(
751 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 751 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
752 xfs_trans_ihold(tp, ip); 752 xfs_trans_ihold(tp, ip);
753 753
754 XFS_BMAP_INIT(&free_list, &first_block); 754 xfs_bmap_init(&free_list, &first_block);
755 755
756 /* 756 /*
757 * it is possible that the extents have changed since 757 * it is possible that the extents have changed since
@@ -911,7 +911,7 @@ xfs_iomap_write_unwritten(
911 /* 911 /*
912 * Modify the unwritten extent state of the buffer. 912 * Modify the unwritten extent state of the buffer.
913 */ 913 */
914 XFS_BMAP_INIT(&free_list, &firstfsb); 914 xfs_bmap_init(&free_list, &firstfsb);
915 nimaps = 1; 915 nimaps = 1;
916 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 916 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
917 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 917 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e19d0a8d5618..cf98a805ec90 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -453,7 +453,7 @@ xfs_bulkstat(
453 (chunkidx = agino - gino + 1) < 453 (chunkidx = agino - gino + 1) <
454 XFS_INODES_PER_CHUNK && 454 XFS_INODES_PER_CHUNK &&
455 /* there are some left allocated */ 455 /* there are some left allocated */
456 XFS_INOBT_MASKN(chunkidx, 456 xfs_inobt_maskn(chunkidx,
457 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { 457 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
458 /* 458 /*
459 * Grab the chunk record. Mark all the 459 * Grab the chunk record. Mark all the
@@ -464,7 +464,7 @@ xfs_bulkstat(
464 if (XFS_INOBT_MASK(i) & ~gfree) 464 if (XFS_INOBT_MASK(i) & ~gfree)
465 gcnt++; 465 gcnt++;
466 } 466 }
467 gfree |= XFS_INOBT_MASKN(0, chunkidx); 467 gfree |= xfs_inobt_maskn(0, chunkidx);
468 irbp->ir_startino = gino; 468 irbp->ir_startino = gino;
469 irbp->ir_freecount = gcnt; 469 irbp->ir_freecount = gcnt;
470 irbp->ir_free = gfree; 470 irbp->ir_free = gfree;
@@ -535,7 +535,7 @@ xfs_bulkstat(
535 chunkidx < XFS_INODES_PER_CHUNK; 535 chunkidx < XFS_INODES_PER_CHUNK;
536 chunkidx += nicluster, 536 chunkidx += nicluster,
537 agbno += nbcluster) { 537 agbno += nbcluster) {
538 if (XFS_INOBT_MASKN(chunkidx, 538 if (xfs_inobt_maskn(chunkidx,
539 nicluster) & ~gfree) 539 nicluster) & ~gfree)
540 xfs_btree_reada_bufs(mp, agno, 540 xfs_btree_reada_bufs(mp, agno,
541 agbno, nbcluster); 541 agbno, nbcluster);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94c..61af610d79b3 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void xlog_recover_check_summary(xlog_t *);
70xfs_buf_t * 70xfs_buf_t *
71xlog_get_bp( 71xlog_get_bp(
72 xlog_t *log, 72 xlog_t *log,
73 int num_bblks) 73 int nbblks)
74{ 74{
75 ASSERT(num_bblks > 0); 75 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
76 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
77 XFS_ERROR_REPORT("xlog_get_bp(1)",
78 XFS_ERRLEVEL_HIGH, log->l_mp);
79 return NULL;
80 }
76 81
77 if (log->l_sectbb_log) { 82 if (log->l_sectbb_log) {
78 if (num_bblks > 1) 83 if (nbblks > 1)
79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 84 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 85 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
81 } 86 }
82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 87 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
83} 88}
84 89
85void 90void
@@ -102,6 +107,13 @@ xlog_bread(
102{ 107{
103 int error; 108 int error;
104 109
110 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
111 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
112 XFS_ERROR_REPORT("xlog_bread(1)",
113 XFS_ERRLEVEL_HIGH, log->l_mp);
114 return EFSCORRUPTED;
115 }
116
105 if (log->l_sectbb_log) { 117 if (log->l_sectbb_log) {
106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 118 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 119 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
139{ 151{
140 int error; 152 int error;
141 153
154 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
155 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
156 XFS_ERROR_REPORT("xlog_bwrite(1)",
157 XFS_ERRLEVEL_HIGH, log->l_mp);
158 return EFSCORRUPTED;
159 }
160
142 if (log->l_sectbb_log) { 161 if (log->l_sectbb_log) {
143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 162 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 163 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -1436,10 +1455,19 @@ xlog_recover_add_to_trans(
1436 item = item->ri_prev; 1455 item = item->ri_prev;
1437 1456
1438 if (item->ri_total == 0) { /* first region to be added */ 1457 if (item->ri_total == 0) { /* first region to be added */
1439 item->ri_total = in_f->ilf_size; 1458 if (in_f->ilf_size == 0 ||
1440 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); 1459 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1441 item->ri_buf = kmem_zalloc((item->ri_total * 1460 xlog_warn(
1442 sizeof(xfs_log_iovec_t)), KM_SLEEP); 1461 "XFS: bad number of regions (%d) in inode log format",
1462 in_f->ilf_size);
1463 ASSERT(0);
1464 return XFS_ERROR(EIO);
1465 }
1466
1467 item->ri_total = in_f->ilf_size;
1468 item->ri_buf =
1469 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1470 KM_SLEEP);
1443 } 1471 }
1444 ASSERT(item->ri_total > item->ri_cnt); 1472 ASSERT(item->ri_total > item->ri_cnt);
1445 /* Description region is ri_buf[0] */ 1473 /* Description region is ri_buf[0] */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3c97c6463a4e..35300250e86d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h" 46#include "xfs_utils.h"
47 47
48STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
49STATIC int xfs_uuid_mount(xfs_mount_t *); 48STATIC int xfs_uuid_mount(xfs_mount_t *);
50STATIC void xfs_unmountfs_wait(xfs_mount_t *); 49STATIC void xfs_unmountfs_wait(xfs_mount_t *);
51 50
@@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
682 * Update alignment values based on mount options and sb values 681 * Update alignment values based on mount options and sb values
683 */ 682 */
684STATIC int 683STATIC int
685xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) 684xfs_update_alignment(xfs_mount_t *mp)
686{ 685{
687 xfs_sb_t *sbp = &(mp->m_sb); 686 xfs_sb_t *sbp = &(mp->m_sb);
688 687
@@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
736 if (xfs_sb_version_hasdalign(sbp)) { 735 if (xfs_sb_version_hasdalign(sbp)) {
737 if (sbp->sb_unit != mp->m_dalign) { 736 if (sbp->sb_unit != mp->m_dalign) {
738 sbp->sb_unit = mp->m_dalign; 737 sbp->sb_unit = mp->m_dalign;
739 *update_flags |= XFS_SB_UNIT; 738 mp->m_update_flags |= XFS_SB_UNIT;
740 } 739 }
741 if (sbp->sb_width != mp->m_swidth) { 740 if (sbp->sb_width != mp->m_swidth) {
742 sbp->sb_width = mp->m_swidth; 741 sbp->sb_width = mp->m_swidth;
743 *update_flags |= XFS_SB_WIDTH; 742 mp->m_update_flags |= XFS_SB_WIDTH;
744 } 743 }
745 } 744 }
746 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 745 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
@@ -905,7 +904,6 @@ xfs_mountfs(
905 xfs_sb_t *sbp = &(mp->m_sb); 904 xfs_sb_t *sbp = &(mp->m_sb);
906 xfs_inode_t *rip; 905 xfs_inode_t *rip;
907 __uint64_t resblks; 906 __uint64_t resblks;
908 __int64_t update_flags = 0LL;
909 uint quotamount, quotaflags; 907 uint quotamount, quotaflags;
910 int uuid_mounted = 0; 908 int uuid_mounted = 0;
911 int error = 0; 909 int error = 0;
@@ -933,7 +931,7 @@ xfs_mountfs(
933 "XFS: correcting sb_features alignment problem"); 931 "XFS: correcting sb_features alignment problem");
934 sbp->sb_features2 |= sbp->sb_bad_features2; 932 sbp->sb_features2 |= sbp->sb_bad_features2;
935 sbp->sb_bad_features2 = sbp->sb_features2; 933 sbp->sb_bad_features2 = sbp->sb_features2;
936 update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 934 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
937 935
938 /* 936 /*
939 * Re-check for ATTR2 in case it was found in bad_features2 937 * Re-check for ATTR2 in case it was found in bad_features2
@@ -947,11 +945,11 @@ xfs_mountfs(
947 if (xfs_sb_version_hasattr2(&mp->m_sb) && 945 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
948 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 946 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
949 xfs_sb_version_removeattr2(&mp->m_sb); 947 xfs_sb_version_removeattr2(&mp->m_sb);
950 update_flags |= XFS_SB_FEATURES2; 948 mp->m_update_flags |= XFS_SB_FEATURES2;
951 949
952 /* update sb_versionnum for the clearing of the morebits */ 950 /* update sb_versionnum for the clearing of the morebits */
953 if (!sbp->sb_features2) 951 if (!sbp->sb_features2)
954 update_flags |= XFS_SB_VERSIONNUM; 952 mp->m_update_flags |= XFS_SB_VERSIONNUM;
955 } 953 }
956 954
957 /* 955 /*
@@ -960,7 +958,7 @@ xfs_mountfs(
960 * allocator alignment is within an ag, therefore ag has 958 * allocator alignment is within an ag, therefore ag has
961 * to be aligned at stripe boundary. 959 * to be aligned at stripe boundary.
962 */ 960 */
963 error = xfs_update_alignment(mp, &update_flags); 961 error = xfs_update_alignment(mp);
964 if (error) 962 if (error)
965 goto error1; 963 goto error1;
966 964
@@ -1137,10 +1135,12 @@ xfs_mountfs(
1137 } 1135 }
1138 1136
1139 /* 1137 /*
1140 * If fs is not mounted readonly, then update the superblock changes. 1138 * If this is a read-only mount defer the superblock updates until
1139 * the next remount into writeable mode. Otherwise we would never
1140 * perform the update e.g. for the root filesystem.
1141 */ 1141 */
1142 if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1142 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1143 error = xfs_mount_log_sb(mp, update_flags); 1143 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1144 if (error) { 1144 if (error) {
1145 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1145 cmn_err(CE_WARN, "XFS: failed to write sb changes");
1146 goto error4; 1146 goto error4;
@@ -1820,7 +1820,7 @@ xfs_uuid_mount(
1820 * be altered by the mount options, as well as any potential sb_features2 1820 * be altered by the mount options, as well as any potential sb_features2
1821 * fixup. Only the first superblock is updated. 1821 * fixup. Only the first superblock is updated.
1822 */ 1822 */
1823STATIC int 1823int
1824xfs_mount_log_sb( 1824xfs_mount_log_sb(
1825 xfs_mount_t *mp, 1825 xfs_mount_t *mp,
1826 __int64_t fields) 1826 __int64_t fields)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c1e028467327..f5e9937f9bdb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations {
44 44
45#ifndef __KERNEL__ 45#ifndef __KERNEL__
46 46
47#define XFS_DADDR_TO_AGNO(mp,d) \ 47#define xfs_daddr_to_agno(mp,d) \
48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) 48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
49#define XFS_DADDR_TO_AGBNO(mp,d) \ 49#define xfs_daddr_to_agbno(mp,d) \
50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) 50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
@@ -327,6 +327,8 @@ typedef struct xfs_mount {
327 spinlock_t m_sync_lock; /* work item list lock */ 327 spinlock_t m_sync_lock; /* work item list lock */
328 int m_sync_seq; /* sync thread generation no. */ 328 int m_sync_seq; /* sync thread generation no. */
329 wait_queue_head_t m_wait_single_sync_task; 329 wait_queue_head_t m_wait_single_sync_task;
330 __int64_t m_update_flags; /* sb flags we need to update
331 on the next remount,rw */
330} xfs_mount_t; 332} xfs_mount_t;
331 333
332/* 334/*
@@ -439,7 +441,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
439 */ 441 */
440#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ 442#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
441 443
442#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
443static inline xfs_agnumber_t 444static inline xfs_agnumber_t
444xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) 445xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
445{ 446{
@@ -448,7 +449,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
448 return (xfs_agnumber_t) ld; 449 return (xfs_agnumber_t) ld;
449} 450}
450 451
451#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d)
452static inline xfs_agblock_t 452static inline xfs_agblock_t
453xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) 453xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
454{ 454{
@@ -514,6 +514,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
514 int64_t, int); 514 int64_t, int);
515extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 515extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
516 uint, int); 516 uint, int);
517extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
517extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 518extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
518extern int xfs_readsb(xfs_mount_t *, int); 519extern int xfs_readsb(xfs_mount_t *, int);
519extern void xfs_freesb(xfs_mount_t *); 520extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 86471bb40fd4..58f85e9cd11d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -147,7 +147,7 @@ xfs_rename(
147 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, 147 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
148 inodes, &num_inodes); 148 inodes, &num_inodes);
149 149
150 XFS_BMAP_INIT(&free_list, &first_block); 150 xfs_bmap_init(&free_list, &first_block);
151 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 151 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
152 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 152 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
153 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 153 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index edf12c7b834c..c5bb86f3ec05 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -120,7 +120,7 @@ xfs_growfs_rt_alloc(
120 if ((error = xfs_trans_iget(mp, tp, ino, 0, 120 if ((error = xfs_trans_iget(mp, tp, ino, 0,
121 XFS_ILOCK_EXCL, &ip))) 121 XFS_ILOCK_EXCL, &ip)))
122 goto error_cancel; 122 goto error_cancel;
123 XFS_BMAP_INIT(&flist, &firstblock); 123 xfs_bmap_init(&flist, &firstblock);
124 /* 124 /*
125 * Allocate blocks to the bitmap file. 125 * Allocate blocks to the bitmap file.
126 */ 126 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f87db5344ce6..f76c003ec55d 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -28,7 +28,6 @@ struct xfs_mount;
28 * file is a real time file or not, because the bmap code 28 * file is a real time file or not, because the bmap code
29 * does. 29 * does.
30 */ 30 */
31#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb)
32static inline xfs_daddr_t 31static inline xfs_daddr_t
33xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 32xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
34{ 33{
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1ed71916e4c9..1b017c657494 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
505 505
506#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) 506#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
507#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ 507#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
508 XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d)) 508 xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
509#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ 509#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
510 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) 510 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
511 511
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f07bf8768c3a..0e55c5d7db5f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt(
862 * Find the block(s) so we can inval and unmap them. 862 * Find the block(s) so we can inval and unmap them.
863 */ 863 */
864 done = 0; 864 done = 0;
865 XFS_BMAP_INIT(&free_list, &first_block); 865 xfs_bmap_init(&free_list, &first_block);
866 nmaps = ARRAY_SIZE(mval); 866 nmaps = ARRAY_SIZE(mval);
867 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), 867 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
868 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, 868 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
@@ -1288,7 +1288,7 @@ xfs_inactive(
1288 /* 1288 /*
1289 * Free the inode. 1289 * Free the inode.
1290 */ 1290 */
1291 XFS_BMAP_INIT(&free_list, &first_block); 1291 xfs_bmap_init(&free_list, &first_block);
1292 error = xfs_ifree(tp, ip, &free_list); 1292 error = xfs_ifree(tp, ip, &free_list);
1293 if (error) { 1293 if (error) {
1294 /* 1294 /*
@@ -1461,7 +1461,7 @@ xfs_create(
1461 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 1461 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1462 unlock_dp_on_error = B_TRUE; 1462 unlock_dp_on_error = B_TRUE;
1463 1463
1464 XFS_BMAP_INIT(&free_list, &first_block); 1464 xfs_bmap_init(&free_list, &first_block);
1465 1465
1466 ASSERT(ip == NULL); 1466 ASSERT(ip == NULL);
1467 1467
@@ -1879,7 +1879,7 @@ xfs_remove(
1879 } 1879 }
1880 } 1880 }
1881 1881
1882 XFS_BMAP_INIT(&free_list, &first_block); 1882 xfs_bmap_init(&free_list, &first_block);
1883 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 1883 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
1884 &first_block, &free_list, resblks); 1884 &first_block, &free_list, resblks);
1885 if (error) { 1885 if (error) {
@@ -2059,7 +2059,7 @@ xfs_link(
2059 if (error) 2059 if (error)
2060 goto error_return; 2060 goto error_return;
2061 2061
2062 XFS_BMAP_INIT(&free_list, &first_block); 2062 xfs_bmap_init(&free_list, &first_block);
2063 2063
2064 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 2064 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2065 &first_block, &free_list, resblks); 2065 &first_block, &free_list, resblks);
@@ -2231,7 +2231,7 @@ xfs_mkdir(
2231 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2231 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2232 unlock_dp_on_error = B_FALSE; 2232 unlock_dp_on_error = B_FALSE;
2233 2233
2234 XFS_BMAP_INIT(&free_list, &first_block); 2234 xfs_bmap_init(&free_list, &first_block);
2235 2235
2236 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, 2236 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2237 &first_block, &free_list, resblks ? 2237 &first_block, &free_list, resblks ?
@@ -2438,7 +2438,7 @@ xfs_symlink(
2438 * Initialize the bmap freelist prior to calling either 2438 * Initialize the bmap freelist prior to calling either
2439 * bmapi or the directory create code. 2439 * bmapi or the directory create code.
2440 */ 2440 */
2441 XFS_BMAP_INIT(&free_list, &first_block); 2441 xfs_bmap_init(&free_list, &first_block);
2442 2442
2443 /* 2443 /*
2444 * Allocate an inode for the symlink. 2444 * Allocate an inode for the symlink.
@@ -2860,7 +2860,7 @@ retry:
2860 /* 2860 /*
2861 * Issue the xfs_bmapi() call to allocate the blocks 2861 * Issue the xfs_bmapi() call to allocate the blocks
2862 */ 2862 */
2863 XFS_BMAP_INIT(&free_list, &firstfsb); 2863 xfs_bmap_init(&free_list, &firstfsb);
2864 error = xfs_bmapi(tp, ip, startoffset_fsb, 2864 error = xfs_bmapi(tp, ip, startoffset_fsb,
2865 allocatesize_fsb, bmapi_flag, 2865 allocatesize_fsb, bmapi_flag,
2866 &firstfsb, 0, imapp, &nimaps, 2866 &firstfsb, 0, imapp, &nimaps,
@@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes(
2980 XFS_BUF_UNDONE(bp); 2980 XFS_BUF_UNDONE(bp);
2981 XFS_BUF_UNWRITE(bp); 2981 XFS_BUF_UNWRITE(bp);
2982 XFS_BUF_READ(bp); 2982 XFS_BUF_READ(bp);
2983 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); 2983 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2984 xfsbdstrat(mp, bp); 2984 xfsbdstrat(mp, bp);
2985 error = xfs_iowait(bp); 2985 error = xfs_iowait(bp);
2986 if (error) { 2986 if (error) {
@@ -3186,7 +3186,7 @@ xfs_free_file_space(
3186 /* 3186 /*
3187 * issue the bunmapi() call to free the blocks 3187 * issue the bunmapi() call to free the blocks
3188 */ 3188 */
3189 XFS_BMAP_INIT(&free_list, &firstfsb); 3189 xfs_bmap_init(&free_list, &firstfsb);
3190 error = xfs_bunmapi(tp, ip, startoffset_fsb, 3190 error = xfs_bunmapi(tp, ip, startoffset_fsb,
3191 endoffset_fsb - startoffset_fsb, 3191 endoffset_fsb - startoffset_fsb,
3192 0, 2, &firstfsb, &free_list, NULL, &done); 3192 0, 2, &firstfsb, &free_list, NULL, &done);