author    Dmitry Torokhov <dmitry.torokhov@gmail.com>  2009-09-14 00:16:56 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>  2009-09-14 00:16:56 -0400
commit    fc8e1ead9314cf0e0f1922e661428b93d3a50d88 (patch)
tree      f3cb97c4769b74f6627a59769f1ed5c92a13c58a /fs
parent    2bcaa6a4238094c5695d5b1943078388d82d3004 (diff)
parent    9de48cc300fb10f7d9faa978670becf5e352462a (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs.c | 21
-rw-r--r-- fs/9p/v9fs.h | 1
-rw-r--r-- fs/9p/vfs_addr.c | 2
-rw-r--r-- fs/9p/vfs_inode.c | 126
-rw-r--r-- fs/9p/vfs_super.c | 42
-rw-r--r-- fs/Kconfig | 55
-rw-r--r-- fs/adfs/adfs.h | 59
-rw-r--r-- fs/adfs/dir.c | 18
-rw-r--r-- fs/adfs/dir_f.c | 25
-rw-r--r-- fs/adfs/dir_fplus.c | 25
-rw-r--r-- fs/adfs/file.c | 6
-rw-r--r-- fs/adfs/inode.c | 14
-rw-r--r-- fs/adfs/map.c | 8
-rw-r--r-- fs/adfs/super.c | 22
-rw-r--r-- fs/affs/affs.h | 1
-rw-r--r-- fs/affs/dir.c | 2
-rw-r--r-- fs/affs/file.c | 14
-rw-r--r-- fs/affs/super.c | 54
-rw-r--r-- fs/afs/dir.c | 2
-rw-r--r-- fs/afs/file.c | 18
-rw-r--r-- fs/afs/flock.c | 1
-rw-r--r-- fs/afs/misc.c | 16
-rw-r--r-- fs/afs/mntpt.c | 3
-rw-r--r-- fs/afs/super.c | 5
-rw-r--r-- fs/afs/vlocation.c | 2
-rw-r--r-- fs/aio.c | 24
-rw-r--r-- fs/anon_inodes.c | 15
-rw-r--r-- fs/autofs/dirhash.c | 5
-rw-r--r-- fs/autofs4/autofs_i.h | 6
-rw-r--r-- fs/autofs4/dev-ioctl.c | 196
-rw-r--r-- fs/autofs4/expire.c | 15
-rw-r--r-- fs/autofs4/root.c | 7
-rw-r--r-- fs/befs/linuxvfs.c | 21
-rw-r--r-- fs/bfs/dir.c | 9
-rw-r--r-- fs/bfs/file.c | 1
-rw-r--r-- fs/bfs/inode.c | 52
-rw-r--r-- fs/binfmt_elf.c | 17
-rw-r--r-- fs/binfmt_elf_fdpic.c | 8
-rw-r--r-- fs/binfmt_flat.c | 17
-rw-r--r-- fs/bio-integrity.c | 170
-rw-r--r-- fs/bio.c | 67
-rw-r--r-- fs/block_dev.c | 41
-rw-r--r-- fs/btrfs/Makefile | 4
-rw-r--r-- fs/btrfs/acl.c | 49
-rw-r--r-- fs/btrfs/async-thread.c | 8
-rw-r--r-- fs/btrfs/btrfs_inode.h | 8
-rw-r--r-- fs/btrfs/compression.c | 7
-rw-r--r-- fs/btrfs/crc32c.h | 29
-rw-r--r-- fs/btrfs/ctree.c | 815
-rw-r--r-- fs/btrfs/ctree.h | 360
-rw-r--r-- fs/btrfs/delayed-ref.c | 509
-rw-r--r-- fs/btrfs/delayed-ref.h | 85
-rw-r--r-- fs/btrfs/disk-io.c | 205
-rw-r--r-- fs/btrfs/export.c | 4
-rw-r--r-- fs/btrfs/extent-tree.c | 3610
-rw-r--r-- fs/btrfs/extent_io.c | 18
-rw-r--r-- fs/btrfs/file.c | 84
-rw-r--r-- fs/btrfs/free-space-cache.c | 1066
-rw-r--r-- fs/btrfs/free-space-cache.h | 9
-rw-r--r-- fs/btrfs/hash.h | 4
-rw-r--r-- fs/btrfs/inode.c | 218
-rw-r--r-- fs/btrfs/ioctl.c | 204
-rw-r--r-- fs/btrfs/print-tree.c | 161
-rw-r--r-- fs/btrfs/relocation.c | 3716
-rw-r--r-- fs/btrfs/root-tree.c | 17
-rw-r--r-- fs/btrfs/super.c | 65
-rw-r--r-- fs/btrfs/transaction.c | 452
-rw-r--r-- fs/btrfs/transaction.h | 13
-rw-r--r-- fs/btrfs/tree-log.c | 105
-rw-r--r-- fs/btrfs/volumes.c | 115
-rw-r--r-- fs/btrfs/volumes.h | 12
-rw-r--r-- fs/btrfs/zlib.c | 6
-rw-r--r-- fs/buffer.c | 15
-rw-r--r-- fs/cachefiles/interface.c | 4
-rw-r--r-- fs/char_dev.c | 15
-rw-r--r-- fs/cifs/CHANGES | 22
-rw-r--r-- fs/cifs/README | 33
-rw-r--r-- fs/cifs/asn1.c | 55
-rw-r--r-- fs/cifs/cifs_debug.c | 8
-rw-r--r-- fs/cifs/cifs_dfs_ref.c | 14
-rw-r--r-- fs/cifs/cifs_spnego.c | 15
-rw-r--r-- fs/cifs/cifs_unicode.c | 2
-rw-r--r-- fs/cifs/cifsacl.c | 200
-rw-r--r-- fs/cifs/cifsfs.c | 170
-rw-r--r-- fs/cifs/cifsfs.h | 17
-rw-r--r-- fs/cifs/cifsglob.h | 32
-rw-r--r-- fs/cifs/cifspdu.h | 14
-rw-r--r-- fs/cifs/cifsproto.h | 29
-rw-r--r-- fs/cifs/cifssmb.c | 159
-rw-r--r-- fs/cifs/connect.c | 129
-rw-r--r-- fs/cifs/dir.c | 52
-rw-r--r-- fs/cifs/dns_resolve.c | 25
-rw-r--r-- fs/cifs/file.c | 42
-rw-r--r-- fs/cifs/inode.c | 785
-rw-r--r-- fs/cifs/link.c | 3
-rw-r--r-- fs/cifs/netmisc.c | 80
-rw-r--r-- fs/cifs/readdir.c | 517
-rw-r--r-- fs/cifs/sess.c | 2
-rw-r--r-- fs/cifs/xattr.c | 12
-rw-r--r-- fs/coda/file.c | 9
-rw-r--r-- fs/compat.c | 13
-rw-r--r-- fs/compat_ioctl.c | 110
-rw-r--r-- fs/configfs/configfs_internal.h | 3
-rw-r--r-- fs/configfs/dir.c | 196
-rw-r--r-- fs/configfs/inode.c | 38
-rw-r--r-- fs/dcache.c | 7
-rw-r--r-- fs/debugfs/file.c | 65
-rw-r--r-- fs/debugfs/inode.c | 11
-rw-r--r-- fs/devpts/inode.c | 14
-rw-r--r-- fs/direct-io.c | 2
-rw-r--r-- fs/dlm/dir.c | 7
-rw-r--r-- fs/dlm/lock.c | 2
-rw-r--r-- fs/dlm/lockspace.c | 17
-rw-r--r-- fs/dlm/lowcomms.c | 26
-rw-r--r-- fs/dlm/lowcomms.h | 3
-rw-r--r-- fs/dlm/member.c | 19
-rw-r--r-- fs/dlm/plock.c | 17
-rw-r--r-- fs/dlm/requestqueue.c | 2
-rw-r--r-- fs/drop_caches.c | 2
-rw-r--r-- fs/ecryptfs/keystore.c | 13
-rw-r--r-- fs/ecryptfs/super.c | 5
-rw-r--r-- fs/efs/dir.c | 5
-rw-r--r-- fs/efs/namei.c | 9
-rw-r--r-- fs/efs/symlink.c | 7
-rw-r--r-- fs/eventfd.c | 125
-rw-r--r-- fs/eventpoll.c | 21
-rw-r--r-- fs/exec.c | 25
-rw-r--r-- fs/exofs/common.h | 10
-rw-r--r-- fs/exofs/dir.c | 4
-rw-r--r-- fs/exofs/exofs.h | 7
-rw-r--r-- fs/exofs/file.c | 21
-rw-r--r-- fs/exofs/inode.c | 15
-rw-r--r-- fs/exofs/namei.c | 4
-rw-r--r-- fs/exofs/osd.c | 34
-rw-r--r-- fs/exofs/super.c | 30
-rw-r--r-- fs/exofs/symlink.c | 4
-rw-r--r-- fs/ext2/Makefile | 2
-rw-r--r-- fs/ext2/acl.c | 81
-rw-r--r-- fs/ext2/acl.h | 4
-rw-r--r-- fs/ext2/dir.c | 7
-rw-r--r-- fs/ext2/ext2.h | 11
-rw-r--r-- fs/ext2/file.c | 4
-rw-r--r-- fs/ext2/fsync.c | 50
-rw-r--r-- fs/ext2/inode.c | 15
-rw-r--r-- fs/ext2/ioctl.c | 1
-rw-r--r-- fs/ext2/namei.c | 17
-rw-r--r-- fs/ext2/super.c | 77
-rw-r--r-- fs/ext3/Kconfig | 32
-rw-r--r-- fs/ext3/acl.c | 82
-rw-r--r-- fs/ext3/acl.h | 4
-rw-r--r-- fs/ext3/balloc.c | 3
-rw-r--r-- fs/ext3/dir.c | 3
-rw-r--r-- fs/ext3/ialloc.c | 3
-rw-r--r-- fs/ext3/inode.c | 59
-rw-r--r-- fs/ext3/resize.c | 4
-rw-r--r-- fs/ext3/super.c | 97
-rw-r--r-- fs/ext3/xattr.c | 1
-rw-r--r-- fs/ext4/Makefile | 4
-rw-r--r-- fs/ext4/acl.c | 64
-rw-r--r-- fs/ext4/acl.h | 4
-rw-r--r-- fs/ext4/balloc.c | 28
-rw-r--r-- fs/ext4/block_validity.c | 244
-rw-r--r-- fs/ext4/dir.c | 3
-rw-r--r-- fs/ext4/ext4.h | 401
-rw-r--r-- fs/ext4/ext4_extents.h | 4
-rw-r--r-- fs/ext4/ext4_i.h | 140
-rw-r--r-- fs/ext4/ext4_jbd2.c | 4
-rw-r--r-- fs/ext4/ext4_jbd2.h | 6
-rw-r--r-- fs/ext4/ext4_sb.h | 161
-rw-r--r-- fs/ext4/extents.c | 90
-rw-r--r-- fs/ext4/file.c | 36
-rw-r--r-- fs/ext4/fsync.c | 8
-rw-r--r-- fs/ext4/group.h | 29
-rw-r--r-- fs/ext4/ialloc.c | 119
-rw-r--r-- fs/ext4/inode.c | 1170
-rw-r--r-- fs/ext4/ioctl.c | 57
-rw-r--r-- fs/ext4/mballoc.c | 291
-rw-r--r-- fs/ext4/mballoc.h | 2
-rw-r--r-- fs/ext4/migrate.c | 8
-rw-r--r-- fs/ext4/move_extent.c | 1320
-rw-r--r-- fs/ext4/namei.c | 37
-rw-r--r-- fs/ext4/namei.h | 8
-rw-r--r-- fs/ext4/resize.c | 38
-rw-r--r-- fs/ext4/super.c | 889
-rw-r--r-- fs/fat/cache.c | 6
-rw-r--r-- fs/fat/dir.c | 48
-rw-r--r-- fs/fat/fat.h | 13
-rw-r--r-- fs/fat/fatent.c | 17
-rw-r--r-- fs/fat/file.c | 200
-rw-r--r-- fs/fat/inode.c | 61
-rw-r--r-- fs/fat/misc.c | 22
-rw-r--r-- fs/fat/namei_msdos.c | 7
-rw-r--r-- fs/fat/namei_vfat.c | 11
-rw-r--r-- fs/fcntl.c | 34
-rw-r--r-- fs/file_table.c | 40
-rw-r--r-- fs/freevxfs/vxfs_super.c | 5
-rw-r--r-- fs/fs-writeback.c | 198
-rw-r--r-- fs/fuse/Makefile | 1
-rw-r--r-- fs/fuse/cuse.c | 610
-rw-r--r-- fs/fuse/dev.c | 106
-rw-r--r-- fs/fuse/dir.c | 90
-rw-r--r-- fs/fuse/file.c | 348
-rw-r--r-- fs/fuse/fuse_i.h | 74
-rw-r--r-- fs/fuse/inode.c | 189
-rw-r--r-- fs/gfs2/Kconfig | 3
-rw-r--r-- fs/gfs2/Makefile | 5
-rw-r--r-- fs/gfs2/aops.c (renamed from fs/gfs2/ops_address.c) | 60
-rw-r--r-- fs/gfs2/bmap.c | 15
-rw-r--r-- fs/gfs2/dentry.c (renamed from fs/gfs2/ops_dentry.c) | 0
-rw-r--r-- fs/gfs2/dir.c | 11
-rw-r--r-- fs/gfs2/eattr.c | 14
-rw-r--r-- fs/gfs2/export.c (renamed from fs/gfs2/ops_export.c) | 0
-rw-r--r-- fs/gfs2/file.c (renamed from fs/gfs2/ops_file.c) | 36
-rw-r--r-- fs/gfs2/glock.c | 171
-rw-r--r-- fs/gfs2/glock.h | 3
-rw-r--r-- fs/gfs2/glops.c | 41
-rw-r--r-- fs/gfs2/incore.h | 29
-rw-r--r-- fs/gfs2/inode.c | 150
-rw-r--r-- fs/gfs2/inode.h | 52
-rw-r--r-- fs/gfs2/log.c | 17
-rw-r--r-- fs/gfs2/lops.c | 17
-rw-r--r-- fs/gfs2/main.c | 8
-rw-r--r-- fs/gfs2/meta_io.c | 105
-rw-r--r-- fs/gfs2/mount.c | 185
-rw-r--r-- fs/gfs2/ops_address.h | 23
-rw-r--r-- fs/gfs2/ops_fstype.c | 74
-rw-r--r-- fs/gfs2/ops_inode.c | 146
-rw-r--r-- fs/gfs2/ops_super.c | 723
-rw-r--r-- fs/gfs2/quota.c | 1
-rw-r--r-- fs/gfs2/recovery.c | 102
-rw-r--r-- fs/gfs2/recovery.h | 2
-rw-r--r-- fs/gfs2/rgrp.c | 175
-rw-r--r-- fs/gfs2/rgrp.h | 47
-rw-r--r-- fs/gfs2/super.c | 930
-rw-r--r-- fs/gfs2/super.h | 4
-rw-r--r-- fs/gfs2/sys.c | 245
-rw-r--r-- fs/gfs2/trace_gfs2.h | 407
-rw-r--r-- fs/gfs2/trans.c | 9
-rw-r--r-- fs/hfs/super.c | 24
-rw-r--r-- fs/hfsplus/super.c | 26
-rw-r--r-- fs/hostfs/hostfs_kern.c | 1
-rw-r--r-- fs/hpfs/dir.c | 1
-rw-r--r-- fs/hpfs/file.c | 1
-rw-r--r-- fs/hpfs/hpfs_fn.h | 1
-rw-r--r-- fs/hpfs/inode.c | 1
-rw-r--r-- fs/hpfs/namei.c | 1
-rw-r--r-- fs/hpfs/super.c | 12
-rw-r--r-- fs/hugetlbfs/inode.c | 22
-rw-r--r-- fs/inode.c | 81
-rw-r--r-- fs/internal.h | 17
-rw-r--r-- fs/ioctl.c | 51
-rw-r--r-- fs/isofs/dir.c | 5
-rw-r--r-- fs/isofs/inode.c | 127
-rw-r--r-- fs/isofs/isofs.h | 27
-rw-r--r-- fs/isofs/joliet.c | 36
-rw-r--r-- fs/isofs/namei.c | 4
-rw-r--r-- fs/jbd/journal.c | 26
-rw-r--r-- fs/jbd/transaction.c | 116
-rw-r--r-- fs/jbd2/checkpoint.c | 5
-rw-r--r-- fs/jbd2/commit.c | 13
-rw-r--r-- fs/jbd2/journal.c | 98
-rw-r--r-- fs/jbd2/transaction.c | 117
-rw-r--r-- fs/jffs2/acl.c | 87
-rw-r--r-- fs/jffs2/acl.h | 4
-rw-r--r-- fs/jffs2/erase.c | 10
-rw-r--r-- fs/jffs2/file.c | 2
-rw-r--r-- fs/jffs2/fs.c | 18
-rw-r--r-- fs/jffs2/jffs2_fs_i.h | 4
-rw-r--r-- fs/jffs2/os-linux.h | 5
-rw-r--r-- fs/jffs2/readinode.c | 1
-rw-r--r-- fs/jffs2/scan.c | 8
-rw-r--r-- fs/jffs2/super.c | 27
-rw-r--r-- fs/jfs/acl.c | 45
-rw-r--r-- fs/jfs/jfs_extent.c | 1
-rw-r--r-- fs/jfs/jfs_imap.c | 1
-rw-r--r-- fs/jfs/jfs_incore.h | 6
-rw-r--r-- fs/jfs/super.c | 47
-rw-r--r-- fs/jfs/xattr.c | 10
-rw-r--r-- fs/libfs.c | 27
-rw-r--r-- fs/lockd/clntproc.c | 5
-rw-r--r-- fs/lockd/mon.c | 19
-rw-r--r-- fs/lockd/svc4proc.c | 1
-rw-r--r-- fs/lockd/svclock.c | 2
-rw-r--r-- fs/lockd/svcproc.c | 1
-rw-r--r-- fs/locks.c | 3
-rw-r--r-- fs/minix/bitmap.c | 25
-rw-r--r-- fs/minix/dir.c | 7
-rw-r--r-- fs/minix/file.c | 20
-rw-r--r-- fs/minix/inode.c | 35
-rw-r--r-- fs/minix/minix.h | 7
-rw-r--r-- fs/mpage.c | 6
-rw-r--r-- fs/namei.c | 151
-rw-r--r-- fs/namespace.c | 419
-rw-r--r-- fs/ncpfs/inode.c | 4
-rw-r--r-- fs/ncpfs/ncplib_kernel.c | 8
-rw-r--r-- fs/nfs/Kconfig | 11
-rw-r--r-- fs/nfs/callback.c | 218
-rw-r--r-- fs/nfs/callback.h | 68
-rw-r--r-- fs/nfs/callback_proc.c | 127
-rw-r--r-- fs/nfs/callback_xdr.c | 280
-rw-r--r-- fs/nfs/client.c | 179
-rw-r--r-- fs/nfs/delegation.c | 33
-rw-r--r-- fs/nfs/dir.c | 3
-rw-r--r-- fs/nfs/direct.c | 29
-rw-r--r-- fs/nfs/file.c | 38
-rw-r--r-- fs/nfs/getroot.c | 1
-rw-r--r-- fs/nfs/inode.c | 1
-rw-r--r-- fs/nfs/internal.h | 70
-rw-r--r-- fs/nfs/iostat.h | 6
-rw-r--r-- fs/nfs/mount_clnt.c | 337
-rw-r--r-- fs/nfs/namespace.c | 7
-rw-r--r-- fs/nfs/nfs3acl.c | 2
-rw-r--r-- fs/nfs/nfs4_fs.h | 43
-rw-r--r-- fs/nfs/nfs4proc.c | 1375
-rw-r--r-- fs/nfs/nfs4renewd.c | 6
-rw-r--r-- fs/nfs/nfs4state.c | 192
-rw-r--r-- fs/nfs/nfs4xdr.c | 1072
-rw-r--r-- fs/nfs/nfsroot.c | 5
-rw-r--r-- fs/nfs/read.c | 40
-rw-r--r-- fs/nfs/super.c | 499
-rw-r--r-- fs/nfs/unlink.c | 20
-rw-r--r-- fs/nfs/write.c | 45
-rw-r--r-- fs/nfsd/export.c | 91
-rw-r--r-- fs/nfsd/nfs3proc.c | 237
-rw-r--r-- fs/nfsd/nfs3xdr.c | 1
-rw-r--r-- fs/nfsd/nfs4callback.c | 247
-rw-r--r-- fs/nfsd/nfs4proc.c | 129
-rw-r--r-- fs/nfsd/nfs4state.c | 171
-rw-r--r-- fs/nfsd/nfs4xdr.c | 296
-rw-r--r-- fs/nfsd/nfscache.c | 33
-rw-r--r-- fs/nfsd/nfsctl.c | 295
-rw-r--r-- fs/nfsd/nfsfh.c | 6
-rw-r--r-- fs/nfsd/nfsproc.c | 198
-rw-r--r-- fs/nfsd/nfssvc.c | 13
-rw-r--r-- fs/nfsd/vfs.c | 164
-rw-r--r-- fs/nilfs2/Kconfig | 25
-rw-r--r-- fs/nilfs2/bmap.c | 277
-rw-r--r-- fs/nilfs2/bmap.h | 135
-rw-r--r-- fs/nilfs2/btnode.c | 9
-rw-r--r-- fs/nilfs2/btnode.h | 2
-rw-r--r-- fs/nilfs2/btree.c | 366
-rw-r--r-- fs/nilfs2/btree.h | 31
-rw-r--r-- fs/nilfs2/cpfile.c | 58
-rw-r--r-- fs/nilfs2/cpfile.h | 4
-rw-r--r-- fs/nilfs2/dat.c | 45
-rw-r--r-- fs/nilfs2/dat.h | 2
-rw-r--r-- fs/nilfs2/dir.c | 1
-rw-r--r-- fs/nilfs2/direct.c | 139
-rw-r--r-- fs/nilfs2/direct.h | 20
-rw-r--r-- fs/nilfs2/gcinode.c | 5
-rw-r--r-- fs/nilfs2/inode.c | 26
-rw-r--r-- fs/nilfs2/ioctl.c | 35
-rw-r--r-- fs/nilfs2/mdt.c | 7
-rw-r--r-- fs/nilfs2/nilfs.h | 5
-rw-r--r-- fs/nilfs2/recovery.c | 37
-rw-r--r-- fs/nilfs2/sb.h | 1
-rw-r--r-- fs/nilfs2/segbuf.c | 3
-rw-r--r-- fs/nilfs2/seglist.h | 85
-rw-r--r-- fs/nilfs2/segment.c | 174
-rw-r--r-- fs/nilfs2/segment.h | 12
-rw-r--r-- fs/nilfs2/sufile.c | 119
-rw-r--r-- fs/nilfs2/sufile.h | 62
-rw-r--r-- fs/nilfs2/super.c | 277
-rw-r--r-- fs/nilfs2/the_nilfs.c | 116
-rw-r--r-- fs/nilfs2/the_nilfs.h | 23
-rw-r--r-- fs/nls/nls_base.c | 166
-rw-r--r-- fs/nls/nls_utf8.c | 13
-rw-r--r-- fs/notify/Kconfig | 3
-rw-r--r-- fs/notify/Makefile | 2
-rw-r--r-- fs/notify/dnotify/Kconfig | 1
-rw-r--r-- fs/notify/dnotify/dnotify.c | 464
-rw-r--r-- fs/notify/fsnotify.c | 188
-rw-r--r-- fs/notify/fsnotify.h | 34
-rw-r--r-- fs/notify/group.c | 254
-rw-r--r-- fs/notify/inode_mark.c | 426
-rw-r--r-- fs/notify/inotify/Kconfig | 20
-rw-r--r-- fs/notify/inotify/Makefile | 2
-rw-r--r-- fs/notify/inotify/inotify.c | 20
-rw-r--r-- fs/notify/inotify/inotify.h | 22
-rw-r--r-- fs/notify/inotify/inotify_fsnotify.c | 168
-rw-r--r-- fs/notify/inotify/inotify_user.c | 911
-rw-r--r-- fs/notify/notification.c | 421
-rw-r--r-- fs/ntfs/inode.c | 3
-rw-r--r-- fs/ntfs/logfile.c | 3
-rw-r--r-- fs/ntfs/super.c | 60
-rw-r--r-- fs/ocfs2/alloc.c | 129
-rw-r--r-- fs/ocfs2/aops.c | 69
-rw-r--r-- fs/ocfs2/blockcheck.c | 184
-rw-r--r-- fs/ocfs2/blockcheck.h | 29
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r-- fs/ocfs2/cluster/masklog.h | 35
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 7
-rw-r--r-- fs/ocfs2/dcache.c | 35
-rw-r--r-- fs/ocfs2/dcache.h | 3
-rw-r--r-- fs/ocfs2/dir.c | 21
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 1
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r-- fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r-- fs/ocfs2/dlmglue.c | 144
-rw-r--r-- fs/ocfs2/dlmglue.h | 31
-rw-r--r-- fs/ocfs2/file.c | 67
-rw-r--r-- fs/ocfs2/inode.c | 11
-rw-r--r-- fs/ocfs2/ioctl.c | 1
-rw-r--r-- fs/ocfs2/journal.c | 132
-rw-r--r-- fs/ocfs2/journal.h | 23
-rw-r--r-- fs/ocfs2/namei.c | 15
-rw-r--r-- fs/ocfs2/ocfs2.h | 48
-rw-r--r-- fs/ocfs2/ocfs2_lockid.h | 6
-rw-r--r-- fs/ocfs2/quota.h | 1
-rw-r--r-- fs/ocfs2/quota_global.c | 146
-rw-r--r-- fs/ocfs2/quota_local.c | 131
-rw-r--r-- fs/ocfs2/stack_o2cb.c | 14
-rw-r--r-- fs/ocfs2/stack_user.c | 8
-rw-r--r-- fs/ocfs2/stackglue.c | 13
-rw-r--r-- fs/ocfs2/stackglue.h | 6
-rw-r--r-- fs/ocfs2/suballoc.c | 28
-rw-r--r-- fs/ocfs2/super.c | 159
-rw-r--r-- fs/ocfs2/sysfile.c | 19
-rw-r--r-- fs/ocfs2/xattr.c | 8
-rw-r--r-- fs/omfs/file.c | 17
-rw-r--r-- fs/open.c | 62
-rw-r--r-- fs/partitions/check.c | 54
-rw-r--r-- fs/partitions/ibm.c | 2
-rw-r--r-- fs/partitions/msdos.c | 4
-rw-r--r-- fs/pipe.c | 18
-rw-r--r-- fs/proc/Makefile | 1
-rw-r--r-- fs/proc/base.c | 33
-rw-r--r-- fs/proc/internal.h | 25
-rw-r--r-- fs/proc/loadavg.c | 18
-rw-r--r-- fs/proc/meminfo.c | 4
-rw-r--r-- fs/proc/page.c | 162
-rw-r--r-- fs/proc/proc_devtree.c | 11
-rw-r--r-- fs/proc/softirqs.c | 44
-rw-r--r-- fs/proc/stat.c | 15
-rw-r--r-- fs/proc/task_mmu.c | 1
-rw-r--r-- fs/proc/task_nommu.c | 1
-rw-r--r-- fs/proc/vmcore.c | 7
-rw-r--r-- fs/qnx4/Makefile | 2
-rw-r--r-- fs/qnx4/bitmap.c | 7
-rw-r--r-- fs/qnx4/dir.c | 9
-rw-r--r-- fs/qnx4/file.c | 5
-rw-r--r-- fs/qnx4/fsync.c | 169
-rw-r--r-- fs/qnx4/inode.c | 58
-rw-r--r-- fs/qnx4/namei.c | 13
-rw-r--r-- fs/qnx4/qnx4.h | 57
-rw-r--r-- fs/qnx4/truncate.c | 6
-rw-r--r-- fs/quota/dquot.c | 9
-rw-r--r-- fs/quota/quota.c | 25
-rw-r--r-- fs/ramfs/file-nommu.c | 1
-rw-r--r-- fs/ramfs/inode.c | 9
-rw-r--r-- fs/read_write.c | 7
-rw-r--r-- fs/reiserfs/dir.c | 10
-rw-r--r-- fs/reiserfs/do_balan.c | 5
-rw-r--r-- fs/reiserfs/inode.c | 4
-rw-r--r-- fs/reiserfs/journal.c | 2
-rw-r--r-- fs/reiserfs/lbalance.c | 10
-rw-r--r-- fs/reiserfs/resize.c | 1
-rw-r--r-- fs/reiserfs/super.c | 58
-rw-r--r-- fs/reiserfs/xattr.c | 4
-rw-r--r-- fs/reiserfs/xattr_acl.c | 58
-rw-r--r-- fs/select.c | 41
-rw-r--r-- fs/seq_file.c | 20
-rw-r--r-- fs/smbfs/inode.c | 4
-rw-r--r-- fs/splice.c | 338
-rw-r--r-- fs/squashfs/super.c | 5
-rw-r--r-- fs/super.c | 189
-rw-r--r-- fs/sync.c | 122
-rw-r--r-- fs/sysfs/bin.c | 1
-rw-r--r-- fs/sysfs/dir.c | 2
-rw-r--r-- fs/sysfs/symlink.c | 5
-rw-r--r-- fs/sysv/dir.c | 7
-rw-r--r-- fs/sysv/file.c | 17
-rw-r--r-- fs/sysv/inode.c | 74
-rw-r--r-- fs/sysv/sysv.h | 1
-rw-r--r-- fs/ubifs/budget.c | 4
-rw-r--r-- fs/ubifs/dir.c | 19
-rw-r--r-- fs/ubifs/io.c | 67
-rw-r--r-- fs/ubifs/ioctl.c | 1
-rw-r--r-- fs/ubifs/recovery.c | 88
-rw-r--r-- fs/ubifs/replay.c | 9
-rw-r--r-- fs/ubifs/scan.c | 20
-rw-r--r-- fs/ubifs/super.c | 101
-rw-r--r-- fs/ubifs/ubifs.h | 20
-rw-r--r-- fs/ubifs/xattr.c | 2
-rw-r--r-- fs/udf/Makefile | 2
-rw-r--r-- fs/udf/balloc.c | 9
-rw-r--r-- fs/udf/dir.c | 2
-rw-r--r-- fs/udf/file.c | 2
-rw-r--r-- fs/udf/fsync.c | 52
-rw-r--r-- fs/udf/lowlevel.c | 7
-rw-r--r-- fs/udf/super.c | 25
-rw-r--r-- fs/udf/udfdecl.h | 3
-rw-r--r-- fs/ufs/dir.c | 2
-rw-r--r-- fs/ufs/file.c | 23
-rw-r--r-- fs/ufs/inode.c | 10
-rw-r--r-- fs/ufs/super.c | 65
-rw-r--r-- fs/ufs/ufs.h | 1
-rw-r--r-- fs/xattr.c | 4
-rw-r--r-- fs/xfs/Kconfig | 1
-rw-r--r-- fs/xfs/Makefile | 5
-rw-r--r-- fs/xfs/linux-2.6/kmem.c | 4
-rw-r--r-- fs/xfs/linux-2.6/xfs_acl.c | 468
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 6
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c | 25
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 57
-rw-r--r-- fs/xfs/linux-2.6/xfs_linux.h | 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_lrw.c | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_quotaops.c | 4
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 63
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 492
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h | 20
-rw-r--r-- fs/xfs/linux-2.6/xfs_xattr.c | 67
-rw-r--r-- fs/xfs/quota/xfs_dquot.c | 5
-rw-r--r-- fs/xfs/quota/xfs_dquot.h | 1
-rw-r--r-- fs/xfs/quota/xfs_dquot_item.c | 1
-rw-r--r-- fs/xfs/quota/xfs_qm.c | 168
-rw-r--r-- fs/xfs/quota/xfs_qm.h | 21
-rw-r--r-- fs/xfs/quota/xfs_qm_bhv.c | 77
-rw-r--r-- fs/xfs/quota/xfs_qm_stats.c | 1
-rw-r--r-- fs/xfs/quota/xfs_qm_syscalls.c | 113
-rw-r--r-- fs/xfs/quota/xfs_trans_dquot.c | 66
-rw-r--r-- fs/xfs/xfs_acl.c | 874
-rw-r--r-- fs/xfs/xfs_acl.h | 95
-rw-r--r-- fs/xfs/xfs_ag.h | 2
-rw-r--r-- fs/xfs/xfs_arch.h | 32
-rw-r--r-- fs/xfs/xfs_attr.c | 21
-rw-r--r-- fs/xfs/xfs_bmap.c | 36
-rw-r--r-- fs/xfs/xfs_bmap_btree.c | 4
-rw-r--r-- fs/xfs/xfs_btree.c | 4
-rw-r--r-- fs/xfs/xfs_da_btree.c | 6
-rw-r--r-- fs/xfs/xfs_dir2.c | 2
-rw-r--r-- fs/xfs/xfs_filestream.c | 6
-rw-r--r-- fs/xfs/xfs_fs.h | 11
-rw-r--r-- fs/xfs/xfs_fsops.c | 20
-rw-r--r-- fs/xfs/xfs_iget.c | 259
-rw-r--r-- fs/xfs/xfs_inode.c | 11
-rw-r--r-- fs/xfs/xfs_inode.h | 18
-rw-r--r-- fs/xfs/xfs_iomap.c | 13
-rw-r--r-- fs/xfs/xfs_log.c | 2
-rw-r--r-- fs/xfs/xfs_log_recover.c | 38
-rw-r--r-- fs/xfs/xfs_mount.c | 105
-rw-r--r-- fs/xfs/xfs_mount.h | 84
-rw-r--r-- fs/xfs/xfs_qmops.c | 152
-rw-r--r-- fs/xfs/xfs_quota.h | 129
-rw-r--r-- fs/xfs/xfs_rename.c | 3
-rw-r--r-- fs/xfs/xfs_rw.c | 1
-rw-r--r-- fs/xfs/xfs_trans.c | 17
-rw-r--r-- fs/xfs/xfs_utils.c | 2
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 118
-rw-r--r-- fs/xfs/xfs_vnodeops.h | 1
552 files changed, 32772 insertions, 16710 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
  * Return 0 upon success, -ERRNO upon failure.
  */
 
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
 	char *options;
 	substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->debug = 0;
 	v9ses->cache = 0;
 
-	if (!v9ses->options)
+	if (!opts)
 		return 0;
 
-	options = kstrdup(v9ses->options, GFP_KERNEL);
+	options = kstrdup(opts, GFP_KERNEL);
 	if (!options) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
 			   "failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
-	if (data) {
-		v9ses->options = kstrdup(data, GFP_KERNEL);
-		if (!v9ses->options) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				"failed to allocate copy of option string\n");
-			retval = -ENOMEM;
-			goto error;
-		}
-	}
 
-	rc = v9fs_parse_options(v9ses);
+	rc = v9fs_parse_options(v9ses, data);
 	if (rc < 0) {
 		retval = rc;
 		goto error;
 	}
 
-	v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
-	kfree(v9ses->options);
 }
 
 /**
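The hunks above stop caching the mount string in the session and instead pass it straight through, with v9fs_parse_options() taking a throwaway kstrdup() copy before tokenizing — the token parser consumes the buffer it walks, so only a disposable copy may be handed to it. A minimal userspace sketch of the same idiom (the names and the strsep()-based parser are illustrative, not the kernel code; strsep() is a BSD/glibc extension):

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Tokenize a mount-style option string. strsep() writes NUL bytes
 * into the buffer it scans, so we parse a duplicate and leave the
 * caller's string intact -- the same reason v9fs_parse_options()
 * kstrdup()s "opts" before walking it.
 */
static int parse_options(const char *opts)
{
	char *options, *cursor, *p;

	if (!opts)
		return 0;

	options = strdup(opts);		/* userspace stand-in for kstrdup() */
	if (!options)
		return -1;

	cursor = options;
	while ((p = strsep(&cursor, ",")) != NULL) {
		if (*p != '\0')
			printf("option: %s\n", p);
	}

	free(options);
	return 0;
}

int main(void)
{
	const char *data = "debug=1,cache=loose,access=user";

	parse_options(data);
	/* "data" is untouched and can still be handed on, as to p9_client_create() */
	printf("original: %s\n", data);
	return 0;
}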
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
 	unsigned int afid;
 	unsigned int cache;
 
-	char *options;	/* copy of mount options */
 	char *uname;	/* user name to mount as */
 	char *aname;	/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6fcb1e7095cf..92828281a30b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	buffer = kmap(page);
 	offset = page_offset(page);
 
-	retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
+	retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
 	if (retval < 0)
 		goto done;
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
  * @wstat: structure to initialize
  *
  */
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
+	int err;
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
 
 	inode = new_inode(sb);
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
-		inode->i_rdev = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_mapping->a_ops = &v9fs_addr_operations;
-
-		switch (mode & S_IFMT) {
-		case S_IFIFO:
-		case S_IFBLK:
-		case S_IFCHR:
-		case S_IFSOCK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					"special files without extended mode\n");
-				return ERR_PTR(-EINVAL);
-			}
-			init_special_inode(inode, inode->i_mode,
-					   inode->i_rdev);
-			break;
-		case S_IFREG:
-			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
-			break;
-		case S_IFLNK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					"extended modes used w/o 9P2000.u\n");
-				return ERR_PTR(-EINVAL);
-			}
-			inode->i_op = &v9fs_symlink_inode_operations;
-			break;
-		case S_IFDIR:
-			inc_nlink(inode);
-			if (v9fs_extended(v9ses))
-				inode->i_op = &v9fs_dir_inode_operations_ext;
-			else
-				inode->i_op = &v9fs_dir_inode_operations;
-			inode->i_fop = &v9fs_dir_operations;
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				"BAD mode 0x%x S_IFMT 0x%x\n",
-				mode, mode & S_IFMT);
-			return ERR_PTR(-EINVAL);
-		}
-	} else {
+	if (!inode) {
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_blocks = 0;
+	inode->i_rdev = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "special files without extended mode\n");
+			err = -EINVAL;
+			goto error;
+		}
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	case S_IFREG:
+		inode->i_op = &v9fs_file_inode_operations;
+		inode->i_fop = &v9fs_file_operations;
+		break;
+	case S_IFLNK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "extended modes used w/o 9P2000.u\n");
+			err = -EINVAL;
+			goto error;
+		}
+		inode->i_op = &v9fs_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		if (v9fs_extended(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_ext;
+		else
+			inode->i_op = &v9fs_dir_inode_operations;
+		inode->i_fop = &v9fs_dir_operations;
+		break;
+	default:
+		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+			   mode, mode & S_IFMT);
+		err = -EINVAL;
+		goto error;
+	}
+
 	return inode;
+
+error:
+	iput(inode);
+	return ERR_PTR(err);
 }
 
 /*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	ret = NULL;
 	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		err = PTR_ERR(st);
-		st = NULL;
-		goto error;
-	}
+	if (IS_ERR(st))
+		return ERR_CAST(st);
 
 	umode = p9mode2unixmode(v9ses, st->mode);
 	ret = v9fs_get_inode(sb, umode);
 	if (IS_ERR(ret)) {
 		err = PTR_ERR(ret);
-		ret = NULL;
 		goto error;
 	}
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+	p9stat_free(st);
 	kfree(st);
 	return ret;
 
 error:
+	p9stat_free(st);
 	kfree(st);
-	if (ret)
-		iput(ret);
-
 	return ERR_PTR(err);
 }
 
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
  * @v9ses: session information
  * @dir: directory that dentry is being created in
  * @dentry: dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
  * @perm: create permissions
  * @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
  *
  */
 static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dentry->d_op = &v9fs_dentry_operations;
 
 	d_instantiate(dentry, inode);
-	v9fs_fid_add(dentry, fid);
+	err = v9fs_fid_add(dentry, fid);
+	if (err < 0)
+		goto error;
+
 	return ofid;
 
 error:
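These hunks lean on the kernel's error-pointer convention: ERR_PTR() packs a negative errno into the returned pointer, IS_ERR()/PTR_ERR() unpack it, and ERR_CAST() re-types an error pointer when forwarding it, as the p9_client_stat() check above now does. A compact userspace imitation of the convention (the helpers and MAX_ERRNO are re-declared here purely for illustration; in the kernel they live in <linux/err.h>):

#include <stdio.h>

/*
 * Errno values smuggled inside a pointer: anything in the last page
 * of the address space is treated as an error, everything else as a
 * valid object pointer.
 */
#define MAX_ERRNO	4095
#define NEG_EINVAL	(-22)	/* stand-in for -EINVAL */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int the_inode = 42;	/* pretend inode object */

static void *get_inode(int mode)
{
	if (mode < 0)
		return ERR_PTR(NEG_EINVAL);	/* failure travels in the pointer */
	return &the_inode;
}

int main(void)
{
	void *inode = get_inode(-1);

	if (IS_ERR(inode)) {
		printf("get_inode failed: %ld\n", PTR_ERR(inode));
		return 1;
	}
	return 0;
}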
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ab5547ff29a1..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,7 +37,6 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -82,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 
 static void
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+		int flags, void *data)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -92,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
+
+	save_mount_options(sb, data);
 }
 
 /**
@@ -114,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct v9fs_session_info *v9ses = NULL;
 	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
-	uid_t uid = current_fsuid();
-	gid_t gid = current_fsgid();
 	struct p9_fid *fid;
 	int retval = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
-	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -143,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto free_stat;
 	}
-	v9fs_fill_super(sb, v9ses, flags);
+	v9fs_fill_super(sb, v9ses, flags, data);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
@@ -151,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 
-	inode->i_uid = uid;
-	inode->i_gid = gid;
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -174,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-release_sb:
-	deactivate_locked_super(sb);
-
 free_stat:
+	p9stat_free(st);
 	kfree(st);
 
 clunk_fid:
@@ -186,7 +179,12 @@ clunk_fid:
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	return retval;
 
+release_sb:
+	p9stat_free(st);
+	kfree(st);
+	deactivate_locked_super(sb);
 	return retval;
 }
 
@@ -208,39 +206,23 @@ static void v9fs_kill_super(struct super_block *s)
 
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	s->s_fs_info = NULL;
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
-	seq_printf(m, "%s", v9ses->options);
-	return 0;
-}
-
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
 	struct v9fs_session_info *v9ses;
 
-	lock_kernel();
 	v9ses = sb->s_fs_info;
 	v9fs_session_cancel(v9ses);
-	unlock_kernel();
 }
 
 static const struct super_operations v9fs_super_ops = {
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
-	.show_options = v9fs_show_options,
+	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
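The relocated release_sb label in v9fs_get_sb() keeps the usual goto cleanup ladder intact: deactivate_locked_super() ends up in v9fs_kill_super(), which closes the session attached to the superblock itself, so that path must return on its own rather than fall through into the clunk_fid/close_session cleanups and tear things down twice. A stripped-down sketch of the pattern, with hypothetical resource names rather than the real 9p API:

#include <stdlib.h>

struct session { int unused; };
struct super   { struct session *owner; };

/* Hypothetical stand-ins for the real setup/teardown calls. */
static struct session *session_open(void)
{
	return calloc(1, sizeof(struct session));
}

static void session_close(struct session *s)
{
	free(s);
}

static struct super *super_alloc(struct session *s)
{
	struct super *sb = calloc(1, sizeof(*sb));

	if (sb)
		sb->owner = s;	/* the super now owns the session */
	return sb;
}

/*
 * Like deactivate_locked_super() reaching v9fs_kill_super():
 * destroying the super also closes the session it owns.
 */
static void super_destroy(struct super *sb)
{
	session_close(sb->owner);
	free(sb);
}

static int do_mount(struct super **out, int fail_late)
{
	struct session *ses;
	struct super *sb;
	int retval = -1;

	ses = session_open();
	if (!ses)
		return -1;

	sb = super_alloc(ses);
	if (!sb)
		goto close_session;	/* no super yet: unwind by hand */

	if (fail_late)
		goto release_sb;	/* super exists: it does the unwinding */

	*out = sb;
	return 0;

close_session:
	session_close(ses);
	return retval;

release_sb:
	/* tears down ses too -- must NOT fall through into close_session */
	super_destroy(sb);
	return retval;
}

int main(void)
{
	struct super *sb;

	return do_mount(&sb, 0) ? 1 : 0;
}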
diff --git a/fs/Kconfig b/fs/Kconfig
index 9f7270f36b2a..0e7da7bb5d93 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -39,6 +39,13 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
+source "fs/ocfs2/Kconfig"
+source "fs/btrfs/Kconfig"
+
+endif # BLOCK
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -47,13 +54,6 @@ config FILE_LOCKING
 	  for filesystems like NFS and for the flock() system
 	  call. Disabling this option saves about 11k.
 
-source "fs/xfs/Kconfig"
-source "fs/gfs2/Kconfig"
-source "fs/ocfs2/Kconfig"
-source "fs/btrfs/Kconfig"
-
-endif # BLOCK
-
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -62,6 +62,16 @@ source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
+config CUSE
+	tristate "Character device in Userpace support"
+	depends on FUSE_FS
+	help
+	  This FUSE extension allows character devices to be
+	  implemented in userspace.
+
+	  If you want to develop or use userspace character device
+	  based on CUSE, answer Y or M.
+
 config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
@@ -124,7 +134,7 @@ config TMPFS_POSIX_ACL
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
-		   (S390 && 64BIT) || BROKEN
+		   (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
@@ -176,32 +186,7 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-
-config NILFS2_FS
-	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	select CRC32
-	help
-	  NILFS2 is a log-structured file system (LFS) supporting continuous
-	  snapshotting. In addition to versioning capability of the entire
-	  file system, users can even restore files mistakenly overwritten or
-	  destroyed just a few seconds ago. Since this file system can keep
-	  consistency like conventional LFS, it achieves quick recovery after
-	  system crashes.
-
-	  NILFS2 creates a number of checkpoints every few seconds or per
-	  synchronous write basis (unless there is no change). Users can
-	  select significant versions among continuously created checkpoints,
-	  and can change them into snapshots which will be preserved for long
-	  periods until they are changed back to checkpoints. Each
-	  snapshot is mountable as a read-only file system concurrently with
-	  its writable mount, and this feature is convenient for online backup.
-
-	  Some features including atime, extended attributes, and POSIX ACLs,
-	  are not supported yet.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called nilfs2. If unsure, say N.
+source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
@@ -226,10 +211,12 @@ source "fs/nfsd/Kconfig"
 
 config LOCKD
 	tristate
+	depends on FILE_LOCKING
 
 config LOCKD_V4
 	bool
 	depends on NFSD_V3 || NFS_V3
+	depends on FILE_LOCKING
 	default y
 
 config EXPORTFS
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index e0a85dbeeb88..9cc18775b832 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,3 +1,6 @@
+#include <linux/fs.h>
+#include <linux/adfs_fs.h>
+
 /* Internal data structures for ADFS */
 
 #define ADFS_FREE_FRAG 0
@@ -17,6 +20,58 @@
 struct buffer_head;
 
 /*
+ * adfs file system inode data in memory
+ */
+struct adfs_inode_info {
+	loff_t mmu_private;
+	unsigned long parent_id;	/* object id of parent */
+	__u32 loadaddr;			/* RISC OS load address */
+	__u32 execaddr;			/* RISC OS exec address */
+	unsigned int filetype;		/* RISC OS file type */
+	unsigned int attr;		/* RISC OS permissions */
+	unsigned int stamped:1;		/* RISC OS file has date/time */
+	struct inode vfs_inode;
+};
+
+/*
+ * Forward-declare this
+ */
+struct adfs_discmap;
+struct adfs_dir_ops;
+
+/*
+ * ADFS file system superblock data in memory
+ */
+struct adfs_sb_info {
+	struct adfs_discmap *s_map;	/* bh list containing map */
+	struct adfs_dir_ops *s_dir;	/* directory operations */
+
+	uid_t s_uid;			/* owner uid */
+	gid_t s_gid;			/* owner gid */
+	umode_t s_owner_mask;		/* ADFS owner perm -> unix perm */
+	umode_t s_other_mask;		/* ADFS other perm -> unix perm */
+
+	__u32 s_ids_per_zone;		/* max. no ids in one zone */
+	__u32 s_idlen;			/* length of ID in map */
+	__u32 s_map_size;		/* sector size of a map */
+	unsigned long s_size;		/* total size (in blocks) of this fs */
+	signed int s_map2blk;		/* shift left by this for map->sector */
+	unsigned int s_log2sharesize;	/* log2 share size */
+	__le32 s_version;		/* disc format version */
+	unsigned int s_namelen;		/* maximum number of characters in name */
+};
+
+static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+	return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+
+/*
  * Directory handling
  */
 struct adfs_dir {
@@ -53,6 +108,7 @@ struct adfs_dir_ops {
 	int (*update)(struct adfs_dir *dir, struct object_info *obj);
 	int (*create)(struct adfs_dir *dir, struct object_info *obj);
 	int (*remove)(struct adfs_dir *dir, struct object_info *obj);
+	int (*sync)(struct adfs_dir *dir);
 	void (*free)(struct adfs_dir *dir);
 };
 
@@ -90,7 +146,8 @@ extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
-extern int adfs_dir_update(struct super_block *sb, struct object_info *obj);
+extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
+		int wait);
 
 /* file.c */
 extern const struct inode_operations adfs_file_inode_operations;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e867ccf37246..23aa52f548a0 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,15 +9,7 @@
  *
  * Common directory handling for ADFS
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>	/* for file_fsync() */
-
 #include "adfs.h"
 
 /*
@@ -83,7 +75,7 @@ out:
 }
 
 int
-adfs_dir_update(struct super_block *sb, struct object_info *obj)
+adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_ADFS_FS_RW
@@ -106,6 +98,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj)
 	ret = ops->update(&dir, obj);
 	write_unlock(&adfs_dir_lock);
 
+	if (wait) {
+		int err = ops->sync(&dir);
+		if (!ret)
+			ret = err;
+	}
+
 	ops->free(&dir);
 out:
 #endif
@@ -199,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
 	.readdir = adfs_readdir,
-	.fsync = file_fsync,
+	.fsync = simple_fsync,
 };
 
 static int
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index ea7df2146921..bafc71222e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -9,15 +9,7 @@
  *
  * E and F format directory handling
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
-
 #include "adfs.h"
 #include "dir_f.h"
 
@@ -437,6 +429,22 @@ bad_dir:
 #endif
 }
 
+static int
+adfs_f_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_f_free(struct adfs_dir *dir)
 {
@@ -456,5 +464,6 @@ struct adfs_dir_ops adfs_f_dir_ops = {
 	.setpos = adfs_f_setpos,
 	.getnext = adfs_f_getnext,
 	.update = adfs_f_update,
+	.sync = adfs_f_sync,
 	.free = adfs_f_free
 };
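The new adfs_f_sync() (and its twin adfs_fplus_sync() in the next file) flushes every directory buffer even after one fails, so a single bad block does not leave later buffers dirty, while still reporting the failure to the caller. The same keep-going-but-remember-the-error shape in portable C, assuming only POSIX fsync() (the file names and descriptors are made up for the demo):

#include <fcntl.h>
#include <unistd.h>

/*
 * Flush every descriptor even after a failure, so all buffers get
 * pushed out, but remember that something went wrong -- the same
 * contract as adfs_f_sync() looping over dir->bh[].
 */
static int sync_all(const int *fds, int nfds)
{
	int err = 0;
	int i;

	for (i = nfds - 1; i >= 0; i--) {
		if (fsync(fds[i]) != 0)
			err = -1;	/* keep going; report the failure */
	}

	return err;
}

int main(void)
{
	int fds[2];
	int ret;

	/* Demo files; a failed open() yields fd -1 and fsync() reports it. */
	fds[0] = open("a.tmp", O_CREAT | O_WRONLY, 0644);
	fds[1] = open("b.tmp", O_CREAT | O_WRONLY, 0644);

	ret = sync_all(fds, 2);

	close(fds[0]);
	close(fds[1]);
	return ret ? 1 : 0;
}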
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1ec644e32df9..1796bb352d05 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -7,15 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
-
 #include "adfs.h"
 #include "dir_fplus.h"
 
@@ -161,6 +153,22 @@ out:
 	return ret;
 }
 
+static int
+adfs_fplus_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_fplus_free(struct adfs_dir *dir)
 {
@@ -175,5 +183,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = {
 	.read = adfs_fplus_read,
 	.setpos = adfs_fplus_setpos,
 	.getnext = adfs_fplus_getnext,
+	.sync = adfs_fplus_sync,
 	.free = adfs_fplus_free
 };
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 36e381c6a99a..005ea34d1758 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -19,10 +19,6 @@
  *
  * adfs regular file handling primitives
  */
-#include <linux/fs.h>
-#include <linux/buffer_head.h>	/* for file_fsync() */
-#include <linux/adfs_fs.h>
-
 #include "adfs.h"
 
 const struct file_operations adfs_file_operations = {
@@ -30,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
 	.mmap = generic_file_mmap,
-	.fsync = file_fsync,
+	.fsync = simple_fsync,
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e647200262a2..798cb071d132 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,17 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/mm.h>
 #include <linux/smp_lock.h>
-#include <linux/module.h>
 #include <linux/buffer_head.h>
-
 #include "adfs.h"
 
 /*
@@ -376,7 +367,7 @@ out:
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int unused)
+int adfs_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -391,8 +382,7 @@ int adfs_write_inode(struct inode *inode, int unused)
 	obj.attr = ADFS_I(inode)->attr;
 	obj.size = inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj);
+	ret = adfs_dir_update(sb, &obj, wait);
 	unlock_kernel();
 	return ret;
 }
-MODULE_LICENSE("GPL");
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 92ab4fbc2031..d1a5932bb0f1 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -7,14 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-
 #include <asm/unaligned.h>
-
 #include "adfs.h"
 
 /*
@@ -62,7 +56,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
 #define GET_FRAG_ID(_map,_start,_idmask) \
 	({ \
 		unsigned char *_m = _map + (_start >> 3); \
-		u32 _frag = get_unaligned((u32 *)_m); \
+		u32 _frag = get_unaligned_le32(_m); \
 		_frag >>= (_start & 7); \
 		_frag & _idmask; \
 	})
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index dd9becca4241..6910a98bd73c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -8,26 +8,13 @@
  * published by the Free Software Foundation.
  */
 #include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
-#include <linux/vfs.h>
 #include <linux/parser.h>
-#include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <stdarg.h>
-
+#include <linux/smp_lock.h>
+#include <linux/statfs.h>
 #include "adfs.h"
 #include "dir_f.h"
 #include "dir_fplus.h"
@@ -132,11 +119,15 @@ static void adfs_put_super(struct super_block *sb)
 	int i;
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 
+	lock_kernel();
+
 	for (i = 0; i < asb->s_map_size; i++)
 		brelse(asb->s_map[i].dm_bh);
 	kfree(asb->s_map);
 	kfree(asb);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -530,3 +521,4 @@ static void __exit exit_adfs_fs(void)
 
 module_init(init_adfs_fs)
 module_exit(exit_adfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1a2d5e3c7f4e..e511dc621a2e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,6 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void affs_free_prealloc(struct inode *inode);
 extern void affs_truncate(struct inode *);
+int affs_file_fsync(struct file *, struct dentry *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 7b36904dbeac..8ca8f3a55599 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
 	.readdir = affs_readdir,
-	.fsync = file_fsync,
+	.fsync = affs_file_fsync,
 };
 
 /*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9246cb4aa018..184e55c1c9ba 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = {
 	.mmap = generic_file_mmap,
 	.open = affs_file_open,
 	.release = affs_file_release,
-	.fsync = file_fsync,
+	.fsync = affs_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode)
 	}
 	affs_free_prealloc(inode);
 }
+
+int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode * inode = dentry->d_inode;
+	int ret, err;
+
+	ret = write_inode_now(inode, 0);
+	err = sync_blockdev(inode->i_sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 63f5183f263b..104fdcb3a7fc 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
19#include "affs.h" 20#include "affs.h"
20 21
21extern struct timezone sys_tz; 22extern struct timezone sys_tz;
@@ -24,49 +25,67 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
24static int affs_remount (struct super_block *sb, int *flags, char *data); 25static int affs_remount (struct super_block *sb, int *flags, char *data);
25 26
26static void 27static void
28affs_commit_super(struct super_block *sb, int clean)
29{
30 struct affs_sb_info *sbi = AFFS_SB(sb);
31 struct buffer_head *bh = sbi->s_root_bh;
32 struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
33
34 tail->bm_flag = cpu_to_be32(clean);
35 secs_to_datestamp(get_seconds(), &tail->disk_change);
36 affs_fix_checksum(sb, bh);
37 mark_buffer_dirty(bh);
38}
39
40static void
27affs_put_super(struct super_block *sb) 41affs_put_super(struct super_block *sb)
28{ 42{
29 struct affs_sb_info *sbi = AFFS_SB(sb); 43 struct affs_sb_info *sbi = AFFS_SB(sb);
30 pr_debug("AFFS: put_super()\n"); 44 pr_debug("AFFS: put_super()\n");
31 45
32 if (!(sb->s_flags & MS_RDONLY)) { 46 lock_kernel();
33 AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); 47
34 secs_to_datestamp(get_seconds(), 48 if (!(sb->s_flags & MS_RDONLY))
35 &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); 49 affs_commit_super(sb, 1);
36 affs_fix_checksum(sb, sbi->s_root_bh);
37 mark_buffer_dirty(sbi->s_root_bh);
38 }
39 50
40 kfree(sbi->s_prefix); 51 kfree(sbi->s_prefix);
41 affs_free_bitmap(sb); 52 affs_free_bitmap(sb);
42 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
43 kfree(sbi); 54 kfree(sbi);
44 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
45 return; 56
57 unlock_kernel();
46} 58}
47 59
48static void 60static void
49affs_write_super(struct super_block *sb) 61affs_write_super(struct super_block *sb)
50{ 62{
51 int clean = 2; 63 int clean = 2;
52 struct affs_sb_info *sbi = AFFS_SB(sb);
53 64
65 lock_super(sb);
54 if (!(sb->s_flags & MS_RDONLY)) { 66 if (!(sb->s_flags & MS_RDONLY)) {
55 // if (sbi->s_bitmap[i].bm_bh) { 67 // if (sbi->s_bitmap[i].bm_bh) {
56 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { 68 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
57 // clean = 0; 69 // clean = 0;
58 AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); 70 affs_commit_super(sb, clean);
59 secs_to_datestamp(get_seconds(),
60 &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
61 affs_fix_checksum(sb, sbi->s_root_bh);
62 mark_buffer_dirty(sbi->s_root_bh);
63 sb->s_dirt = !clean; /* redo until bitmap synced */ 71 sb->s_dirt = !clean; /* redo until bitmap synced */
64 } else 72 } else
65 sb->s_dirt = 0; 73 sb->s_dirt = 0;
74 unlock_super(sb);
66 75
67 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); 76 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
68} 77}
69 78
79static int
80affs_sync_fs(struct super_block *sb, int wait)
81{
82 lock_super(sb);
83 affs_commit_super(sb, 2);
84 sb->s_dirt = 0;
85 unlock_super(sb);
86 return 0;
87}
88
70static struct kmem_cache * affs_inode_cachep; 89static struct kmem_cache * affs_inode_cachep;
71 90
72static struct inode *affs_alloc_inode(struct super_block *sb) 91static struct inode *affs_alloc_inode(struct super_block *sb)
@@ -124,6 +143,7 @@ static const struct super_operations affs_sops = {
124 .clear_inode = affs_clear_inode, 143 .clear_inode = affs_clear_inode,
125 .put_super = affs_put_super, 144 .put_super = affs_put_super,
126 .write_super = affs_write_super, 145 .write_super = affs_write_super,
146 .sync_fs = affs_sync_fs,
127 .statfs = affs_statfs, 147 .statfs = affs_statfs,
128 .remount_fs = affs_remount, 148 .remount_fs = affs_remount,
129 .show_options = generic_show_options, 149 .show_options = generic_show_options,
@@ -507,6 +527,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
507 kfree(new_opts); 527 kfree(new_opts);
508 return -EINVAL; 528 return -EINVAL;
509 } 529 }
530 lock_kernel();
510 replace_mount_options(sb, new_opts); 531 replace_mount_options(sb, new_opts);
511 532
512 sbi->s_flags = mount_flags; 533 sbi->s_flags = mount_flags;
@@ -514,8 +535,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
514 sbi->s_uid = uid; 535 sbi->s_uid = uid;
515 sbi->s_gid = gid; 536 sbi->s_gid = gid;
516 537
517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 538 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
539 unlock_kernel();
518 return 0; 540 return 0;
541 }
519 if (*flags & MS_RDONLY) { 542 if (*flags & MS_RDONLY) {
520 sb->s_dirt = 1; 543 sb->s_dirt = 1;
521 while (sb->s_dirt) 544 while (sb->s_dirt)
@@ -524,6 +547,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
524 } else 547 } else
525 res = affs_init_bitmap(sb, flags); 548 res = affs_init_bitmap(sb, flags);
526 549
550 unlock_kernel();
527 return res; 551 return res;
528} 552}
529 553
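
The two affs hunks above replace the duplicated root-block update sequence with a call to affs_commit_super() and push the BKL/lock_super() handling down into the individual operations. The helper's body is not shown in this diff; a minimal sketch reconstructed from the removed lines would be:

    static void affs_commit_super(struct super_block *sb, int clean)
    {
            struct buffer_head *bh = AFFS_SB(sb)->s_root_bh;

            AFFS_ROOT_TAIL(sb, bh)->bm_flag = cpu_to_be32(clean);
            secs_to_datestamp(get_seconds(),
                              &AFFS_ROOT_TAIL(sb, bh)->disk_change);
            affs_fix_checksum(sb, bh);
            mark_buffer_dirty(bh);
    }

The new affs_sync_fs() reuses the same helper with clean == 2, matching the old write_super() behaviour.
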
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9bd757774c9e..88067f36e5e7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -564,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
564static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 564static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
565{ 565{
566 struct afs_vnode *vnode, *dir; 566 struct afs_vnode *vnode, *dir;
567 struct afs_fid fid; 567 struct afs_fid uninitialized_var(fid);
568 struct dentry *parent; 568 struct dentry *parent;
569 struct key *key; 569 struct key *key;
570 void *dir_version; 570 void *dir_version;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
134 134
135 inode = page->mapping->host; 135 inode = page->mapping->host;
136 136
137 ASSERT(file != NULL); 137 if (file) {
138 key = file->private_data; 138 key = file->private_data;
139 ASSERT(key != NULL); 139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
140 147
141 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
142 149
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
207 unlock_page(page); 214 unlock_page(page);
208 } 215 }
209 216
217 if (!file)
218 key_put(key);
210 _leave(" = 0"); 219 _leave(" = 0");
211 return 0; 220 return 0;
212 221
213error: 222error:
214 SetPageError(page); 223 SetPageError(page);
215 unlock_page(page); 224 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
216 _leave(" = %d", ret); 228 _leave(" = %d", ret);
217 return ret; 229 return ret;
218} 230}
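
afs_readpage() may now be called without a struct file (the ASSERT is gone), presumably so pages can be read in contexts that only have the inode. In that case an anonymous key for the cell is requested and must be released on every exit path; condensed from the hunks above:

    key = file ? file->private_data
               : afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
    if (IS_ERR(key))
            return PTR_ERR(key);        /* the error_nokey path */
    /* ... perform the read ... */
    if (!file)
            key_put(key);               /* only drop a ref we took ourselves */
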
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 210acafe4a9b..3ff8bdd18fb3 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -432,7 +432,6 @@ vfs_rejected_lock:
432 list_del_init(&fl->fl_u.afs.link); 432 list_del_init(&fl->fl_u.afs.link);
433 if (list_empty(&vnode->granted_locks)) 433 if (list_empty(&vnode->granted_locks))
434 afs_defer_unlock(vnode, key); 434 afs_defer_unlock(vnode, key);
435 spin_unlock(&vnode->lock);
436 goto abort_attempt; 435 goto abort_attempt;
437} 436}
438 437
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 2d33a5f7d218..0dd4dafee10b 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <rxrpc/packet.h>
15#include "internal.h" 16#include "internal.h"
16#include "afs_fs.h" 17#include "afs_fs.h"
17 18
@@ -54,6 +55,21 @@ int afs_abort_to_error(u32 abort_code)
54 case 0x2f6df24: return -ENOLCK; 55 case 0x2f6df24: return -ENOLCK;
55 case 0x2f6df26: return -ENOTEMPTY; 56 case 0x2f6df26: return -ENOTEMPTY;
56 case 0x2f6df78: return -EDQUOT; 57 case 0x2f6df78: return -EDQUOT;
58
59 case RXKADINCONSISTENCY: return -EPROTO;
60 case RXKADPACKETSHORT: return -EPROTO;
61 case RXKADLEVELFAIL: return -EKEYREJECTED;
62 case RXKADTICKETLEN: return -EKEYREJECTED;
63 case RXKADOUTOFSEQUENCE: return -EPROTO;
64 case RXKADNOAUTH: return -EKEYREJECTED;
65 case RXKADBADKEY: return -EKEYREJECTED;
66 case RXKADBADTICKET: return -EKEYREJECTED;
67 case RXKADUNKNOWNKEY: return -EKEYREJECTED;
68 case RXKADEXPIRED: return -EKEYEXPIRED;
69 case RXKADSEALEDINCON: return -EKEYREJECTED;
70 case RXKADDATALEN: return -EKEYREJECTED;
71 case RXKADILLEGALLEVEL: return -EKEYREJECTED;
72
57 default: return -EREMOTEIO; 73 default: return -EREMOTEIO;
58 } 74 }
59} 75}
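
Mapping the rxkad abort codes means security-layer failures now surface as distinct errnos instead of the catch-all -EREMOTEIO; the vlocation change below depends on seeing them. For illustration:

    /* illustrative only */
    int err;

    err = afs_abort_to_error(RXKADEXPIRED);   /* -EKEYEXPIRED */
    err = afs_abort_to_error(RXKADBADTICKET); /* -EKEYREJECTED */
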
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2b9e2d03a390..5ffb570cd3a8 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,7 +17,6 @@
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <linux/mnt_namespace.h>
21#include "internal.h" 20#include "internal.h"
22 21
23 22
@@ -244,7 +243,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
244 case -EBUSY: 243 case -EBUSY:
245 /* someone else made a mount here whilst we were busy */ 244 /* someone else made a mount here whilst we were busy */
246 while (d_mountpoint(nd->path.dentry) && 245 while (d_mountpoint(nd->path.dentry) &&
247 follow_down(&nd->path.mnt, &nd->path.dentry)) 246 follow_down(&nd->path))
248 ; 247 ;
249 err = 0; 248 err = 0;
250 default: 249 default:
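
This hunk reflects the series-wide change of follow_down() from taking separate vfsmount and dentry pointers to taking a single struct path, which it advances in place. The new calling convention, as a sketch:

    struct path path = { .mnt = mnt, .dentry = dentry };

    while (d_mountpoint(path.dentry) && follow_down(&path))
            ;       /* path now refers to the topmost mount stacked here */
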
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 76828e5f8a39..e1ea1c240b6a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/smp_lock.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
23#include <linux/parser.h> 24#include <linux/parser.h>
@@ -440,8 +441,12 @@ static void afs_put_super(struct super_block *sb)
440 441
441 _enter(""); 442 _enter("");
442 443
444 lock_kernel();
445
443 afs_put_volume(as->volume); 446 afs_put_volume(as->volume);
444 447
448 unlock_kernel();
449
445 _leave(""); 450 _leave("");
446} 451}
447 452
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index ec2a7431e458..6e689208def2 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -65,6 +65,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
65 goto out; 65 goto out;
66 goto rotate; 66 goto rotate;
67 case -ENOMEDIUM: 67 case -ENOMEDIUM:
68 case -EKEYREJECTED:
69 case -EKEYEXPIRED:
68 goto out; 70 goto out;
69 default: 71 default:
70 ret = -EIO; 72 ret = -EIO;
diff --git a/fs/aio.c b/fs/aio.c
index 76da12537956..d065b2c3273e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
485{ 485{
486 assert_spin_locked(&ctx->ctx_lock); 486 assert_spin_locked(&ctx->ctx_lock);
487 487
488 if (req->ki_eventfd != NULL)
489 eventfd_ctx_put(req->ki_eventfd);
488 if (req->ki_dtor) 490 if (req->ki_dtor)
489 req->ki_dtor(req); 491 req->ki_dtor(req);
490 if (req->ki_iovec != &req->ki_inline_vec) 492 if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data)
509 /* Complete the fput(s) */ 511 /* Complete the fput(s) */
510 if (req->ki_filp != NULL) 512 if (req->ki_filp != NULL)
511 __fput(req->ki_filp); 513 __fput(req->ki_filp);
512 if (req->ki_eventfd != NULL)
513 __fput(req->ki_eventfd);
514 514
515 /* Link the iocb into the context's free list */ 515 /* Link the iocb into the context's free list */
516 spin_lock_irq(&ctx->ctx_lock); 516 spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data)
528 */ 528 */
529static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) 529static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
530{ 530{
531 int schedule_putreq = 0;
532
533 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", 531 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
534 req, atomic_long_read(&req->ki_filp->f_count)); 532 req, atomic_long_read(&req->ki_filp->f_count));
535 533
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
549 * we would not be holding the last reference to the file*, so 547 * we would not be holding the last reference to the file*, so
550 * this function will be executed w/out any aio kthread wakeup. 548 * this function will be executed w/out any aio kthread wakeup.
551 */ 549 */
552 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) 550 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
553 schedule_putreq++;
554 else
555 req->ki_filp = NULL;
556 if (req->ki_eventfd != NULL) {
557 if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
558 schedule_putreq++;
559 else
560 req->ki_eventfd = NULL;
561 }
562 if (unlikely(schedule_putreq)) {
563 get_ioctx(ctx); 551 get_ioctx(ctx);
564 spin_lock(&fput_lock); 552 spin_lock(&fput_lock);
565 list_add(&req->ki_list, &fput_head); 553 list_add(&req->ki_list, &fput_head);
566 spin_unlock(&fput_lock); 554 spin_unlock(&fput_lock);
567 queue_work(aio_wq, &fput_work); 555 queue_work(aio_wq, &fput_work);
568 } else 556 } else {
557 req->ki_filp = NULL;
569 really_put_req(ctx, req); 558 really_put_req(ctx, req);
559 }
570 return 1; 560 return 1;
571} 561}
572 562
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1622 * an eventfd() fd, and will be signaled for each completed 1612 * an eventfd() fd, and will be signaled for each completed
1623 * event using the eventfd_signal() function. 1613 * event using the eventfd_signal() function.
1624 */ 1614 */
1625 req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); 1615 req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
1626 if (IS_ERR(req->ki_eventfd)) { 1616 if (IS_ERR(req->ki_eventfd)) {
1627 ret = PTR_ERR(req->ki_eventfd); 1617 ret = PTR_ERR(req->ki_eventfd);
1628 req->ki_eventfd = NULL; 1618 req->ki_eventfd = NULL;
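
The aio eventfd reference switches from a struct file, which could only be dropped via the deferred __fput() machinery, to a struct eventfd_ctx that really_put_req() can release directly. The new lifetime, roughly:

    struct eventfd_ctx *ev;

    ev = eventfd_ctx_fdget(resfd);  /* takes a ctx ref, not a file ref */
    if (IS_ERR(ev))
            return PTR_ERR(ev);
    /* on completion: */
    eventfd_signal(ev, 1);
    /* on teardown, safe from any context: */
    eventfd_ctx_put(ev);

This is why __aio_put_req() no longer needs the schedule_putreq juggling for the eventfd file.
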
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 1dd96d4406c0..47d4a01c5393 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -52,6 +52,19 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
52 .d_delete = anon_inodefs_delete_dentry, 52 .d_delete = anon_inodefs_delete_dentry,
53}; 53};
54 54
55/*
56 * nop .set_page_dirty method so that people can use .page_mkwrite on
57 * anon inodes.
58 */
59static int anon_set_page_dirty(struct page *page)
60{
61 return 0;
62};
63
64static const struct address_space_operations anon_aops = {
65 .set_page_dirty = anon_set_page_dirty,
66};
67
55/** 68/**
56 * anon_inode_getfd - creates a new file instance by hooking it up to an 69 * anon_inode_getfd - creates a new file instance by hooking it up to an
57 * anonymous inode, and a dentry that describe the "class" 70 * anonymous inode, and a dentry that describe the "class"
@@ -151,6 +164,8 @@ static struct inode *anon_inode_mkinode(void)
151 164
152 inode->i_fop = &anon_inode_fops; 165 inode->i_fop = &anon_inode_fops;
153 166
167 inode->i_mapping->a_ops = &anon_aops;
168
154 /* 169 /*
155 * Mark the inode dirty from the very beginning, 170 * Mark the inode dirty from the very beginning,
156 * that way it will never be moved to the dirty 171 * that way it will never be moved to the dirty
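
Anonymous inodes previously had no address_space_operations, so a write fault on an mmap'd anon inode would fall through to the buffer-head-based default set_page_dirty path; the nop method makes .page_mkwrite usable on them. A consumer would look roughly like this (names hypothetical):

    /* hypothetical driver code, for illustration */
    static const struct vm_operations_struct my_vm_ops = {
            .fault        = my_fault,
            .page_mkwrite = my_page_mkwrite,  /* now safe on an anon inode */
    };
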
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 4eb4d8dfb2f1..2316e944a109 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
85 } 85 }
86 path.mnt = mnt; 86 path.mnt = mnt;
87 path_get(&path); 87 path_get(&path);
88 if (!follow_down(&path.mnt, &path.dentry)) { 88 if (!follow_down(&path)) {
89 path_put(&path); 89 path_put(&path);
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); 90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue; 91 continue;
92 } 92 }
93 while (d_mountpoint(path.dentry) && 93 while (d_mountpoint(path.dentry) && follow_down(&path));
94 follow_down(&path.mnt, &path.dentry))
95 ; 94 ;
96 umount_ok = may_umount(path.mnt); 95 umount_ok = may_umount(path.mnt);
97 path_put(&path); 96 path_put(&path);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b7ff33c63101..8f7cdde41733 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
223int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 223int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
224void autofs4_catatonic_mode(struct autofs_sb_info *); 224void autofs4_catatonic_mode(struct autofs_sb_info *);
225 225
226static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) 226static inline int autofs4_follow_mount(struct path *path)
227{ 227{
228 int res = 0; 228 int res = 0;
229 229
230 while (d_mountpoint(*dentry)) { 230 while (d_mountpoint(path->dentry)) {
231 int followed = follow_down(mnt, dentry); 231 int followed = follow_down(path);
232 if (!followed) 232 if (!followed)
233 break; 233 break;
234 res = 1; 234 res = 1;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 84168c0dcc2d..00bf8fcb245f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/smp_lock.h>
23#include <linux/magic.h> 22#include <linux/magic.h>
24#include <linux/dcache.h> 23#include <linux/dcache.h>
25#include <linux/uaccess.h> 24#include <linux/uaccess.h>
@@ -192,77 +191,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
192 return 0; 191 return 0;
193} 192}
194 193
195/* 194static int find_autofs_mount(const char *pathname,
196 * Walk down the mount stack looking for an autofs mount that 195 struct path *res,
197 * has the requested device number (aka. new_encode_dev(sb->s_dev). 196 int test(struct path *path, void *data),
198 */ 197 void *data)
199static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
200{ 198{
201 struct dentry *dentry; 199 struct path path;
202 struct inode *inode; 200 int err = kern_path(pathname, 0, &path);
203 struct super_block *sb; 201 if (err)
204 dev_t s_dev; 202 return err;
205 unsigned int err;
206
207 err = -ENOENT; 203 err = -ENOENT;
208 204 while (path.dentry == path.mnt->mnt_root) {
209 /* Lookup the dentry name at the base of our mount point */ 205 if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
210 dentry = d_lookup(nd->path.dentry, &nd->last); 206 if (test(&path, data)) {
211 if (!dentry) 207 path_get(&path);
212 goto out; 208 if (!err) /* already found some */
213 209 path_put(res);
214 dput(nd->path.dentry); 210 *res = path;
215 nd->path.dentry = dentry;
216
217 /* And follow the mount stack looking for our autofs mount */
218 while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
219 inode = nd->path.dentry->d_inode;
220 if (!inode)
221 break;
222
223 sb = inode->i_sb;
224 s_dev = new_encode_dev(sb->s_dev);
225 if (devno == s_dev) {
226 if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
227 err = 0; 211 err = 0;
228 break;
229 } 212 }
230 } 213 }
214 if (!follow_up(&path))
215 break;
231 } 216 }
232out: 217 path_put(&path);
233 return err; 218 return err;
234} 219}
235 220
236/* 221static int test_by_dev(struct path *path, void *p)
237 * Walk down the mount stack looking for an autofs mount that
238 * has the requested mount type (ie. indirect, direct or offset).
239 */
240static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
241{ 222{
242 struct dentry *dentry; 223 return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
243 struct autofs_info *ino; 224}
244 unsigned int err;
245
246 err = -ENOENT;
247
248 /* Lookup the dentry name at the base of our mount point */
249 dentry = d_lookup(nd->path.dentry, &nd->last);
250 if (!dentry)
251 goto out;
252
253 dput(nd->path.dentry);
254 nd->path.dentry = dentry;
255 225
256 /* And follow the mount stack looking for our autofs mount */ 226static int test_by_type(struct path *path, void *p)
257 while (follow_down(&nd->path.mnt, &nd->path.dentry)) { 227{
258 ino = autofs4_dentry_ino(nd->path.dentry); 228 struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
259 if (ino && ino->sbi->type & type) { 229 return ino && ino->sbi->type & *(unsigned *)p;
260 err = 0;
261 break;
262 }
263 }
264out:
265 return err;
266} 230}
267 231
268static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) 232static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
@@ -283,31 +247,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
283 * Open a file descriptor on the autofs mount point corresponding 247 * Open a file descriptor on the autofs mount point corresponding
284 * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 248 * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
285 */ 249 */
286static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) 250static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
287{ 251{
288 struct file *filp;
289 struct nameidata nd;
290 int err, fd; 252 int err, fd;
291 253
292 fd = get_unused_fd(); 254 fd = get_unused_fd();
293 if (likely(fd >= 0)) { 255 if (likely(fd >= 0)) {
294 /* Get nameidata of the parent directory */ 256 struct file *filp;
295 err = path_lookup(path, LOOKUP_PARENT, &nd); 257 struct path path;
258
259 err = find_autofs_mount(name, &path, test_by_dev, &devid);
296 if (err) 260 if (err)
297 goto out; 261 goto out;
298 262
299 /* 263 /*
300 * Search down, within the parent, looking for an 264 * Find autofs super block that has the device number
301 * autofs super block that has the device number
302 * corresponding to the autofs fs we want to open. 265 * corresponding to the autofs fs we want to open.
303 */ 266 */
304 err = autofs_dev_ioctl_find_super(&nd, devid);
305 if (err) {
306 path_put(&nd.path);
307 goto out;
308 }
309 267
310 filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, 268 filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
311 current_cred()); 269 current_cred());
312 if (IS_ERR(filp)) { 270 if (IS_ERR(filp)) {
313 err = PTR_ERR(filp); 271 err = PTR_ERR(filp);
@@ -340,7 +298,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
340 param->ioctlfd = -1; 298 param->ioctlfd = -1;
341 299
342 path = param->path; 300 path = param->path;
343 devid = param->openmount.devid; 301 devid = new_decode_dev(param->openmount.devid);
344 302
345 err = 0; 303 err = 0;
346 fd = autofs_dev_ioctl_open_mountpoint(path, devid); 304 fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -475,8 +433,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
475 struct autofs_dev_ioctl *param) 433 struct autofs_dev_ioctl *param)
476{ 434{
477 struct autofs_info *ino; 435 struct autofs_info *ino;
478 struct nameidata nd; 436 struct path path;
479 const char *path;
480 dev_t devid; 437 dev_t devid;
481 int err = -ENOENT; 438 int err = -ENOENT;
482 439
@@ -485,32 +442,24 @@ static int autofs_dev_ioctl_requester(struct file *fp,
485 goto out; 442 goto out;
486 } 443 }
487 444
488 path = param->path; 445 devid = sbi->sb->s_dev;
489 devid = new_encode_dev(sbi->sb->s_dev);
490 446
491 param->requester.uid = param->requester.gid = -1; 447 param->requester.uid = param->requester.gid = -1;
492 448
493 /* Get nameidata of the parent directory */ 449 err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
494 err = path_lookup(path, LOOKUP_PARENT, &nd);
495 if (err) 450 if (err)
496 goto out; 451 goto out;
497 452
498 err = autofs_dev_ioctl_find_super(&nd, devid); 453 ino = autofs4_dentry_ino(path.dentry);
499 if (err)
500 goto out_release;
501
502 ino = autofs4_dentry_ino(nd.path.dentry);
503 if (ino) { 454 if (ino) {
504 err = 0; 455 err = 0;
505 autofs4_expire_wait(nd.path.dentry); 456 autofs4_expire_wait(path.dentry);
506 spin_lock(&sbi->fs_lock); 457 spin_lock(&sbi->fs_lock);
507 param->requester.uid = ino->uid; 458 param->requester.uid = ino->uid;
508 param->requester.gid = ino->gid; 459 param->requester.gid = ino->gid;
509 spin_unlock(&sbi->fs_lock); 460 spin_unlock(&sbi->fs_lock);
510 } 461 }
511 462 path_put(&path);
512out_release:
513 path_put(&nd.path);
514out: 463out:
515 return err; 464 return err;
516} 465}
@@ -569,8 +518,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
569 struct autofs_sb_info *sbi, 518 struct autofs_sb_info *sbi,
570 struct autofs_dev_ioctl *param) 519 struct autofs_dev_ioctl *param)
571{ 520{
572 struct nameidata nd; 521 struct path path;
573 const char *path; 522 const char *name;
574 unsigned int type; 523 unsigned int type;
575 unsigned int devid, magic; 524 unsigned int devid, magic;
576 int err = -ENOENT; 525 int err = -ENOENT;
@@ -580,71 +529,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
580 goto out; 529 goto out;
581 } 530 }
582 531
583 path = param->path; 532 name = param->path;
584 type = param->ismountpoint.in.type; 533 type = param->ismountpoint.in.type;
585 534
586 param->ismountpoint.out.devid = devid = 0; 535 param->ismountpoint.out.devid = devid = 0;
587 param->ismountpoint.out.magic = magic = 0; 536 param->ismountpoint.out.magic = magic = 0;
588 537
589 if (!fp || param->ioctlfd == -1) { 538 if (!fp || param->ioctlfd == -1) {
590 if (autofs_type_any(type)) { 539 if (autofs_type_any(type))
591 struct super_block *sb; 540 err = kern_path(name, LOOKUP_FOLLOW, &path);
592 541 else
593 err = path_lookup(path, LOOKUP_FOLLOW, &nd); 542 err = find_autofs_mount(name, &path, test_by_type, &type);
594 if (err) 543 if (err)
595 goto out; 544 goto out;
596 545 devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
597 sb = nd.path.dentry->d_sb;
598 devid = new_encode_dev(sb->s_dev);
599 } else {
600 struct autofs_info *ino;
601
602 err = path_lookup(path, LOOKUP_PARENT, &nd);
603 if (err)
604 goto out;
605
606 err = autofs_dev_ioctl_find_sbi_type(&nd, type);
607 if (err)
608 goto out_release;
609
610 ino = autofs4_dentry_ino(nd.path.dentry);
611 devid = autofs4_get_dev(ino->sbi);
612 }
613
614 err = 0; 546 err = 0;
615 if (nd.path.dentry->d_inode && 547 if (path.dentry->d_inode &&
616 nd.path.mnt->mnt_root == nd.path.dentry) { 548 path.mnt->mnt_root == path.dentry) {
617 err = 1; 549 err = 1;
618 magic = nd.path.dentry->d_inode->i_sb->s_magic; 550 magic = path.dentry->d_inode->i_sb->s_magic;
619 } 551 }
620 } else { 552 } else {
621 dev_t dev = autofs4_get_dev(sbi); 553 dev_t dev = sbi->sb->s_dev;
622 554
623 err = path_lookup(path, LOOKUP_PARENT, &nd); 555 err = find_autofs_mount(name, &path, test_by_dev, &dev);
624 if (err) 556 if (err)
625 goto out; 557 goto out;
626 558
627 err = autofs_dev_ioctl_find_super(&nd, dev); 559 devid = new_encode_dev(dev);
628 if (err)
629 goto out_release;
630
631 devid = dev;
632 560
633 err = have_submounts(nd.path.dentry); 561 err = have_submounts(path.dentry);
634 562
635 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { 563 if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
636 if (follow_down(&nd.path.mnt, &nd.path.dentry)) { 564 if (follow_down(&path))
637 struct inode *inode = nd.path.dentry->d_inode; 565 magic = path.mnt->mnt_sb->s_magic;
638 magic = inode->i_sb->s_magic;
639 }
640 } 566 }
641 } 567 }
642 568
643 param->ismountpoint.out.devid = devid; 569 param->ismountpoint.out.devid = devid;
644 param->ismountpoint.out.magic = magic; 570 param->ismountpoint.out.magic = magic;
645 571 path_put(&path);
646out_release:
647 path_put(&nd.path);
648out: 572out:
649 return err; 573 return err;
650} 574}
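
All three open-coded nameidata walks collapse into find_autofs_mount(): kern_path() resolves the user-supplied path, then the loop climbs the stack of mounts at that location with follow_up(), remembering the autofs mount that satisfies the caller's test callback. Typical use, taken from the requester hunk above:

    struct path path;
    dev_t devid = sbi->sb->s_dev;
    int err;

    err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
    if (!err) {
            /* ... inspect path.dentry / path.mnt ... */
            path_put(&path);
    }
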
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3077d8f16523..aa39ae83f019 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry,
48static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) 48static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
49{ 49{
50 struct dentry *top = dentry; 50 struct dentry *top = dentry;
51 struct path path = {.mnt = mnt, .dentry = dentry};
51 int status = 1; 52 int status = 1;
52 53
53 DPRINTK("dentry %p %.*s", 54 DPRINTK("dentry %p %.*s",
54 dentry, (int)dentry->d_name.len, dentry->d_name.name); 55 dentry, (int)dentry->d_name.len, dentry->d_name.name);
55 56
56 mntget(mnt); 57 path_get(&path);
57 dget(dentry);
58 58
59 if (!follow_down(&mnt, &dentry)) 59 if (!follow_down(&path))
60 goto done; 60 goto done;
61 61
62 if (is_autofs4_dentry(dentry)) { 62 if (is_autofs4_dentry(path.dentry)) {
63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 63 struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
64 64
65 /* This is an autofs submount, we can't expire it */ 65 /* This is an autofs submount, we can't expire it */
66 if (autofs_type_indirect(sbi->type)) 66 if (autofs_type_indirect(sbi->type))
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
70 * Otherwise it's an offset mount and we need to check 70 * Otherwise it's an offset mount and we need to check
71 * if we can umount its mount, if there is one. 71 * if we can umount its mount, if there is one.
72 */ 72 */
73 if (!d_mountpoint(dentry)) { 73 if (!d_mountpoint(path.dentry)) {
74 status = 0; 74 status = 0;
75 goto done; 75 goto done;
76 } 76 }
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
86 status = 0; 86 status = 0;
87done: 87done:
88 DPRINTK("returning = %d", status); 88 DPRINTK("returning = %d", status);
89 dput(dentry); 89 path_put(&path);
90 mntput(mnt);
91 return status; 90 return status;
92} 91}
93 92
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e383bf0334f1..b96a3c57359d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
181 nd->flags); 181 nd->flags);
182 /* 182 /*
183 * For an expire of a covered direct or offset mount we need 183 * For an expire of a covered direct or offset mount we need
184 * to beeak out of follow_down() at the autofs mount trigger 184 * to break out of follow_down() at the autofs mount trigger
185 * (d_mounted--), so we can see the expiring flag, and manage 185 * (d_mounted--), so we can see the expiring flag, and manage
186 * the blocking and following here until the expire is completed. 186 * the blocking and following here until the expire is completed.
187 */ 187 */
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
190 if (ino->flags & AUTOFS_INF_EXPIRING) { 190 if (ino->flags & AUTOFS_INF_EXPIRING) {
191 spin_unlock(&sbi->fs_lock); 191 spin_unlock(&sbi->fs_lock);
192 /* Follow down to our covering mount. */ 192 /* Follow down to our covering mount. */
193 if (!follow_down(&nd->path.mnt, &nd->path.dentry)) 193 if (!follow_down(&nd->path))
194 goto done; 194 goto done;
195 goto follow; 195 goto follow;
196 } 196 }
@@ -230,8 +230,7 @@ follow:
230 * to follow it. 230 * to follow it.
231 */ 231 */
232 if (d_mountpoint(dentry)) { 232 if (d_mountpoint(dentry)) {
233 if (!autofs4_follow_mount(&nd->path.mnt, 233 if (!autofs4_follow_mount(&nd->path)) {
234 &nd->path.dentry)) {
235 status = -ENOENT; 234 status = -ENOENT;
236 goto out_error; 235 goto out_error;
237 } 236 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 76afd0d6b86c..615d5496fe0f 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
513{ 513{
514 struct nls_table *nls = BEFS_SB(sb)->nls; 514 struct nls_table *nls = BEFS_SB(sb)->nls;
515 int i, o; 515 int i, o;
516 wchar_t uni; 516 unicode_t uni;
517 int unilen, utflen; 517 int unilen, utflen;
518 char *result; 518 char *result;
519 /* The utf8->nls conversion won't make the final nls string bigger 519 /* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
539 for (i = o = 0; i < in_len; i += utflen, o += unilen) { 539 for (i = o = 0; i < in_len; i += utflen, o += unilen) {
540 540
541 /* convert from UTF-8 to Unicode */ 541 /* convert from UTF-8 to Unicode */
542 utflen = utf8_mbtowc(&uni, &in[i], in_len - i); 542 utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
543 if (utflen < 0) { 543 if (utflen < 0)
544 goto conv_err; 544 goto conv_err;
545 }
546 545
547 /* convert from Unicode to nls */ 546 /* convert from Unicode to nls */
547 if (uni > MAX_WCHAR_T)
548 goto conv_err;
548 unilen = nls->uni2char(uni, &result[o], in_len - o); 549 unilen = nls->uni2char(uni, &result[o], in_len - o);
549 if (unilen < 0) { 550 if (unilen < 0)
550 goto conv_err; 551 goto conv_err;
551 }
552 } 552 }
553 result[o] = '\0'; 553 result[o] = '\0';
554 *out_len = o; 554 *out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
619 619
620 /* convert from nls to unicode */ 620 /* convert from nls to unicode */
621 unilen = nls->char2uni(&in[i], in_len - i, &uni); 621 unilen = nls->char2uni(&in[i], in_len - i, &uni);
622 if (unilen < 0) { 622 if (unilen < 0)
623 goto conv_err; 623 goto conv_err;
624 }
625 624
626 /* convert from unicode to UTF-8 */ 625 /* convert from unicode to UTF-8 */
627 utflen = utf8_wctomb(&result[o], uni, 3); 626 utflen = utf32_to_utf8(uni, &result[o], 3);
628 if (utflen <= 0) { 627 if (utflen <= 0)
629 goto conv_err; 628 goto conv_err;
630 }
631 } 629 }
632 630
633 result[o] = '\0'; 631 result[o] = '\0';
@@ -747,7 +745,6 @@ befs_put_super(struct super_block *sb)
747 745
748 kfree(sb->s_fs_info); 746 kfree(sb->s_fs_info);
749 sb->s_fs_info = NULL; 747 sb->s_fs_info = NULL;
750 return;
751} 748}
752 749
753/* Allocate private field of the superblock, fill it. 750/* Allocate private field of the superblock, fill it.
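
befs moves from the wchar_t-based utf8_mbtowc()/utf8_wctomb() pair to utf8_to_utf32()/utf32_to_utf8() on a full unicode_t, so code points above the 16-bit range the NLS tables understand must now be rejected explicitly. The decode step, in essence:

    unicode_t uni;
    int utflen, unilen;

    utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
    if (utflen < 0 || uni > MAX_WCHAR_T)  /* nls handles only 16-bit chars */
            goto conv_err;
    unilen = nls->uni2char(uni, &result[o], in_len - o);
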
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 4dd1b623f937..1e41aadb1068 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -8,7 +8,6 @@
8#include <linux/time.h> 8#include <linux/time.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include "bfs.h" 13#include "bfs.h"
@@ -79,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
79const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
80 .read = generic_read_dir, 79 .read = generic_read_dir,
81 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
82 .fsync = file_fsync, 81 .fsync = simple_fsync,
83 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
84}; 83};
85 84
@@ -205,7 +204,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
205 inode->i_nlink = 1; 204 inode->i_nlink = 1;
206 } 205 }
207 de->ino = 0; 206 de->ino = 0;
208 mark_buffer_dirty(bh); 207 mark_buffer_dirty_inode(bh, dir);
209 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 208 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
210 mark_inode_dirty(dir); 209 mark_inode_dirty(dir);
211 inode->i_ctime = dir->i_ctime; 210 inode->i_ctime = dir->i_ctime;
@@ -267,7 +266,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
267 new_inode->i_ctime = CURRENT_TIME_SEC; 266 new_inode->i_ctime = CURRENT_TIME_SEC;
268 inode_dec_link_count(new_inode); 267 inode_dec_link_count(new_inode);
269 } 268 }
270 mark_buffer_dirty(old_bh); 269 mark_buffer_dirty_inode(old_bh, old_dir);
271 error = 0; 270 error = 0;
272 271
273end_rename: 272end_rename:
@@ -320,7 +319,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
320 for (i = 0; i < BFS_NAMELEN; i++) 319 for (i = 0; i < BFS_NAMELEN; i++)
321 de->name[i] = 320 de->name[i] =
322 (i < namelen) ? name[i] : 0; 321 (i < namelen) ? name[i] : 0;
323 mark_buffer_dirty(bh); 322 mark_buffer_dirty_inode(bh, dir);
324 brelse(bh); 323 brelse(bh);
325 return 0; 324 return 0;
326 } 325 }
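
Using mark_buffer_dirty_inode() ties each dirty directory block to the owning inode's private buffer list, which is what allows the directory's .fsync to become the generic simple_fsync() instead of the old file_fsync(). The pattern:

    mark_buffer_dirty_inode(bh, dir);   /* attach buffer to dir for fsync */
    dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
    mark_inode_dirty(dir);
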
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 6a021265f018..88b9a3ff44e4 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/smp_lock.h>
15#include "bfs.h" 14#include "bfs.h"
16 15
17#undef DEBUG 16#undef DEBUG
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cc4062d12ca2..6f60336c6628 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL");
30#define dprintf(x...) 30#define dprintf(x...)
31#endif 31#endif
32 32
33static void bfs_write_super(struct super_block *s);
33void dump_imap(const char *prefix, struct super_block *s); 34void dump_imap(const char *prefix, struct super_block *s);
34 35
35struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 36struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -97,14 +98,15 @@ error:
97 return ERR_PTR(-EIO); 98 return ERR_PTR(-EIO);
98} 99}
99 100
100static int bfs_write_inode(struct inode *inode, int unused) 101static int bfs_write_inode(struct inode *inode, int wait)
101{ 102{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
102 unsigned int ino = (u16)inode->i_ino; 104 unsigned int ino = (u16)inode->i_ino;
103 unsigned long i_sblock; 105 unsigned long i_sblock;
104 struct bfs_inode *di; 106 struct bfs_inode *di;
105 struct buffer_head *bh; 107 struct buffer_head *bh;
106 int block, off; 108 int block, off;
107 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 109 int err = 0;
108 110
109 dprintf("ino=%08x\n", ino); 111 dprintf("ino=%08x\n", ino);
110 112
@@ -145,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused)
145 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
146 148
147 mark_buffer_dirty(bh); 149 mark_buffer_dirty(bh);
150 if (wait) {
151 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO;
154 }
148 brelse(bh); 155 brelse(bh);
149 mutex_unlock(&info->bfs_lock); 156 mutex_unlock(&info->bfs_lock);
150 return 0; 157 return err;
151} 158}
152 159
153static void bfs_delete_inode(struct inode *inode) 160static void bfs_delete_inode(struct inode *inode)
@@ -209,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode)
209 clear_inode(inode); 216 clear_inode(inode);
210} 217}
211 218
219static int bfs_sync_fs(struct super_block *sb, int wait)
220{
221 struct bfs_sb_info *info = BFS_SB(sb);
222
223 mutex_lock(&info->bfs_lock);
224 mark_buffer_dirty(info->si_sbh);
225 sb->s_dirt = 0;
226 mutex_unlock(&info->bfs_lock);
227
228 return 0;
229}
230
231static void bfs_write_super(struct super_block *sb)
232{
233 if (!(sb->s_flags & MS_RDONLY))
234 bfs_sync_fs(sb, 1);
235 else
236 sb->s_dirt = 0;
237}
238
212static void bfs_put_super(struct super_block *s) 239static void bfs_put_super(struct super_block *s)
213{ 240{
214 struct bfs_sb_info *info = BFS_SB(s); 241 struct bfs_sb_info *info = BFS_SB(s);
@@ -216,11 +243,18 @@ static void bfs_put_super(struct super_block *s)
216 if (!info) 243 if (!info)
217 return; 244 return;
218 245
246 lock_kernel();
247
248 if (s->s_dirt)
249 bfs_write_super(s);
250
219 brelse(info->si_sbh); 251 brelse(info->si_sbh);
220 mutex_destroy(&info->bfs_lock); 252 mutex_destroy(&info->bfs_lock);
221 kfree(info->si_imap); 253 kfree(info->si_imap);
222 kfree(info); 254 kfree(info);
223 s->s_fs_info = NULL; 255 s->s_fs_info = NULL;
256
257 unlock_kernel();
224} 258}
225 259
226static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) 260static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -240,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
240 return 0; 274 return 0;
241} 275}
242 276
243static void bfs_write_super(struct super_block *s)
244{
245 struct bfs_sb_info *info = BFS_SB(s);
246
247 mutex_lock(&info->bfs_lock);
248 if (!(s->s_flags & MS_RDONLY))
249 mark_buffer_dirty(info->si_sbh);
250 s->s_dirt = 0;
251 mutex_unlock(&info->bfs_lock);
252}
253
254static struct kmem_cache *bfs_inode_cachep; 277static struct kmem_cache *bfs_inode_cachep;
255 278
256static struct inode *bfs_alloc_inode(struct super_block *sb) 279static struct inode *bfs_alloc_inode(struct super_block *sb)
@@ -298,6 +321,7 @@ static const struct super_operations bfs_sops = {
298 .delete_inode = bfs_delete_inode, 321 .delete_inode = bfs_delete_inode,
299 .put_super = bfs_put_super, 322 .put_super = bfs_put_super,
300 .write_super = bfs_write_super, 323 .write_super = bfs_write_super,
324 .sync_fs = bfs_sync_fs,
301 .statfs = bfs_statfs, 325 .statfs = bfs_statfs,
302}; 326};
303 327
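
bfs now implements ->sync_fs, write_super() becomes a thin wrapper around it, and bfs_write_inode() honours its wait argument: a synchronous writeback pushes the buffer out and reports real I/O errors instead of merely dirtying it. The wait path:

    mark_buffer_dirty(bh);
    if (wait) {
            sync_dirty_buffer(bh);
            if (buffer_req(bh) && !buffer_uptodate(bh))
                    err = -EIO;     /* the write actually failed */
    }
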
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 40381df34869..b7c1603cd4bd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1340,8 +1340,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1340 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; 1340 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
1341 prstatus->pr_sigpend = p->pending.signal.sig[0]; 1341 prstatus->pr_sigpend = p->pending.signal.sig[0];
1342 prstatus->pr_sighold = p->blocked.sig[0]; 1342 prstatus->pr_sighold = p->blocked.sig[0];
1343 rcu_read_lock();
1344 prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1345 rcu_read_unlock();
1343 prstatus->pr_pid = task_pid_vnr(p); 1346 prstatus->pr_pid = task_pid_vnr(p);
1344 prstatus->pr_ppid = task_pid_vnr(p->real_parent);
1345 prstatus->pr_pgrp = task_pgrp_vnr(p); 1347 prstatus->pr_pgrp = task_pgrp_vnr(p);
1346 prstatus->pr_sid = task_session_vnr(p); 1348 prstatus->pr_sid = task_session_vnr(p);
1347 if (thread_group_leader(p)) { 1349 if (thread_group_leader(p)) {
@@ -1382,8 +1384,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1382 psinfo->pr_psargs[i] = ' '; 1384 psinfo->pr_psargs[i] = ' ';
1383 psinfo->pr_psargs[len] = 0; 1385 psinfo->pr_psargs[len] = 0;
1384 1386
1387 rcu_read_lock();
1388 psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1389 rcu_read_unlock();
1385 psinfo->pr_pid = task_pid_vnr(p); 1390 psinfo->pr_pid = task_pid_vnr(p);
1386 psinfo->pr_ppid = task_pid_vnr(p->real_parent);
1387 psinfo->pr_pgrp = task_pgrp_vnr(p); 1391 psinfo->pr_pgrp = task_pgrp_vnr(p);
1388 psinfo->pr_sid = task_session_vnr(p); 1392 psinfo->pr_sid = task_session_vnr(p);
1389 1393
@@ -1518,11 +1522,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1518 info->thread = NULL; 1522 info->thread = NULL;
1519 1523
1520 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); 1524 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1521 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1522
1523 if (psinfo == NULL) 1525 if (psinfo == NULL)
1524 return 0; 1526 return 0;
1525 1527
1528 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1529
1526 /* 1530 /*
1527 * Figure out how many notes we're going to need for each thread. 1531 * Figure out how many notes we're going to need for each thread.
1528 */ 1532 */
@@ -1925,7 +1929,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1925 elf = kmalloc(sizeof(*elf), GFP_KERNEL); 1929 elf = kmalloc(sizeof(*elf), GFP_KERNEL);
1926 if (!elf) 1930 if (!elf)
1927 goto out; 1931 goto out;
1928 1932 /*
1933 * The number of segs are recored into ELF header as 16bit value.
1934 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
1935 */
1929 segs = current->mm->map_count; 1936 segs = current->mm->map_count;
1930#ifdef ELF_CORE_EXTRA_PHDRS 1937#ifdef ELF_CORE_EXTRA_PHDRS
1931 segs += ELF_CORE_EXTRA_PHDRS; 1938 segs += ELF_CORE_EXTRA_PHDRS;
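
task_struct->real_parent is RCU-protected, and the core-dump helpers were dereferencing it bare, racing with reparenting. Both fill_prstatus() and fill_psinfo(), here and in the fdpic variant below, now use:

    rcu_read_lock();
    ppid = task_pid_vnr(rcu_dereference(p->real_parent));
    rcu_read_unlock();
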
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fdb66faa24f1..20fbeced472b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1387,8 +1387,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1387 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; 1387 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
1388 prstatus->pr_sigpend = p->pending.signal.sig[0]; 1388 prstatus->pr_sigpend = p->pending.signal.sig[0];
1389 prstatus->pr_sighold = p->blocked.sig[0]; 1389 prstatus->pr_sighold = p->blocked.sig[0];
1390 rcu_read_lock();
1391 prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1392 rcu_read_unlock();
1390 prstatus->pr_pid = task_pid_vnr(p); 1393 prstatus->pr_pid = task_pid_vnr(p);
1391 prstatus->pr_ppid = task_pid_vnr(p->real_parent);
1392 prstatus->pr_pgrp = task_pgrp_vnr(p); 1394 prstatus->pr_pgrp = task_pgrp_vnr(p);
1393 prstatus->pr_sid = task_session_vnr(p); 1395 prstatus->pr_sid = task_session_vnr(p);
1394 if (thread_group_leader(p)) { 1396 if (thread_group_leader(p)) {
@@ -1432,8 +1434,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1432 psinfo->pr_psargs[i] = ' '; 1434 psinfo->pr_psargs[i] = ' ';
1433 psinfo->pr_psargs[len] = 0; 1435 psinfo->pr_psargs[len] = 0;
1434 1436
1437 rcu_read_lock();
1438 psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1439 rcu_read_unlock();
1435 psinfo->pr_pid = task_pid_vnr(p); 1440 psinfo->pr_pid = task_pid_vnr(p);
1436 psinfo->pr_ppid = task_pid_vnr(p->real_parent);
1437 psinfo->pr_pgrp = task_pgrp_vnr(p); 1441 psinfo->pr_pgrp = task_pgrp_vnr(p);
1438 psinfo->pr_sid = task_session_vnr(p); 1442 psinfo->pr_sid = task_session_vnr(p);
1439 1443
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
828 if (IS_ERR(bprm.file)) 828 if (IS_ERR(bprm.file))
829 return res; 829 return res;
830 830
831 bprm.cred = prepare_exec_creds();
832 res = -ENOMEM;
833 if (!bprm.cred)
834 goto out;
835
831 res = prepare_binprm(&bprm); 836 res = prepare_binprm(&bprm);
832 837
833 if (res <= (unsigned long)-4096) 838 if (res <= (unsigned long)-4096)
834 res = load_flat_file(&bprm, libs, id, NULL); 839 res = load_flat_file(&bprm, libs, id, NULL);
835 if (bprm.file) { 840
836 allow_write_access(bprm.file); 841 abort_creds(bprm.cred);
837 fput(bprm.file); 842
838 bprm.file = NULL; 843out:
839 } 844 allow_write_access(bprm.file);
845 fput(bprm.file);
846
840 return(res); 847 return(res);
841} 848}
842 849
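
load_flat_shared_library() built a linux_binprm with no credentials, yet prepare_binprm() dereferences bprm->cred; allocating exec creds for the duration of the load and discarding them with abort_creds() closes that NULL dereference. The fixed sequence, condensed:

    bprm.cred = prepare_exec_creds();
    if (!bprm.cred) {
            res = -ENOMEM;
            goto out;
    }
    res = prepare_binprm(&bprm);        /* safe: bprm.cred is valid */
    /* ... load_flat_file() ... */
    abort_creds(bprm.cred);             /* creds only needed during load */
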
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 31c46a241bac..49a34e7f7306 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * bio-integrity.c - bio data integrity extensions 2 * bio-integrity.c - bio data integrity extensions
3 * 3 *
4 * Copyright (C) 2007, 2008 Oracle Corporation 4 * Copyright (C) 2007, 2008, 2009 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com> 5 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
@@ -25,63 +25,121 @@
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27 27
28static struct kmem_cache *bio_integrity_slab __read_mostly; 28struct integrity_slab {
29static mempool_t *bio_integrity_pool; 29 struct kmem_cache *slab;
30static struct bio_set *integrity_bio_set; 30 unsigned short nr_vecs;
31 char name[8];
32};
33
34#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
35struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
36 IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
37};
38#undef IS
39
31static struct workqueue_struct *kintegrityd_wq; 40static struct workqueue_struct *kintegrityd_wq;
32 41
42static inline unsigned int vecs_to_idx(unsigned int nr)
43{
44 switch (nr) {
45 case 1:
46 return 0;
47 case 2 ... 4:
48 return 1;
49 case 5 ... 16:
50 return 2;
51 case 17 ... 64:
52 return 3;
53 case 65 ... 128:
54 return 4;
55 case 129 ... BIO_MAX_PAGES:
56 return 5;
57 default:
58 BUG();
59 }
60}
61
62static inline int use_bip_pool(unsigned int idx)
63{
64 if (idx == BIOVEC_NR_POOLS)
65 return 1;
66
67 return 0;
68}
69
33/** 70/**
34 * bio_integrity_alloc - Allocate integrity payload and attach it to bio 71 * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
35 * @bio: bio to attach integrity metadata to 72 * @bio: bio to attach integrity metadata to
36 * @gfp_mask: Memory allocation mask 73 * @gfp_mask: Memory allocation mask
37 * @nr_vecs: Number of integrity metadata scatter-gather elements 74 * @nr_vecs: Number of integrity metadata scatter-gather elements
75 * @bs: bio_set to allocate from
38 * 76 *
39 * Description: This function prepares a bio for attaching integrity 77 * Description: This function prepares a bio for attaching integrity
40 * metadata. nr_vecs specifies the maximum number of pages containing 78 * metadata. nr_vecs specifies the maximum number of pages containing
41 * integrity metadata that can be attached. 79 * integrity metadata that can be attached.
42 */ 80 */
43struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, 81struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
44 gfp_t gfp_mask, 82 gfp_t gfp_mask,
45 unsigned int nr_vecs) 83 unsigned int nr_vecs,
84 struct bio_set *bs)
46{ 85{
47 struct bio_integrity_payload *bip; 86 struct bio_integrity_payload *bip;
48 struct bio_vec *iv; 87 unsigned int idx = vecs_to_idx(nr_vecs);
49 unsigned long idx;
50 88
51 BUG_ON(bio == NULL); 89 BUG_ON(bio == NULL);
90 bip = NULL;
52 91
53 bip = mempool_alloc(bio_integrity_pool, gfp_mask); 92 /* Lower order allocations come straight from slab */
54 if (unlikely(bip == NULL)) { 93 if (!use_bip_pool(idx))
55 printk(KERN_ERR "%s: could not alloc bip\n", __func__); 94 bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
56 return NULL;
57 }
58 95
59 memset(bip, 0, sizeof(*bip)); 96 /* Use mempool if lower order alloc failed or max vecs were requested */
97 if (bip == NULL) {
98 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
60 99
61 iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set); 100 if (unlikely(bip == NULL)) {
62 if (unlikely(iv == NULL)) { 101 printk(KERN_ERR "%s: could not alloc bip\n", __func__);
63 printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); 102 return NULL;
64 mempool_free(bip, bio_integrity_pool); 103 }
65 return NULL;
66 } 104 }
67 105
68 bip->bip_pool = idx; 106 memset(bip, 0, sizeof(*bip));
69 bip->bip_vec = iv; 107
108 bip->bip_slab = idx;
70 bip->bip_bio = bio; 109 bip->bip_bio = bio;
71 bio->bi_integrity = bip; 110 bio->bi_integrity = bip;
72 111
73 return bip; 112 return bip;
74} 113}
114EXPORT_SYMBOL(bio_integrity_alloc_bioset);
115
116/**
117 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
118 * @bio: bio to attach integrity metadata to
119 * @gfp_mask: Memory allocation mask
120 * @nr_vecs: Number of integrity metadata scatter-gather elements
121 *
122 * Description: This function prepares a bio for attaching integrity
123 * metadata. nr_vecs specifies the maximum number of pages containing
124 * integrity metadata that can be attached.
125 */
126struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
127 gfp_t gfp_mask,
128 unsigned int nr_vecs)
129{
130 return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
131}
75EXPORT_SYMBOL(bio_integrity_alloc); 132EXPORT_SYMBOL(bio_integrity_alloc);
76 133
77/** 134/**
78 * bio_integrity_free - Free bio integrity payload 135 * bio_integrity_free - Free bio integrity payload
79 * @bio: bio containing bip to be freed 136 * @bio: bio containing bip to be freed
137 * @bs: bio_set this bio was allocated from
80 * 138 *
81 * Description: Used to free the integrity portion of a bio. Usually 139 * Description: Used to free the integrity portion of a bio. Usually
82 * called from bio_free(). 140 * called from bio_free().
83 */ 141 */
84void bio_integrity_free(struct bio *bio) 142void bio_integrity_free(struct bio *bio, struct bio_set *bs)
85{ 143{
86 struct bio_integrity_payload *bip = bio->bi_integrity; 144 struct bio_integrity_payload *bip = bio->bi_integrity;
87 145
@@ -92,8 +150,10 @@ void bio_integrity_free(struct bio *bio)
92 && bip->bip_buf != NULL) 150 && bip->bip_buf != NULL)
93 kfree(bip->bip_buf); 151 kfree(bip->bip_buf);
94 152
95 bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool); 153 if (use_bip_pool(bip->bip_slab))
96 mempool_free(bip, bio_integrity_pool); 154 mempool_free(bip, bs->bio_integrity_pool);
155 else
156 kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
97 157
98 bio->bi_integrity = NULL; 158 bio->bi_integrity = NULL;
99} 159}
@@ -114,7 +174,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
114 struct bio_integrity_payload *bip = bio->bi_integrity; 174 struct bio_integrity_payload *bip = bio->bi_integrity;
115 struct bio_vec *iv; 175 struct bio_vec *iv;
116 176
117 if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { 177 if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
118 printk(KERN_ERR "%s: bip_vec full\n", __func__); 178 printk(KERN_ERR "%s: bip_vec full\n", __func__);
119 return 0; 179 return 0;
120 } 180 }
@@ -647,8 +707,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
647 bp->iv1 = bip->bip_vec[0]; 707 bp->iv1 = bip->bip_vec[0];
648 bp->iv2 = bip->bip_vec[0]; 708 bp->iv2 = bip->bip_vec[0];
649 709
650 bp->bip1.bip_vec = &bp->iv1; 710 bp->bip1.bip_vec[0] = bp->iv1;
651 bp->bip2.bip_vec = &bp->iv2; 711 bp->bip2.bip_vec[0] = bp->iv2;
652 712
653 bp->iv1.bv_len = sectors * bi->tuple_size; 713 bp->iv1.bv_len = sectors * bi->tuple_size;
654 bp->iv2.bv_offset += sectors * bi->tuple_size; 714 bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -667,17 +727,19 @@ EXPORT_SYMBOL(bio_integrity_split);
667 * @bio: New bio 727 * @bio: New bio
668 * @bio_src: Original bio 728 * @bio_src: Original bio
669 * @gfp_mask: Memory allocation mask 729 * @gfp_mask: Memory allocation mask
730 * @bs: bio_set to allocate bip from
670 * 731 *
671 * Description: Called to allocate a bip when cloning a bio 732 * Description: Called to allocate a bip when cloning a bio
672 */ 733 */
673int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) 734int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
735 gfp_t gfp_mask, struct bio_set *bs)
674{ 736{
675 struct bio_integrity_payload *bip_src = bio_src->bi_integrity; 737 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
676 struct bio_integrity_payload *bip; 738 struct bio_integrity_payload *bip;
677 739
678 BUG_ON(bip_src == NULL); 740 BUG_ON(bip_src == NULL);
679 741
680 bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); 742 bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
681 743
682 if (bip == NULL) 744 if (bip == NULL)
683 return -EIO; 745 return -EIO;
@@ -693,25 +755,43 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
693} 755}
694EXPORT_SYMBOL(bio_integrity_clone); 756EXPORT_SYMBOL(bio_integrity_clone);
695 757
696static int __init bio_integrity_init(void) 758int bioset_integrity_create(struct bio_set *bs, int pool_size)
697{ 759{
698 kintegrityd_wq = create_workqueue("kintegrityd"); 760 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
761
762 bs->bio_integrity_pool =
763 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
699 764
765 if (!bs->bio_integrity_pool)
766 return -1;
767
768 return 0;
769}
770EXPORT_SYMBOL(bioset_integrity_create);
771
772void bioset_integrity_free(struct bio_set *bs)
773{
774 if (bs->bio_integrity_pool)
775 mempool_destroy(bs->bio_integrity_pool);
776}
777EXPORT_SYMBOL(bioset_integrity_free);
778
779void __init bio_integrity_init(void)
780{
781 unsigned int i;
782
783 kintegrityd_wq = create_workqueue("kintegrityd");
700 if (!kintegrityd_wq) 784 if (!kintegrityd_wq)
701 panic("Failed to create kintegrityd\n"); 785 panic("Failed to create kintegrityd\n");
702 786
703 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, 787 for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
704 SLAB_HWCACHE_ALIGN|SLAB_PANIC); 788 unsigned int size;
705 789
706 bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE, 790 size = sizeof(struct bio_integrity_payload)
707 bio_integrity_slab); 791 + bip_slab[i].nr_vecs * sizeof(struct bio_vec);
708 if (!bio_integrity_pool)
709 panic("bio_integrity: can't allocate bip pool\n");
710 792
711 integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0); 793 bip_slab[i].slab =
712 if (!integrity_bio_set) 794 kmem_cache_create(bip_slab[i].name, size, 0,
713 panic("bio_integrity: can't allocate bio_set\n"); 795 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
714 796 }
715 return 0;
716} 797}
717subsys_initcall(bio_integrity_init);
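
Integrity payloads now embed their bio_vecs and are carved from six size-graded slabs, mirroring the ordinary biovec pools; only the largest size falls back to a per-bio_set mempool, preserving the forward-progress guarantee under memory pressure. The slab sizing from bio_integrity_init():

    size = sizeof(struct bio_integrity_payload)
         + bip_slab[i].nr_vecs * sizeof(struct bio_vec);

    bip_slab[i].slab = kmem_cache_create(bip_slab[i].name, size, 0,
                                         SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
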
diff --git a/fs/bio.c b/fs/bio.c
index 98711647ece4..76738005c8e8 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,11 +25,9 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/blktrace_api.h>
29#include <trace/block.h>
30#include <scsi/sg.h> /* for struct sg_iovec */ 28#include <scsi/sg.h> /* for struct sg_iovec */
31 29
32DEFINE_TRACE(block_split); 30#include <trace/events/block.h>
33 31
34/* 32/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio 33 * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -240,7 +238,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
240 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); 238 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
241 239
242 if (bio_integrity(bio)) 240 if (bio_integrity(bio))
243 bio_integrity_free(bio); 241 bio_integrity_free(bio, bs);
244 242
245 /* 243 /*
246 * If we have front padding, adjust the bio pointer before freeing 244 * If we have front padding, adjust the bio pointer before freeing
@@ -343,7 +341,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
343static void bio_kmalloc_destructor(struct bio *bio) 341static void bio_kmalloc_destructor(struct bio *bio)
344{ 342{
345 if (bio_integrity(bio)) 343 if (bio_integrity(bio))
346 bio_integrity_free(bio); 344 bio_integrity_free(bio, fs_bio_set);
347 kfree(bio); 345 kfree(bio);
348} 346}
349 347
@@ -359,9 +357,9 @@ static void bio_kmalloc_destructor(struct bio *bio)
359 * 357 *
360 * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate 358 * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
361 * a bio. This is due to the mempool guarantees. To make this work, callers 359 * a bio. This is due to the mempool guarantees. To make this work, callers
362 * must never allocate more than 1 bio at the time from this pool. Callers 360 * must never allocate more than 1 bio at a time from this pool. Callers
363 * that need to allocate more than 1 bio must always submit the previously 361 * that need to allocate more than 1 bio must always submit the previously
364 * allocate bio for IO before attempting to allocate a new one. Failure to 362 * allocated bio for IO before attempting to allocate a new one. Failure to
365 * do so can cause livelocks under memory pressure. 363 * do so can cause livelocks under memory pressure.
366 * 364 *
367 **/ 365 **/
@@ -474,7 +472,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
474 if (bio_integrity(bio)) { 472 if (bio_integrity(bio)) {
475 int ret; 473 int ret;
476 474
477 ret = bio_integrity_clone(b, bio, gfp_mask); 475 ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
478 476
479 if (ret < 0) { 477 if (ret < 0) {
480 bio_put(b); 478 bio_put(b);
@@ -499,11 +497,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
499 struct request_queue *q = bdev_get_queue(bdev); 497 struct request_queue *q = bdev_get_queue(bdev);
500 int nr_pages; 498 int nr_pages;
501 499
502 nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 500 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
503 if (nr_pages > q->max_phys_segments) 501 if (nr_pages > queue_max_phys_segments(q))
504 nr_pages = q->max_phys_segments; 502 nr_pages = queue_max_phys_segments(q);
505 if (nr_pages > q->max_hw_segments) 503 if (nr_pages > queue_max_hw_segments(q))
506 nr_pages = q->max_hw_segments; 504 nr_pages = queue_max_hw_segments(q);
507 505
508 return nr_pages; 506 return nr_pages;
509} 507}
@@ -562,8 +560,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
562 * make this too complex. 560 * make this too complex.
563 */ 561 */
564 562
565 while (bio->bi_phys_segments >= q->max_phys_segments 563 while (bio->bi_phys_segments >= queue_max_phys_segments(q)
566 || bio->bi_phys_segments >= q->max_hw_segments) { 564 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
567 565
568 if (retried_segments) 566 if (retried_segments)
569 return 0; 567 return 0;
@@ -634,7 +632,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
634int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, 632int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
635 unsigned int len, unsigned int offset) 633 unsigned int len, unsigned int offset)
636{ 634{
637 return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); 635 return __bio_add_page(q, bio, page, len, offset,
636 queue_max_hw_sectors(q));
638} 637}
639 638
640/** 639/**
@@ -654,7 +653,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
654 unsigned int offset) 653 unsigned int offset)
655{ 654{
656 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 655 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
657 return __bio_add_page(q, bio, page, len, offset, q->max_sectors); 656 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
658} 657}
659 658
660struct bio_map_data { 659struct bio_map_data {
@@ -706,14 +705,13 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
706} 705}
707 706
708static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, 707static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
709 struct sg_iovec *iov, int iov_count, int uncopy, 708 struct sg_iovec *iov, int iov_count,
710 int do_free_page) 709 int to_user, int from_user, int do_free_page)
711{ 710{
712 int ret = 0, i; 711 int ret = 0, i;
713 struct bio_vec *bvec; 712 struct bio_vec *bvec;
714 int iov_idx = 0; 713 int iov_idx = 0;
715 unsigned int iov_off = 0; 714 unsigned int iov_off = 0;
716 int read = bio_data_dir(bio) == READ;
717 715
718 __bio_for_each_segment(bvec, bio, i, 0) { 716 __bio_for_each_segment(bvec, bio, i, 0) {
719 char *bv_addr = page_address(bvec->bv_page); 717 char *bv_addr = page_address(bvec->bv_page);
@@ -721,20 +719,21 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
721 719
722 while (bv_len && iov_idx < iov_count) { 720 while (bv_len && iov_idx < iov_count) {
723 unsigned int bytes; 721 unsigned int bytes;
724 char *iov_addr; 722 char __user *iov_addr;
725 723
726 bytes = min_t(unsigned int, 724 bytes = min_t(unsigned int,
727 iov[iov_idx].iov_len - iov_off, bv_len); 725 iov[iov_idx].iov_len - iov_off, bv_len);
728 iov_addr = iov[iov_idx].iov_base + iov_off; 726 iov_addr = iov[iov_idx].iov_base + iov_off;
729 727
730 if (!ret) { 728 if (!ret) {
731 if (!read && !uncopy) 729 if (to_user)
732 ret = copy_from_user(bv_addr, iov_addr,
733 bytes);
734 if (read && uncopy)
735 ret = copy_to_user(iov_addr, bv_addr, 730 ret = copy_to_user(iov_addr, bv_addr,
736 bytes); 731 bytes);
737 732
733 if (from_user)
734 ret = copy_from_user(bv_addr, iov_addr,
735 bytes);
736
738 if (ret) 737 if (ret)
739 ret = -EFAULT; 738 ret = -EFAULT;
740 } 739 }
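The old signature inferred the copy direction from bio_data_dir() plus the uncopy flag; the new one makes the caller state it outright. A hedged illustration of the two call patterns used in the hunks below:

	/* completion path: copy kernel pages back to the user iovecs */
	__bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs,
		       1 /* to_user */, 0 /* from_user */, bmd->is_our_pages);

	/* setup path: pull user data into freshly allocated pages */
	__bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count,
		       0 /* to_user */, 1 /* from_user */, 0);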
@@ -771,7 +770,8 @@ int bio_uncopy_user(struct bio *bio)
771 770
772 if (!bio_flagged(bio, BIO_NULL_MAPPED)) 771 if (!bio_flagged(bio, BIO_NULL_MAPPED))
773 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, 772 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
774 bmd->nr_sgvecs, 1, bmd->is_our_pages); 773 bmd->nr_sgvecs, bio_data_dir(bio) == READ,
774 0, bmd->is_our_pages);
775 bio_free_map_data(bmd); 775 bio_free_map_data(bmd);
776 bio_put(bio); 776 bio_put(bio);
777 return ret; 777 return ret;
@@ -876,8 +876,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
876 /* 876 /*
877 * success 877 * success
878 */ 878 */
879 if (!write_to_vm && (!map_data || !map_data->null_mapped)) { 879 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
880 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 880 (map_data && map_data->from_user)) {
881 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
881 if (ret) 882 if (ret)
882 goto cleanup; 883 goto cleanup;
883 } 884 }
@@ -1201,7 +1202,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
1201 char *addr = page_address(bvec->bv_page); 1202 char *addr = page_address(bvec->bv_page);
1202 int len = bmd->iovecs[i].bv_len; 1203 int len = bmd->iovecs[i].bv_len;
1203 1204
1204 if (read && !err) 1205 if (read)
1205 memcpy(p, addr, len); 1206 memcpy(p, addr, len);
1206 1207
1207 __free_page(bvec->bv_page); 1208 __free_page(bvec->bv_page);
@@ -1490,11 +1491,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1490sector_t bio_sector_offset(struct bio *bio, unsigned short index, 1491sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1491 unsigned int offset) 1492 unsigned int offset)
1492{ 1493{
1493 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); 1494 unsigned int sector_sz;
1494 struct bio_vec *bv; 1495 struct bio_vec *bv;
1495 sector_t sectors; 1496 sector_t sectors;
1496 int i; 1497 int i;
1497 1498
1499 sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
1498 sectors = 0; 1500 sectors = 0;
1499 1501
1500 if (index >= bio->bi_idx) 1502 if (index >= bio->bi_idx)
@@ -1539,6 +1541,7 @@ void bioset_free(struct bio_set *bs)
1539 if (bs->bio_pool) 1541 if (bs->bio_pool)
1540 mempool_destroy(bs->bio_pool); 1542 mempool_destroy(bs->bio_pool);
1541 1543
1544 bioset_integrity_free(bs);
1542 biovec_free_pools(bs); 1545 biovec_free_pools(bs);
1543 bio_put_slab(bs); 1546 bio_put_slab(bs);
1544 1547
@@ -1579,6 +1582,9 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1579 if (!bs->bio_pool) 1582 if (!bs->bio_pool)
1580 goto bad; 1583 goto bad;
1581 1584
1585 if (bioset_integrity_create(bs, pool_size))
1586 goto bad;
1587
1582 if (!biovec_create_pools(bs, pool_size)) 1588 if (!biovec_create_pools(bs, pool_size))
1583 return bs; 1589 return bs;
1584 1590
@@ -1616,6 +1622,7 @@ static int __init init_bio(void)
1616 if (!bio_slabs) 1622 if (!bio_slabs)
1617 panic("bio: can't allocate bios\n"); 1623 panic("bio: can't allocate bios\n");
1618 1624
1625 bio_integrity_init();
1619 biovec_init_slabs(); 1626 biovec_init_slabs();
1620 1627
1621 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); 1628 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
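With integrity pools now embedded in each bio_set, a driver-private set picks one up automatically. A hedged usage sketch, error handling trimmed:

	struct bio_set *bs;

	bs = bioset_create(16, 0);	/* internally calls bioset_integrity_create() */
	if (!bs)
		return -ENOMEM;
	/* ... allocate bios from bs ... */
	bioset_free(bs);		/* tears the integrity pool down via bioset_integrity_free() */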
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd17..94dfda24c06e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
25#include <linux/uio.h> 25#include <linux/uio.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/kmemleak.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include "internal.h" 30#include "internal.h"
30 31
@@ -76,7 +77,7 @@ int set_blocksize(struct block_device *bdev, int size)
76 return -EINVAL; 77 return -EINVAL;
77 78
78 /* Size cannot be smaller than the size supported by the device */ 79 /* Size cannot be smaller than the size supported by the device */
79 if (size < bdev_hardsect_size(bdev)) 80 if (size < bdev_logical_block_size(bdev))
80 return -EINVAL; 81 return -EINVAL;
81 82
82 /* Don't change the size if it is same as current */ 83 /* Don't change the size if it is same as current */
@@ -106,7 +107,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
106 107
107int sb_min_blocksize(struct super_block *sb, int size) 108int sb_min_blocksize(struct super_block *sb, int size)
108{ 109{
109 int minsize = bdev_hardsect_size(sb->s_bdev); 110 int minsize = bdev_logical_block_size(sb->s_bdev);
110 if (size < minsize) 111 if (size < minsize)
111 size = minsize; 112 size = minsize;
112 return sb_set_blocksize(sb, size); 113 return sb_set_blocksize(sb, size);
@@ -175,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
175 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 iov, offset, nr_segs, blkdev_get_blocks, NULL);
176} 177}
177 178
179int __sync_blockdev(struct block_device *bdev, int wait)
180{
181 if (!bdev)
182 return 0;
183 if (!wait)
184 return filemap_flush(bdev->bd_inode->i_mapping);
185 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
186}
187
178/* 188/*
179 * Write out and wait upon all the dirty data associated with a block 189 * Write out and wait upon all the dirty data associated with a block
180 * device via its mapping. Does not take the superblock lock. 190 * device via its mapping. Does not take the superblock lock.
181 */ 191 */
182int sync_blockdev(struct block_device *bdev) 192int sync_blockdev(struct block_device *bdev)
183{ 193{
184 int ret = 0; 194 return __sync_blockdev(bdev, 1);
185
186 if (bdev)
187 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
188 return ret;
189} 195}
190EXPORT_SYMBOL(sync_blockdev); 196EXPORT_SYMBOL(sync_blockdev);
191 197
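The wait flag gives __sync_blockdev() two modes, which callers elsewhere in this series can pick between:

	__sync_blockdev(bdev, 0);	/* start writeback only (filemap_flush) */
	__sync_blockdev(bdev, 1);	/* write out and wait; equivalent to sync_blockdev() */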
@@ -198,7 +204,7 @@ int fsync_bdev(struct block_device *bdev)
198{ 204{
199 struct super_block *sb = get_super(bdev); 205 struct super_block *sb = get_super(bdev);
200 if (sb) { 206 if (sb) {
201 int res = fsync_super(sb); 207 int res = sync_filesystem(sb);
202 drop_super(sb); 208 drop_super(sb);
203 return res; 209 return res;
204 } 210 }
@@ -240,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
240 sb->s_frozen = SB_FREEZE_WRITE; 246 sb->s_frozen = SB_FREEZE_WRITE;
241 smp_wmb(); 247 smp_wmb();
242 248
243 __fsync_super(sb); 249 sync_filesystem(sb);
244 250
245 sb->s_frozen = SB_FREEZE_TRANS; 251 sb->s_frozen = SB_FREEZE_TRANS;
246 smp_wmb(); 252 smp_wmb();
@@ -492,6 +498,11 @@ void __init bdev_cache_init(void)
492 bd_mnt = kern_mount(&bd_type); 498 bd_mnt = kern_mount(&bd_type);
493 if (IS_ERR(bd_mnt)) 499 if (IS_ERR(bd_mnt))
494 panic("Cannot create bdev pseudo-fs"); 500 panic("Cannot create bdev pseudo-fs");
501 /*
502 * This vfsmount structure is only used to obtain the
503 * blockdev_superblock, so tell kmemleak not to report it.
504 */
505 kmemleak_not_leak(bd_mnt);
495 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 506 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
496} 507}
497 508
@@ -553,6 +564,16 @@ struct block_device *bdget(dev_t dev)
553 564
554EXPORT_SYMBOL(bdget); 565EXPORT_SYMBOL(bdget);
555 566
567/**
568 * bdgrab -- Grab a reference to an already referenced block device
569 * @bdev: Block device to grab a reference to.
570 */
571struct block_device *bdgrab(struct block_device *bdev)
572{
573 atomic_inc(&bdev->bd_inode->i_count);
574 return bdev;
575}
576
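	/*
	 * Hedged usage sketch, not part of this hunk: bdgrab() performs no
	 * lookup and cannot sleep, so it is only safe while the caller
	 * already knows a reference is held (e.g. under a lock), and it
	 * pairs with bdput():
	 *
	 *	struct block_device *bd = bdgrab(bdev);
	 *	...
	 *	bdput(bd);
	 */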
556long nr_blockdev_pages(void) 577long nr_blockdev_pages(void)
557{ 578{
558 struct block_device *bdev; 579 struct block_device *bdev;
@@ -1111,7 +1132,7 @@ EXPORT_SYMBOL(check_disk_change);
1111 1132
1112void bd_set_size(struct block_device *bdev, loff_t size) 1133void bd_set_size(struct block_device *bdev, loff_t size)
1113{ 1134{
1114 unsigned bsize = bdev_hardsect_size(bdev); 1135 unsigned bsize = bdev_logical_block_size(bdev);
1115 1136
1116 bdev->bd_inode->i_size = size; 1137 bdev->bd_inode->i_size = size;
1117 while (bsize < PAGE_CACHE_SIZE) { 1138 while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 94212844a9bc..a35eb36b32fd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 transaction.o inode.o file.o tree-defrag.o \ 6 transaction.o inode.o file.o tree-defrag.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 9 export.o tree-log.o acl.o free-space-cache.o zlib.o \
10 compression.o delayed-ref.o 10 compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cbba000dccbe..f128427b995b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -29,51 +29,28 @@
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_FS_POSIX_ACL
31 31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{ 33{
45 int size; 34 int size;
46 const char *name; 35 const char *name;
47 char *value = NULL; 36 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl; 37 struct posix_acl *acl;
38
39 acl = get_cached_acl(inode, type);
40 if (acl != ACL_NOT_CACHED)
41 return acl;
49 42
50 switch (type) { 43 switch (type) {
51 case ACL_TYPE_ACCESS: 44 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS; 45 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break; 46 break;
55 case ACL_TYPE_DEFAULT: 47 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT; 48 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break; 49 break;
59 default: 50 default:
60 return ERR_PTR(-EINVAL); 51 BUG();
61 } 52 }
62 53
63 /* Handle the cached NULL acl case without locking */
64 acl = ACCESS_ONCE(*p_acl);
65 if (!acl)
66 return acl;
67
68 spin_lock(&inode->i_lock);
69 acl = *p_acl;
70 if (acl != BTRFS_ACL_NOT_CACHED)
71 acl = posix_acl_dup(acl);
72 spin_unlock(&inode->i_lock);
73
74 if (acl != BTRFS_ACL_NOT_CACHED)
75 return acl;
76
77 size = __btrfs_getxattr(inode, name, "", 0); 54 size = __btrfs_getxattr(inode, name, "", 0);
78 if (size > 0) { 55 if (size > 0) {
79 value = kzalloc(size, GFP_NOFS); 56 value = kzalloc(size, GFP_NOFS);
@@ -82,13 +59,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
82 size = __btrfs_getxattr(inode, name, value, size); 59 size = __btrfs_getxattr(inode, name, value, size);
83 if (size > 0) { 60 if (size > 0) {
84 acl = posix_acl_from_xattr(value, size); 61 acl = posix_acl_from_xattr(value, size);
85 btrfs_update_cached_acl(inode, p_acl, acl); 62 set_cached_acl(inode, type, acl);
86 } 63 }
87 kfree(value); 64 kfree(value);
88 } else if (size == -ENOENT || size == -ENODATA || size == 0) { 65 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
89 /* FIXME, who returns -ENOENT? I think nobody */ 66 /* FIXME, who returns -ENOENT? I think nobody */
90 acl = NULL; 67 acl = NULL;
91 btrfs_update_cached_acl(inode, p_acl, acl); 68 set_cached_acl(inode, type, acl);
92 } else { 69 } else {
93 acl = ERR_PTR(-EIO); 70 acl = ERR_PTR(-EIO);
94 } 71 }
@@ -121,7 +98,6 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
121{ 98{
122 int ret, size = 0; 99 int ret, size = 0;
123 const char *name; 100 const char *name;
124 struct posix_acl **p_acl;
125 char *value = NULL; 101 char *value = NULL;
126 mode_t mode; 102 mode_t mode;
127 103
@@ -141,13 +117,11 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
141 ret = 0; 117 ret = 0;
142 inode->i_mode = mode; 118 inode->i_mode = mode;
143 name = POSIX_ACL_XATTR_ACCESS; 119 name = POSIX_ACL_XATTR_ACCESS;
144 p_acl = &BTRFS_I(inode)->i_acl;
145 break; 120 break;
146 case ACL_TYPE_DEFAULT: 121 case ACL_TYPE_DEFAULT:
147 if (!S_ISDIR(inode->i_mode)) 122 if (!S_ISDIR(inode->i_mode))
148 return acl ? -EINVAL : 0; 123 return acl ? -EINVAL : 0;
149 name = POSIX_ACL_XATTR_DEFAULT; 124 name = POSIX_ACL_XATTR_DEFAULT;
150 p_acl = &BTRFS_I(inode)->i_default_acl;
151 break; 125 break;
152 default: 126 default:
153 return -EINVAL; 127 return -EINVAL;
@@ -172,7 +146,7 @@ out:
172 kfree(value); 146 kfree(value);
173 147
174 if (!ret) 148 if (!ret)
175 btrfs_update_cached_acl(inode, p_acl, acl); 149 set_cached_acl(inode, type, acl);
176 150
177 return ret; 151 return ret;
178} 152}
@@ -351,9 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
351 return 0; 325 return 0;
352} 326}
353 327
354int btrfs_check_acl(struct inode *inode, int mask)
355{
356 return 0;
357}
358
359#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_FS_POSIX_ACL */
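The net effect in acl.c is that btrfs drops its private i_acl/i_default_acl pointers in favour of the generic inode ACL cache. The resulting lookup pattern, restated as a hedged sketch:

	acl = get_cached_acl(inode, type);
	if (acl != ACL_NOT_CACHED)
		return acl;		/* NULL (cached negative) or a referenced ACL */

	/* slow path: read the xattr, then populate the shared cache */
	acl = posix_acl_from_xattr(value, size);
	set_cached_acl(inode, type, acl);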
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 502c3d61de62..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -294,13 +294,13 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
294 INIT_LIST_HEAD(&worker->worker_list); 294 INIT_LIST_HEAD(&worker->worker_list);
295 spin_lock_init(&worker->lock); 295 spin_lock_init(&worker->lock);
296 atomic_set(&worker->num_pending, 0); 296 atomic_set(&worker->num_pending, 0);
297 worker->workers = workers;
297 worker->task = kthread_run(worker_loop, worker, 298 worker->task = kthread_run(worker_loop, worker,
298 "btrfs-%s-%d", workers->name, 299 "btrfs-%s-%d", workers->name,
299 workers->num_workers + i); 300 workers->num_workers + i);
300 worker->workers = workers;
301 if (IS_ERR(worker->task)) { 301 if (IS_ERR(worker->task)) {
302 kfree(worker);
303 ret = PTR_ERR(worker->task); 302 ret = PTR_ERR(worker->task);
303 kfree(worker);
304 goto fail; 304 goto fail;
305 } 305 }
306 306
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
424 * list 424 * list
425 */ 425 */
426 if (worker->idle) { 426 if (worker->idle) {
427 spin_lock_irqsave(&worker->workers->lock, flags); 427 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 428 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 429 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 430 &worker->workers->worker_list);
431 spin_unlock_irqrestore(&worker->workers->lock, flags); 431 spin_unlock(&worker->workers->lock);
432 } 432 }
433 if (!worker->working) { 433 if (!worker->working) {
434 wake = 1; 434 wake = 1;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b30986f00b9d..ea1ea0af8c0e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -53,10 +53,6 @@ struct btrfs_inode {
53 /* used to order data wrt metadata */ 53 /* used to order data wrt metadata */
54 struct btrfs_ordered_inode_tree ordered_tree; 54 struct btrfs_ordered_inode_tree ordered_tree;
55 55
56 /* standard acl pointers */
57 struct posix_acl *i_acl;
58 struct posix_acl *i_default_acl;
59
60 /* for keeping track of orphaned inodes */ 56 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan; 57 struct list_head i_orphan;
62 58
@@ -72,6 +68,9 @@ struct btrfs_inode {
72 */ 68 */
73 struct list_head ordered_operations; 69 struct list_head ordered_operations;
74 70
71 /* node for the red-black tree that links inodes in subvolume root */
72 struct rb_node rb_node;
73
75 /* the space_info for where this inode's data allocations are done */ 74 /* the space_info for where this inode's data allocations are done */
76 struct btrfs_space_info *space_info; 75 struct btrfs_space_info *space_info;
77 76
@@ -154,5 +153,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
154 BTRFS_I(inode)->disk_i_size = size; 153 BTRFS_I(inode)->disk_i_size = size;
155} 154}
156 155
157
158#endif 156#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ab07627084f1..9d8ba4d54a37 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mpage.h> 30#include <linux/mpage.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
@@ -123,7 +122,7 @@ static int check_compressed_csum(struct inode *inode,
123 u32 csum; 122 u32 csum;
124 u32 *cb_sum = &cb->sums; 123 u32 *cb_sum = &cb->sums;
125 124
126 if (btrfs_test_flag(inode, NODATASUM)) 125 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
127 return 0; 126 return 0;
128 127
129 for (i = 0; i < cb->nr_pages; i++) { 128 for (i = 0; i < cb->nr_pages; i++) {
@@ -670,7 +669,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
670 */ 669 */
671 atomic_inc(&cb->pending_bios); 670 atomic_inc(&cb->pending_bios);
672 671
673 if (!btrfs_test_flag(inode, NODATASUM)) { 672 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
674 btrfs_lookup_bio_sums(root, inode, comp_bio, 673 btrfs_lookup_bio_sums(root, inode, comp_bio,
675 sums); 674 sums);
676 } 675 }
@@ -697,7 +696,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
697 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); 696 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
698 BUG_ON(ret); 697 BUG_ON(ret);
699 698
700 if (!btrfs_test_flag(inode, NODATASUM)) 699 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
701 btrfs_lookup_bio_sums(root, inode, comp_bio, sums); 700 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
702 701
703 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 702 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
deleted file mode 100644
index 6e1b3de36700..000000000000
--- a/fs/btrfs/crc32c.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
28#endif
29
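With the wrapper header gone, callers presumably include <linux/crc32c.h> and use the library routine directly; the removed macro shows the two were interchangeable:

	#include <linux/crc32c.h>

	u32 sum = crc32c(seed, data, length);	/* was btrfs_crc32c(seed, data, length) */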
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fedf8b9f03a2..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
197 u32 nritems; 197 u32 nritems;
198 int ret = 0; 198 int ret = 0;
199 int level; 199 int level;
200 struct btrfs_root *new_root; 200 struct btrfs_disk_key disk_key;
201
202 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
203 if (!new_root)
204 return -ENOMEM;
205
206 memcpy(new_root, root, sizeof(*new_root));
207 new_root->root_key.objectid = new_root_objectid;
208 201
209 WARN_ON(root->ref_cows && trans->transid != 202 WARN_ON(root->ref_cows && trans->transid !=
210 root->fs_info->running_transaction->transid); 203 root->fs_info->running_transaction->transid);
@@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
212 205
213 level = btrfs_header_level(buf); 206 level = btrfs_header_level(buf);
214 nritems = btrfs_header_nritems(buf); 207 nritems = btrfs_header_nritems(buf);
208 if (level == 0)
209 btrfs_item_key(buf, &disk_key, 0);
210 else
211 btrfs_node_key(buf, &disk_key, 0);
215 212
216 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, 213 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
217 new_root_objectid, trans->transid, 214 new_root_objectid, &disk_key, level,
218 level, buf->start, 0); 215 buf->start, 0);
219 if (IS_ERR(cow)) { 216 if (IS_ERR(cow))
220 kfree(new_root);
221 return PTR_ERR(cow); 217 return PTR_ERR(cow);
222 }
223 218
224 copy_extent_buffer(cow, buf, 0, 0, cow->len); 219 copy_extent_buffer(cow, buf, 0, 0, cow->len);
225 btrfs_set_header_bytenr(cow, cow->start); 220 btrfs_set_header_bytenr(cow, cow->start);
226 btrfs_set_header_generation(cow, trans->transid); 221 btrfs_set_header_generation(cow, trans->transid);
227 btrfs_set_header_owner(cow, new_root_objectid); 222 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
228 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); 223 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
224 BTRFS_HEADER_FLAG_RELOC);
225 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
226 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
227 else
228 btrfs_set_header_owner(cow, new_root_objectid);
229 229
230 write_extent_buffer(cow, root->fs_info->fsid, 230 write_extent_buffer(cow, root->fs_info->fsid,
231 (unsigned long)btrfs_header_fsid(cow), 231 (unsigned long)btrfs_header_fsid(cow),
232 BTRFS_FSID_SIZE); 232 BTRFS_FSID_SIZE);
233 233
234 WARN_ON(btrfs_header_generation(buf) > trans->transid); 234 WARN_ON(btrfs_header_generation(buf) > trans->transid);
235 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); 235 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
236 kfree(new_root); 236 ret = btrfs_inc_ref(trans, root, cow, 1);
237 else
238 ret = btrfs_inc_ref(trans, root, cow, 0);
237 239
238 if (ret) 240 if (ret)
239 return ret; 241 return ret;
@@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
244} 246}
245 247
246/* 248/*
249 * check if the tree block can be shared by multiple trees
250 */
251int btrfs_block_can_be_shared(struct btrfs_root *root,
252 struct extent_buffer *buf)
253{
254 /*
255 * Tree blocks not in reference counted trees and tree roots
256 * are never shared. If a block was allocated after the last
257 * snapshot and the block was not allocated by tree relocation,
258 * we know the block is not shared.
259 */
260 if (root->ref_cows &&
261 buf != root->node && buf != root->commit_root &&
262 (btrfs_header_generation(buf) <=
263 btrfs_root_last_snapshot(&root->root_item) ||
264 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
265 return 1;
266#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
267 if (root->ref_cows &&
268 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
269 return 1;
270#endif
271 return 0;
272}
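	/*
	 * Hedged worked example of the test above: if
	 * btrfs_root_last_snapshot() reports generation 60, a block written
	 * in generation 50 of a ref-counted tree predates the last snapshot
	 * and may be shared, so COW must fix up its backrefs; a
	 * generation-61 block allocated by the tree itself (no RELOC flag)
	 * is provably private.
	 */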
273
274static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
275 struct btrfs_root *root,
276 struct extent_buffer *buf,
277 struct extent_buffer *cow)
278{
279 u64 refs;
280 u64 owner;
281 u64 flags;
282 u64 new_flags = 0;
283 int ret;
284
285 /*
286 * Backrefs update rules:
287 *
288 * Always use full backrefs for extent pointers in tree block
289 * allocated by tree relocation.
290 *
291 * If a shared tree block is no longer referenced by its owner
292 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
293 * use full backrefs for extent pointers in tree block.
294 *
295 * If a tree block is being relocated
296 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
297 * use full backrefs for extent pointers in tree block.
298 * The reason for this is that some operations (such as drop tree)
299 * are only allowed for blocks that use full backrefs.
300 */
301
302 if (btrfs_block_can_be_shared(root, buf)) {
303 ret = btrfs_lookup_extent_info(trans, root, buf->start,
304 buf->len, &refs, &flags);
305 BUG_ON(ret);
306 BUG_ON(refs == 0);
307 } else {
308 refs = 1;
309 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
310 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
311 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
312 else
313 flags = 0;
314 }
315
316 owner = btrfs_header_owner(buf);
317 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
318 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
319
320 if (refs > 1) {
321 if ((owner == root->root_key.objectid ||
322 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
323 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
324 ret = btrfs_inc_ref(trans, root, buf, 1);
325 BUG_ON(ret);
326
327 if (root->root_key.objectid ==
328 BTRFS_TREE_RELOC_OBJECTID) {
329 ret = btrfs_dec_ref(trans, root, buf, 0);
330 BUG_ON(ret);
331 ret = btrfs_inc_ref(trans, root, cow, 1);
332 BUG_ON(ret);
333 }
334 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
335 } else {
336
337 if (root->root_key.objectid ==
338 BTRFS_TREE_RELOC_OBJECTID)
339 ret = btrfs_inc_ref(trans, root, cow, 1);
340 else
341 ret = btrfs_inc_ref(trans, root, cow, 0);
342 BUG_ON(ret);
343 }
344 if (new_flags != 0) {
345 ret = btrfs_set_disk_extent_flags(trans, root,
346 buf->start,
347 buf->len,
348 new_flags, 0);
349 BUG_ON(ret);
350 }
351 } else {
352 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
353 if (root->root_key.objectid ==
354 BTRFS_TREE_RELOC_OBJECTID)
355 ret = btrfs_inc_ref(trans, root, cow, 1);
356 else
357 ret = btrfs_inc_ref(trans, root, cow, 0);
358 BUG_ON(ret);
359 ret = btrfs_dec_ref(trans, root, buf, 1);
360 BUG_ON(ret);
361 }
362 clean_tree_block(trans, root, buf);
363 }
364 return 0;
365}
366
367/*
247 * does the dirty work in cow of a single block. The parent block (if 368 * does the dirty work in cow of a single block. The parent block (if
248 * supplied) is updated to point to the new cow copy. The new buffer is marked 369 * supplied) is updated to point to the new cow copy. The new buffer is marked
249 * dirty and returned locked. If you modify the block it needs to be marked 370 * dirty and returned locked. If you modify the block it needs to be marked
@@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
262 struct extent_buffer **cow_ret, 383 struct extent_buffer **cow_ret,
263 u64 search_start, u64 empty_size) 384 u64 search_start, u64 empty_size)
264{ 385{
265 u64 parent_start; 386 struct btrfs_disk_key disk_key;
266 struct extent_buffer *cow; 387 struct extent_buffer *cow;
267 u32 nritems;
268 int ret = 0;
269 int level; 388 int level;
270 int unlock_orig = 0; 389 int unlock_orig = 0;
390 u64 parent_start;
271 391
272 if (*cow_ret == buf) 392 if (*cow_ret == buf)
273 unlock_orig = 1; 393 unlock_orig = 1;
274 394
275 btrfs_assert_tree_locked(buf); 395 btrfs_assert_tree_locked(buf);
276 396
277 if (parent)
278 parent_start = parent->start;
279 else
280 parent_start = 0;
281
282 WARN_ON(root->ref_cows && trans->transid != 397 WARN_ON(root->ref_cows && trans->transid !=
283 root->fs_info->running_transaction->transid); 398 root->fs_info->running_transaction->transid);
284 WARN_ON(root->ref_cows && trans->transid != root->last_trans); 399 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
285 400
286 level = btrfs_header_level(buf); 401 level = btrfs_header_level(buf);
287 nritems = btrfs_header_nritems(buf);
288 402
289 cow = btrfs_alloc_free_block(trans, root, buf->len, 403 if (level == 0)
290 parent_start, root->root_key.objectid, 404 btrfs_item_key(buf, &disk_key, 0);
291 trans->transid, level, 405 else
292 search_start, empty_size); 406 btrfs_node_key(buf, &disk_key, 0);
407
408 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
409 if (parent)
410 parent_start = parent->start;
411 else
412 parent_start = 0;
413 } else
414 parent_start = 0;
415
416 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
417 root->root_key.objectid, &disk_key,
418 level, search_start, empty_size);
293 if (IS_ERR(cow)) 419 if (IS_ERR(cow))
294 return PTR_ERR(cow); 420 return PTR_ERR(cow);
295 421
@@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
298 copy_extent_buffer(cow, buf, 0, 0, cow->len); 424 copy_extent_buffer(cow, buf, 0, 0, cow->len);
299 btrfs_set_header_bytenr(cow, cow->start); 425 btrfs_set_header_bytenr(cow, cow->start);
300 btrfs_set_header_generation(cow, trans->transid); 426 btrfs_set_header_generation(cow, trans->transid);
301 btrfs_set_header_owner(cow, root->root_key.objectid); 427 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
302 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); 428 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
429 BTRFS_HEADER_FLAG_RELOC);
430 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
431 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
432 else
433 btrfs_set_header_owner(cow, root->root_key.objectid);
303 434
304 write_extent_buffer(cow, root->fs_info->fsid, 435 write_extent_buffer(cow, root->fs_info->fsid,
305 (unsigned long)btrfs_header_fsid(cow), 436 (unsigned long)btrfs_header_fsid(cow),
306 BTRFS_FSID_SIZE); 437 BTRFS_FSID_SIZE);
307 438
308 WARN_ON(btrfs_header_generation(buf) > trans->transid); 439 update_ref_for_cow(trans, root, buf, cow);
309 if (btrfs_header_generation(buf) != trans->transid) {
310 u32 nr_extents;
311 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
312 if (ret)
313 return ret;
314
315 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
316 WARN_ON(ret);
317 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
318 /*
319 * There are only two places that can drop reference to
320 * tree blocks owned by living reloc trees, one is here,
321 * the other place is btrfs_drop_subtree. In both places,
322 * we check reference count while tree block is locked.
323 * Furthermore, if reference count is one, it won't get
324 * increased by someone else.
325 */
326 u32 refs;
327 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
328 buf->len, &refs);
329 BUG_ON(ret);
330 if (refs == 1) {
331 ret = btrfs_update_ref(trans, root, buf, cow,
332 0, nritems);
333 clean_tree_block(trans, root, buf);
334 } else {
335 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
336 }
337 BUG_ON(ret);
338 } else {
339 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
340 if (ret)
341 return ret;
342 clean_tree_block(trans, root, buf);
343 }
344
345 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
346 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
347 WARN_ON(ret);
348 }
349 440
350 if (buf == root->node) { 441 if (buf == root->node) {
351 WARN_ON(parent && parent != buf); 442 WARN_ON(parent && parent != buf);
443 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
444 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
445 parent_start = buf->start;
446 else
447 parent_start = 0;
352 448
353 spin_lock(&root->node_lock); 449 spin_lock(&root->node_lock);
354 root->node = cow; 450 root->node = cow;
355 extent_buffer_get(cow); 451 extent_buffer_get(cow);
356 spin_unlock(&root->node_lock); 452 spin_unlock(&root->node_lock);
357 453
358 if (buf != root->commit_root) { 454 btrfs_free_extent(trans, root, buf->start, buf->len,
359 btrfs_free_extent(trans, root, buf->start, 455 parent_start, root->root_key.objectid,
360 buf->len, buf->start, 456 level, 0);
361 root->root_key.objectid,
362 btrfs_header_generation(buf),
363 level, 1);
364 }
365 free_extent_buffer(buf); 457 free_extent_buffer(buf);
366 add_root_to_dirty_list(root); 458 add_root_to_dirty_list(root);
367 } else { 459 } else {
460 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
461 parent_start = parent->start;
462 else
463 parent_start = 0;
464
465 WARN_ON(trans->transid != btrfs_header_generation(parent));
368 btrfs_set_node_blockptr(parent, parent_slot, 466 btrfs_set_node_blockptr(parent, parent_slot,
369 cow->start); 467 cow->start);
370 WARN_ON(trans->transid == 0);
371 btrfs_set_node_ptr_generation(parent, parent_slot, 468 btrfs_set_node_ptr_generation(parent, parent_slot,
372 trans->transid); 469 trans->transid);
373 btrfs_mark_buffer_dirty(parent); 470 btrfs_mark_buffer_dirty(parent);
374 WARN_ON(btrfs_header_generation(parent) != trans->transid);
375 btrfs_free_extent(trans, root, buf->start, buf->len, 471 btrfs_free_extent(trans, root, buf->start, buf->len,
376 parent_start, btrfs_header_owner(parent), 472 parent_start, root->root_key.objectid,
377 btrfs_header_generation(parent), level, 1); 473 level, 0);
378 } 474 }
379 if (unlock_orig) 475 if (unlock_orig)
380 btrfs_tree_unlock(buf); 476 btrfs_tree_unlock(buf);
@@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
384 return 0; 480 return 0;
385} 481}
386 482
483static inline int should_cow_block(struct btrfs_trans_handle *trans,
484 struct btrfs_root *root,
485 struct extent_buffer *buf)
486{
487 if (btrfs_header_generation(buf) == trans->transid &&
488 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
489 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
490 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
491 return 0;
492 return 1;
493}
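	/*
	 * Hedged reading of the fast path above: a block already COWed in
	 * this transaction (generation == trans->transid), not yet written
	 * out, and carrying no stale RELOC flag can be modified in place,
	 * which is what lets btrfs_cow_block() and btrfs_search_slot()
	 * below skip the blocking COW path.
	 */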
494
387/* 495/*
388 * cows a single block, see __btrfs_cow_block for the real work. 496 * cows a single block, see __btrfs_cow_block for the real work.
389 * This version of it has extra checks so that a block isn't cow'd more than 497 * This version of it has extra checks so that a block isn't cow'd more than
@@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
411 WARN_ON(1); 519 WARN_ON(1);
412 } 520 }
413 521
414 if (btrfs_header_generation(buf) == trans->transid && 522 if (!should_cow_block(trans, root, buf)) {
415 btrfs_header_owner(buf) == root->root_key.objectid &&
416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
417 *cow_ret = buf; 523 *cow_ret = buf;
418 return 0; 524 return 0;
419 } 525 }
@@ -451,25 +557,13 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
451 557
452 btrfs_disk_key_to_cpu(&k1, disk); 558 btrfs_disk_key_to_cpu(&k1, disk);
453 559
454 if (k1.objectid > k2->objectid) 560 return btrfs_comp_cpu_keys(&k1, k2);
455 return 1;
456 if (k1.objectid < k2->objectid)
457 return -1;
458 if (k1.type > k2->type)
459 return 1;
460 if (k1.type < k2->type)
461 return -1;
462 if (k1.offset > k2->offset)
463 return 1;
464 if (k1.offset < k2->offset)
465 return -1;
466 return 0;
467} 561}
468 562
469/* 563/*
470 * same as comp_keys only with two btrfs_key's 564 * same as comp_keys only with two btrfs_key's
471 */ 565 */
472static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) 566int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
473{ 567{
474 if (k1->objectid > k2->objectid) 568 if (k1->objectid > k2->objectid)
475 return 1; 569 return 1;
@@ -845,6 +939,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
845 return -1; 939 return -1;
846} 940}
847 941
942int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
943 int level, int *slot)
944{
945 return bin_search(eb, key, level, slot);
946}
947
848/* given a node and slot number, this reads the blocks it points to. The 948/* given a node and slot number, this reads the blocks it points to. The
849 * extent buffer is returned with a reference taken (but unlocked). 949 * extent buffer is returned with a reference taken (but unlocked).
850 * NULL is returned on error. 950 * NULL is returned on error.
@@ -921,13 +1021,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
921 root->node = child; 1021 root->node = child;
922 spin_unlock(&root->node_lock); 1022 spin_unlock(&root->node_lock);
923 1023
924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
926 mid->start, child->start,
927 root->root_key.objectid,
928 trans->transid, level - 1);
929 BUG_ON(ret);
930
931 add_root_to_dirty_list(root); 1024 add_root_to_dirty_list(root);
932 btrfs_tree_unlock(child); 1025 btrfs_tree_unlock(child);
933 1026
@@ -938,9 +1031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
938 /* once for the path */ 1031 /* once for the path */
939 free_extent_buffer(mid); 1032 free_extent_buffer(mid);
940 ret = btrfs_free_extent(trans, root, mid->start, mid->len, 1033 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
941 mid->start, root->root_key.objectid, 1034 0, root->root_key.objectid, level, 1);
942 btrfs_header_generation(mid),
943 level, 1);
944 /* once for the root ptr */ 1035 /* once for the root ptr */
945 free_extent_buffer(mid); 1036 free_extent_buffer(mid);
946 return ret; 1037 return ret;
@@ -949,10 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1040 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
950 return 0; 1041 return 0;
951 1042
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
956 if (btrfs_header_nritems(mid) < 2) 1043 if (btrfs_header_nritems(mid) < 2)
957 err_on_enospc = 1; 1044 err_on_enospc = 1;
958 1045
@@ -998,7 +1085,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
998 ret = wret; 1085 ret = wret;
999 if (btrfs_header_nritems(right) == 0) { 1086 if (btrfs_header_nritems(right) == 0) {
1000 u64 bytenr = right->start; 1087 u64 bytenr = right->start;
1001 u64 generation = btrfs_header_generation(parent);
1002 u32 blocksize = right->len; 1088 u32 blocksize = right->len;
1003 1089
1004 clean_tree_block(trans, root, right); 1090 clean_tree_block(trans, root, right);
@@ -1010,9 +1096,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1010 if (wret) 1096 if (wret)
1011 ret = wret; 1097 ret = wret;
1012 wret = btrfs_free_extent(trans, root, bytenr, 1098 wret = btrfs_free_extent(trans, root, bytenr,
1013 blocksize, parent->start, 1099 blocksize, 0,
1014 btrfs_header_owner(parent), 1100 root->root_key.objectid,
1015 generation, level, 1); 1101 level, 0);
1016 if (wret) 1102 if (wret)
1017 ret = wret; 1103 ret = wret;
1018 } else { 1104 } else {
@@ -1047,7 +1133,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1047 } 1133 }
1048 if (btrfs_header_nritems(mid) == 0) { 1134 if (btrfs_header_nritems(mid) == 0) {
1049 /* we've managed to empty the middle node, drop it */ 1135 /* we've managed to empty the middle node, drop it */
1050 u64 root_gen = btrfs_header_generation(parent);
1051 u64 bytenr = mid->start; 1136 u64 bytenr = mid->start;
1052 u32 blocksize = mid->len; 1137 u32 blocksize = mid->len;
1053 1138
@@ -1059,9 +1144,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1059 if (wret) 1144 if (wret)
1060 ret = wret; 1145 ret = wret;
1061 wret = btrfs_free_extent(trans, root, bytenr, blocksize, 1146 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1062 parent->start, 1147 0, root->root_key.objectid,
1063 btrfs_header_owner(parent), 1148 level, 0);
1064 root_gen, level, 1);
1065 if (wret) 1149 if (wret)
1066 ret = wret; 1150 ret = wret;
1067 } else { 1151 } else {
@@ -1437,7 +1521,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1437{ 1521{
1438 int i; 1522 int i;
1439 1523
1440 if (path->keep_locks || path->lowest_level) 1524 if (path->keep_locks)
1441 return; 1525 return;
1442 1526
1443 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1527 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1552,7 +1636,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1552 } 1636 }
1553 b = p->nodes[level]; 1637 b = p->nodes[level];
1554 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1638 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1555 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { 1639 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1556 int sret; 1640 int sret;
1557 1641
1558 sret = reada_for_balance(root, p, level); 1642 sret = reada_for_balance(root, p, level);
@@ -1602,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1602 struct extent_buffer *b; 1686 struct extent_buffer *b;
1603 int slot; 1687 int slot;
1604 int ret; 1688 int ret;
1689 int err;
1605 int level; 1690 int level;
1606 int lowest_unlock = 1; 1691 int lowest_unlock = 1;
1607 u8 lowest_level = 0; 1692 u8 lowest_level = 0;
@@ -1614,10 +1699,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1614 lowest_unlock = 2; 1699 lowest_unlock = 2;
1615 1700
1616again: 1701again:
1617 if (p->skip_locking) 1702 if (p->search_commit_root) {
1618 b = btrfs_root_node(root); 1703 b = root->commit_root;
1619 else 1704 extent_buffer_get(b);
1620 b = btrfs_lock_root_node(root); 1705 if (!p->skip_locking)
1706 btrfs_tree_lock(b);
1707 } else {
1708 if (p->skip_locking)
1709 b = btrfs_root_node(root);
1710 else
1711 b = btrfs_lock_root_node(root);
1712 }
1621 1713
1622 while (b) { 1714 while (b) {
1623 level = btrfs_header_level(b); 1715 level = btrfs_header_level(b);
@@ -1631,26 +1723,22 @@ again:
1631 p->locks[level] = 1; 1723 p->locks[level] = 1;
1632 1724
1633 if (cow) { 1725 if (cow) {
1634 int wret;
1635
1636 /* 1726 /*
1637 * if we don't really need to cow this block 1727 * if we don't really need to cow this block
1638 * then we don't want to set the path blocking, 1728 * then we don't want to set the path blocking,
1639 * so we test it here 1729 * so we test it here
1640 */ 1730 */
1641 if (btrfs_header_generation(b) == trans->transid && 1731 if (!should_cow_block(trans, root, b))
1642 btrfs_header_owner(b) == root->root_key.objectid &&
1643 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1644 goto cow_done; 1732 goto cow_done;
1645 } 1733
1646 btrfs_set_path_blocking(p); 1734 btrfs_set_path_blocking(p);
1647 1735
1648 wret = btrfs_cow_block(trans, root, b, 1736 err = btrfs_cow_block(trans, root, b,
1649 p->nodes[level + 1], 1737 p->nodes[level + 1],
1650 p->slots[level + 1], &b); 1738 p->slots[level + 1], &b);
1651 if (wret) { 1739 if (err) {
1652 free_extent_buffer(b); 1740 free_extent_buffer(b);
1653 ret = wret; 1741 ret = err;
1654 goto done; 1742 goto done;
1655 } 1743 }
1656 } 1744 }
@@ -1689,41 +1777,45 @@ cow_done:
1689 ret = bin_search(b, key, level, &slot); 1777 ret = bin_search(b, key, level, &slot);
1690 1778
1691 if (level != 0) { 1779 if (level != 0) {
1692 if (ret && slot > 0) 1780 int dec = 0;
1781 if (ret && slot > 0) {
1782 dec = 1;
1693 slot -= 1; 1783 slot -= 1;
1784 }
1694 p->slots[level] = slot; 1785 p->slots[level] = slot;
1695 ret = setup_nodes_for_search(trans, root, p, b, level, 1786 err = setup_nodes_for_search(trans, root, p, b, level,
1696 ins_len); 1787 ins_len);
1697 if (ret == -EAGAIN) 1788 if (err == -EAGAIN)
1698 goto again; 1789 goto again;
1699 else if (ret) 1790 if (err) {
1791 ret = err;
1700 goto done; 1792 goto done;
1793 }
1701 b = p->nodes[level]; 1794 b = p->nodes[level];
1702 slot = p->slots[level]; 1795 slot = p->slots[level];
1703 1796
1704 unlock_up(p, level, lowest_unlock); 1797 unlock_up(p, level, lowest_unlock);
1705 1798
1706 /* this is only true while dropping a snapshot */
1707 if (level == lowest_level) { 1799 if (level == lowest_level) {
1708 ret = 0; 1800 if (dec)
1801 p->slots[level]++;
1709 goto done; 1802 goto done;
1710 } 1803 }
1711 1804
1712 ret = read_block_for_search(trans, root, p, 1805 err = read_block_for_search(trans, root, p,
1713 &b, level, slot, key); 1806 &b, level, slot, key);
1714 if (ret == -EAGAIN) 1807 if (err == -EAGAIN)
1715 goto again; 1808 goto again;
1716 1809 if (err) {
1717 if (ret == -EIO) 1810 ret = err;
1718 goto done; 1811 goto done;
1812 }
1719 1813
1720 if (!p->skip_locking) { 1814 if (!p->skip_locking) {
1721 int lret;
1722
1723 btrfs_clear_path_blocking(p, NULL); 1815 btrfs_clear_path_blocking(p, NULL);
1724 lret = btrfs_try_spin_lock(b); 1816 err = btrfs_try_spin_lock(b);
1725 1817
1726 if (!lret) { 1818 if (!err) {
1727 btrfs_set_path_blocking(p); 1819 btrfs_set_path_blocking(p);
1728 btrfs_tree_lock(b); 1820 btrfs_tree_lock(b);
1729 btrfs_clear_path_blocking(p, b); 1821 btrfs_clear_path_blocking(p, b);
@@ -1733,16 +1825,14 @@ cow_done:
1733 p->slots[level] = slot; 1825 p->slots[level] = slot;
1734 if (ins_len > 0 && 1826 if (ins_len > 0 &&
1735 btrfs_leaf_free_space(root, b) < ins_len) { 1827 btrfs_leaf_free_space(root, b) < ins_len) {
1736 int sret;
1737
1738 btrfs_set_path_blocking(p); 1828 btrfs_set_path_blocking(p);
1739 sret = split_leaf(trans, root, key, 1829 err = split_leaf(trans, root, key,
1740 p, ins_len, ret == 0); 1830 p, ins_len, ret == 0);
1741 btrfs_clear_path_blocking(p, NULL); 1831 btrfs_clear_path_blocking(p, NULL);
1742 1832
1743 BUG_ON(sret > 0); 1833 BUG_ON(err > 0);
1744 if (sret) { 1834 if (err) {
1745 ret = sret; 1835 ret = err;
1746 goto done; 1836 goto done;
1747 } 1837 }
1748 } 1838 }
@@ -1764,138 +1854,6 @@ done:
1764 return ret; 1854 return ret;
1765} 1855}
1766 1856
1767int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root,
1769 struct btrfs_key *node_keys,
1770 u64 *nodes, int lowest_level)
1771{
1772 struct extent_buffer *eb;
1773 struct extent_buffer *parent;
1774 struct btrfs_key key;
1775 u64 bytenr;
1776 u64 generation;
1777 u32 blocksize;
1778 int level;
1779 int slot;
1780 int key_match;
1781 int ret;
1782
1783 eb = btrfs_lock_root_node(root);
1784 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1785 BUG_ON(ret);
1786
1787 btrfs_set_lock_blocking(eb);
1788
1789 parent = eb;
1790 while (1) {
1791 level = btrfs_header_level(parent);
1792 if (level == 0 || level <= lowest_level)
1793 break;
1794
1795 ret = bin_search(parent, &node_keys[lowest_level], level,
1796 &slot);
1797 if (ret && slot > 0)
1798 slot--;
1799
1800 bytenr = btrfs_node_blockptr(parent, slot);
1801 if (nodes[level - 1] == bytenr)
1802 break;
1803
1804 blocksize = btrfs_level_size(root, level - 1);
1805 generation = btrfs_node_ptr_generation(parent, slot);
1806 btrfs_node_key_to_cpu(eb, &key, slot);
1807 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1808
1809 if (generation == trans->transid) {
1810 eb = read_tree_block(root, bytenr, blocksize,
1811 generation);
1812 btrfs_tree_lock(eb);
1813 btrfs_set_lock_blocking(eb);
1814 }
1815
1816 /*
1817 * if node keys match and node pointer hasn't been modified
1818 * in the running transaction, we can merge the path. for
1819 * blocks owned by reloc trees, the node pointer check is
1820 * skipped, this is because these blocks are fully controlled
1821 * by the space balance code, no one else can modify them.
1822 */
1823 if (!nodes[level - 1] || !key_match ||
1824 (generation == trans->transid &&
1825 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1826 if (level == 1 || level == lowest_level + 1) {
1827 if (generation == trans->transid) {
1828 btrfs_tree_unlock(eb);
1829 free_extent_buffer(eb);
1830 }
1831 break;
1832 }
1833
1834 if (generation != trans->transid) {
1835 eb = read_tree_block(root, bytenr, blocksize,
1836 generation);
1837 btrfs_tree_lock(eb);
1838 btrfs_set_lock_blocking(eb);
1839 }
1840
1841 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1842 &eb);
1843 BUG_ON(ret);
1844
1845 if (root->root_key.objectid ==
1846 BTRFS_TREE_RELOC_OBJECTID) {
1847 if (!nodes[level - 1]) {
1848 nodes[level - 1] = eb->start;
1849 memcpy(&node_keys[level - 1], &key,
1850 sizeof(node_keys[0]));
1851 } else {
1852 WARN_ON(1);
1853 }
1854 }
1855
1856 btrfs_tree_unlock(parent);
1857 free_extent_buffer(parent);
1858 parent = eb;
1859 continue;
1860 }
1861
1862 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1863 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1864 btrfs_mark_buffer_dirty(parent);
1865
1866 ret = btrfs_inc_extent_ref(trans, root,
1867 nodes[level - 1],
1868 blocksize, parent->start,
1869 btrfs_header_owner(parent),
1870 btrfs_header_generation(parent),
1871 level - 1);
1872 BUG_ON(ret);
1873
1874 /*
1875 * If the block was created in the running transaction,
1876 * it's possible this is the last reference to it, so we
1877 * should drop the subtree.
1878 */
1879 if (generation == trans->transid) {
1880 ret = btrfs_drop_subtree(trans, root, eb, parent);
1881 BUG_ON(ret);
1882 btrfs_tree_unlock(eb);
1883 free_extent_buffer(eb);
1884 } else {
1885 ret = btrfs_free_extent(trans, root, bytenr,
1886 blocksize, parent->start,
1887 btrfs_header_owner(parent),
1888 btrfs_header_generation(parent),
1889 level - 1, 1);
1890 BUG_ON(ret);
1891 }
1892 break;
1893 }
1894 btrfs_tree_unlock(parent);
1895 free_extent_buffer(parent);
1896 return 0;
1897}
1898
1899/* 1857/*
1900 * adjust the pointers going up the tree, starting at level 1858 * adjust the pointers going up the tree, starting at level
1901 * making sure the right key of each node points to 'key'. 1859 * making sure the right key of each node points to 'key'.
@@ -2021,9 +1979,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2021 btrfs_mark_buffer_dirty(src); 1979 btrfs_mark_buffer_dirty(src);
2022 btrfs_mark_buffer_dirty(dst); 1980 btrfs_mark_buffer_dirty(dst);
2023 1981
2024 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
2025 BUG_ON(ret);
2026
2027 return ret; 1982 return ret;
2028} 1983}
2029 1984
@@ -2083,9 +2038,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2083 btrfs_mark_buffer_dirty(src); 2038 btrfs_mark_buffer_dirty(src);
2084 btrfs_mark_buffer_dirty(dst); 2039 btrfs_mark_buffer_dirty(dst);
2085 2040
2086 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
2087 BUG_ON(ret);
2088
2089 return ret; 2041 return ret;
2090} 2042}
2091 2043
@@ -2105,7 +2057,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2105 struct extent_buffer *c; 2057 struct extent_buffer *c;
2106 struct extent_buffer *old; 2058 struct extent_buffer *old;
2107 struct btrfs_disk_key lower_key; 2059 struct btrfs_disk_key lower_key;
2108 int ret;
2109 2060
2110 BUG_ON(path->nodes[level]); 2061 BUG_ON(path->nodes[level]);
2111 BUG_ON(path->nodes[level-1] != root->node); 2062 BUG_ON(path->nodes[level-1] != root->node);
@@ -2117,16 +2068,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2117 btrfs_node_key(lower, &lower_key, 0); 2068 btrfs_node_key(lower, &lower_key, 0);
2118 2069
2119 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2070 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2120 root->root_key.objectid, trans->transid, 2071 root->root_key.objectid, &lower_key,
2121 level, root->node->start, 0); 2072 level, root->node->start, 0);
2122 if (IS_ERR(c)) 2073 if (IS_ERR(c))
2123 return PTR_ERR(c); 2074 return PTR_ERR(c);
2124 2075
2125 memset_extent_buffer(c, 0, 0, root->nodesize); 2076 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2126 btrfs_set_header_nritems(c, 1); 2077 btrfs_set_header_nritems(c, 1);
2127 btrfs_set_header_level(c, level); 2078 btrfs_set_header_level(c, level);
2128 btrfs_set_header_bytenr(c, c->start); 2079 btrfs_set_header_bytenr(c, c->start);
2129 btrfs_set_header_generation(c, trans->transid); 2080 btrfs_set_header_generation(c, trans->transid);
2081 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
2130 btrfs_set_header_owner(c, root->root_key.objectid); 2082 btrfs_set_header_owner(c, root->root_key.objectid);
2131 2083
2132 write_extent_buffer(c, root->fs_info->fsid, 2084 write_extent_buffer(c, root->fs_info->fsid,
@@ -2151,12 +2103,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2151 root->node = c; 2103 root->node = c;
2152 spin_unlock(&root->node_lock); 2104 spin_unlock(&root->node_lock);
2153 2105
2154 ret = btrfs_update_extent_ref(trans, root, lower->start,
2155 lower->len, lower->start, c->start,
2156 root->root_key.objectid,
2157 trans->transid, level - 1);
2158 BUG_ON(ret);
2159
2160 /* the super has an extra ref to root->node */ 2106 /* the super has an extra ref to root->node */
2161 free_extent_buffer(old); 2107 free_extent_buffer(old);
2162 2108
@@ -2233,7 +2179,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2233 ret = insert_new_root(trans, root, path, level + 1); 2179 ret = insert_new_root(trans, root, path, level + 1);
2234 if (ret) 2180 if (ret)
2235 return ret; 2181 return ret;
2236 } else if (!trans->transaction->delayed_refs.flushing) { 2182 } else {
2237 ret = push_nodes_for_insert(trans, root, path, level); 2183 ret = push_nodes_for_insert(trans, root, path, level);
2238 c = path->nodes[level]; 2184 c = path->nodes[level];
2239 if (!ret && btrfs_header_nritems(c) < 2185 if (!ret && btrfs_header_nritems(c) <
@@ -2244,20 +2190,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2244 } 2190 }
2245 2191
2246 c_nritems = btrfs_header_nritems(c); 2192 c_nritems = btrfs_header_nritems(c);
2193 mid = (c_nritems + 1) / 2;
2194 btrfs_node_key(c, &disk_key, mid);
2247 2195
2248 split = btrfs_alloc_free_block(trans, root, root->nodesize, 2196 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2249 path->nodes[level + 1]->start,
2250 root->root_key.objectid, 2197 root->root_key.objectid,
2251 trans->transid, level, c->start, 0); 2198 &disk_key, level, c->start, 0);
2252 if (IS_ERR(split)) 2199 if (IS_ERR(split))
2253 return PTR_ERR(split); 2200 return PTR_ERR(split);
2254 2201
2255 btrfs_set_header_flags(split, btrfs_header_flags(c)); 2202 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2256 btrfs_set_header_level(split, btrfs_header_level(c)); 2203 btrfs_set_header_level(split, btrfs_header_level(c));
2257 btrfs_set_header_bytenr(split, split->start); 2204 btrfs_set_header_bytenr(split, split->start);
2258 btrfs_set_header_generation(split, trans->transid); 2205 btrfs_set_header_generation(split, trans->transid);
2206 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
2259 btrfs_set_header_owner(split, root->root_key.objectid); 2207 btrfs_set_header_owner(split, root->root_key.objectid);
2260 btrfs_set_header_flags(split, 0);
2261 write_extent_buffer(split, root->fs_info->fsid, 2208 write_extent_buffer(split, root->fs_info->fsid,
2262 (unsigned long)btrfs_header_fsid(split), 2209 (unsigned long)btrfs_header_fsid(split),
2263 BTRFS_FSID_SIZE); 2210 BTRFS_FSID_SIZE);
@@ -2265,7 +2212,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2265 (unsigned long)btrfs_header_chunk_tree_uuid(split), 2212 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2266 BTRFS_UUID_SIZE); 2213 BTRFS_UUID_SIZE);
2267 2214
2268 mid = (c_nritems + 1) / 2;
2269 2215
2270 copy_extent_buffer(split, c, 2216 copy_extent_buffer(split, c,
2271 btrfs_node_key_ptr_offset(0), 2217 btrfs_node_key_ptr_offset(0),
@@ -2278,16 +2224,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2278 btrfs_mark_buffer_dirty(c); 2224 btrfs_mark_buffer_dirty(c);
2279 btrfs_mark_buffer_dirty(split); 2225 btrfs_mark_buffer_dirty(split);
2280 2226
2281 btrfs_node_key(split, &disk_key, 0);
2282 wret = insert_ptr(trans, root, path, &disk_key, split->start, 2227 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2283 path->slots[level + 1] + 1, 2228 path->slots[level + 1] + 1,
2284 level + 1); 2229 level + 1);
2285 if (wret) 2230 if (wret)
2286 ret = wret; 2231 ret = wret;
2287 2232
2288 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2289 BUG_ON(ret);
2290
2291 if (path->slots[level] >= mid) { 2233 if (path->slots[level] >= mid) {
2292 path->slots[level] -= mid; 2234 path->slots[level] -= mid;
2293 btrfs_tree_unlock(c); 2235 btrfs_tree_unlock(c);
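
For readers tracking the reshuffle in split_node() above: mid = (c_nritems + 1) / 2 is now computed before the allocation, so the separating key at mid can be handed to btrfs_alloc_free_block() as &disk_key. A minimal userspace sketch of the arithmetic only (illustrative names, not kernel API):

#include <stdio.h>

/* split_node moves items [mid, nritems) into the new right node,
 * so the right node receives nritems - (nritems + 1) / 2 items. */
static int split_point(int nritems)
{
	return (nritems + 1) / 2;
}

int main(void)
{
	int n;

	for (n = 2; n <= 6; n++)
		printf("nritems=%d: mid=%d, right gets %d item(s)\n",
		       n, split_point(n), n - split_point(n));
	return 0;
}
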
@@ -2360,7 +2302,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2360 u32 right_nritems; 2302 u32 right_nritems;
2361 u32 data_end; 2303 u32 data_end;
2362 u32 this_item_size; 2304 u32 this_item_size;
2363 int ret;
2364 2305
2365 if (empty) 2306 if (empty)
2366 nr = 0; 2307 nr = 0;
@@ -2473,9 +2414,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2473 btrfs_mark_buffer_dirty(left); 2414 btrfs_mark_buffer_dirty(left);
2474 btrfs_mark_buffer_dirty(right); 2415 btrfs_mark_buffer_dirty(right);
2475 2416
2476 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2477 BUG_ON(ret);
2478
2479 btrfs_item_key(right, &disk_key, 0); 2417 btrfs_item_key(right, &disk_key, 0);
2480 btrfs_set_node_key(upper, &disk_key, slot + 1); 2418 btrfs_set_node_key(upper, &disk_key, slot + 1);
2481 btrfs_mark_buffer_dirty(upper); 2419 btrfs_mark_buffer_dirty(upper);
@@ -2720,10 +2658,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2720 if (right_nritems) 2658 if (right_nritems)
2721 btrfs_mark_buffer_dirty(right); 2659 btrfs_mark_buffer_dirty(right);
2722 2660
2723 ret = btrfs_update_ref(trans, root, right, left,
2724 old_left_nritems, push_items);
2725 BUG_ON(ret);
2726
2727 btrfs_item_key(right, &disk_key, 0); 2661 btrfs_item_key(right, &disk_key, 0);
2728 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2662 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2729 if (wret) 2663 if (wret)
@@ -2880,9 +2814,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2880 btrfs_mark_buffer_dirty(l); 2814 btrfs_mark_buffer_dirty(l);
2881 BUG_ON(path->slots[0] != slot); 2815 BUG_ON(path->slots[0] != slot);
2882 2816
2883 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2884 BUG_ON(ret);
2885
2886 if (mid <= slot) { 2817 if (mid <= slot) {
2887 btrfs_tree_unlock(path->nodes[0]); 2818 btrfs_tree_unlock(path->nodes[0]);
2888 free_extent_buffer(path->nodes[0]); 2819 free_extent_buffer(path->nodes[0]);
@@ -2911,6 +2842,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2911 struct btrfs_path *path, int data_size, 2842 struct btrfs_path *path, int data_size,
2912 int extend) 2843 int extend)
2913{ 2844{
2845 struct btrfs_disk_key disk_key;
2914 struct extent_buffer *l; 2846 struct extent_buffer *l;
2915 u32 nritems; 2847 u32 nritems;
2916 int mid; 2848 int mid;
@@ -2918,12 +2850,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2918 struct extent_buffer *right; 2850 struct extent_buffer *right;
2919 int ret = 0; 2851 int ret = 0;
2920 int wret; 2852 int wret;
2921 int double_split; 2853 int split;
2922 int num_doubles = 0; 2854 int num_doubles = 0;
2923 2855
2924 /* first try to make some room by pushing left and right */ 2856 /* first try to make some room by pushing left and right */
2925 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && 2857 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2926 !trans->transaction->delayed_refs.flushing) {
2927 wret = push_leaf_right(trans, root, path, data_size, 0); 2858 wret = push_leaf_right(trans, root, path, data_size, 0);
2928 if (wret < 0) 2859 if (wret < 0)
2929 return wret; 2860 return wret;
@@ -2945,16 +2876,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2945 return ret; 2876 return ret;
2946 } 2877 }
2947again: 2878again:
2948 double_split = 0; 2879 split = 1;
2949 l = path->nodes[0]; 2880 l = path->nodes[0];
2950 slot = path->slots[0]; 2881 slot = path->slots[0];
2951 nritems = btrfs_header_nritems(l); 2882 nritems = btrfs_header_nritems(l);
2952 mid = (nritems + 1) / 2; 2883 mid = (nritems + 1) / 2;
2953 2884
2954 right = btrfs_alloc_free_block(trans, root, root->leafsize, 2885 if (mid <= slot) {
2955 path->nodes[1]->start, 2886 if (nritems == 1 ||
2887 leaf_space_used(l, mid, nritems - mid) + data_size >
2888 BTRFS_LEAF_DATA_SIZE(root)) {
2889 if (slot >= nritems) {
2890 split = 0;
2891 } else {
2892 mid = slot;
2893 if (mid != nritems &&
2894 leaf_space_used(l, mid, nritems - mid) +
2895 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2896 split = 2;
2897 }
2898 }
2899 }
2900 } else {
2901 if (leaf_space_used(l, 0, mid) + data_size >
2902 BTRFS_LEAF_DATA_SIZE(root)) {
2903 if (!extend && data_size && slot == 0) {
2904 split = 0;
2905 } else if ((extend || !data_size) && slot == 0) {
2906 mid = 1;
2907 } else {
2908 mid = slot;
2909 if (mid != nritems &&
2910 leaf_space_used(l, mid, nritems - mid) +
2911 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2912 split = 2;
2913 }
2914 }
2915 }
2916 }
2917
2918 if (split == 0)
2919 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2920 else
2921 btrfs_item_key(l, &disk_key, mid);
2922
2923 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2956 root->root_key.objectid, 2924 root->root_key.objectid,
2957 trans->transid, 0, l->start, 0); 2925 &disk_key, 0, l->start, 0);
2958 if (IS_ERR(right)) { 2926 if (IS_ERR(right)) {
2959 BUG_ON(1); 2927 BUG_ON(1);
2960 return PTR_ERR(right); 2928 return PTR_ERR(right);
@@ -2963,6 +2931,7 @@ again:
2963 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2931 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2964 btrfs_set_header_bytenr(right, right->start); 2932 btrfs_set_header_bytenr(right, right->start);
2965 btrfs_set_header_generation(right, trans->transid); 2933 btrfs_set_header_generation(right, trans->transid);
2934 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
2966 btrfs_set_header_owner(right, root->root_key.objectid); 2935 btrfs_set_header_owner(right, root->root_key.objectid);
2967 btrfs_set_header_level(right, 0); 2936 btrfs_set_header_level(right, 0);
2968 write_extent_buffer(right, root->fs_info->fsid, 2937 write_extent_buffer(right, root->fs_info->fsid,
@@ -2973,79 +2942,47 @@ again:
2973 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2942 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2974 BTRFS_UUID_SIZE); 2943 BTRFS_UUID_SIZE);
2975 2944
2976 if (mid <= slot) { 2945 if (split == 0) {
2977 if (nritems == 1 || 2946 if (mid <= slot) {
2978 leaf_space_used(l, mid, nritems - mid) + data_size > 2947 btrfs_set_header_nritems(right, 0);
2979 BTRFS_LEAF_DATA_SIZE(root)) { 2948 wret = insert_ptr(trans, root, path,
2980 if (slot >= nritems) { 2949 &disk_key, right->start,
2981 struct btrfs_disk_key disk_key; 2950 path->slots[1] + 1, 1);
2982 2951 if (wret)
2983 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2952 ret = wret;
2984 btrfs_set_header_nritems(right, 0);
2985 wret = insert_ptr(trans, root, path,
2986 &disk_key, right->start,
2987 path->slots[1] + 1, 1);
2988 if (wret)
2989 ret = wret;
2990 2953
2991 btrfs_tree_unlock(path->nodes[0]); 2954 btrfs_tree_unlock(path->nodes[0]);
2992 free_extent_buffer(path->nodes[0]); 2955 free_extent_buffer(path->nodes[0]);
2993 path->nodes[0] = right; 2956 path->nodes[0] = right;
2994 path->slots[0] = 0; 2957 path->slots[0] = 0;
2995 path->slots[1] += 1; 2958 path->slots[1] += 1;
2996 btrfs_mark_buffer_dirty(right); 2959 } else {
2997 return ret; 2960 btrfs_set_header_nritems(right, 0);
2998 } 2961 wret = insert_ptr(trans, root, path,
2999 mid = slot; 2962 &disk_key,
3000 if (mid != nritems && 2963 right->start,
3001 leaf_space_used(l, mid, nritems - mid) + 2964 path->slots[1], 1);
3002 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 2965 if (wret)
3003 double_split = 1; 2966 ret = wret;
3004 } 2967 btrfs_tree_unlock(path->nodes[0]);
3005 } 2968 free_extent_buffer(path->nodes[0]);
3006 } else { 2969 path->nodes[0] = right;
3007 if (leaf_space_used(l, 0, mid) + data_size > 2970 path->slots[0] = 0;
3008 BTRFS_LEAF_DATA_SIZE(root)) { 2971 if (path->slots[1] == 0) {
3009 if (!extend && data_size && slot == 0) { 2972 wret = fixup_low_keys(trans, root,
3010 struct btrfs_disk_key disk_key; 2973 path, &disk_key, 1);
3011
3012 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3013 btrfs_set_header_nritems(right, 0);
3014 wret = insert_ptr(trans, root, path,
3015 &disk_key,
3016 right->start,
3017 path->slots[1], 1);
3018 if (wret) 2974 if (wret)
3019 ret = wret; 2975 ret = wret;
3020 btrfs_tree_unlock(path->nodes[0]);
3021 free_extent_buffer(path->nodes[0]);
3022 path->nodes[0] = right;
3023 path->slots[0] = 0;
3024 if (path->slots[1] == 0) {
3025 wret = fixup_low_keys(trans, root,
3026 path, &disk_key, 1);
3027 if (wret)
3028 ret = wret;
3029 }
3030 btrfs_mark_buffer_dirty(right);
3031 return ret;
3032 } else if ((extend || !data_size) && slot == 0) {
3033 mid = 1;
3034 } else {
3035 mid = slot;
3036 if (mid != nritems &&
3037 leaf_space_used(l, mid, nritems - mid) +
3038 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3039 double_split = 1;
3040 }
3041 } 2976 }
3042 } 2977 }
2978 btrfs_mark_buffer_dirty(right);
2979 return ret;
3043 } 2980 }
3044 2981
3045 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); 2982 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3046 BUG_ON(ret); 2983 BUG_ON(ret);
3047 2984
3048 if (double_split) { 2985 if (split == 2) {
3049 BUG_ON(num_doubles != 0); 2986 BUG_ON(num_doubles != 0);
3050 num_doubles++; 2987 num_doubles++;
3051 goto again; 2988 goto again;
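
For orientation in the rewritten split_leaf() above: the boolean double_split is replaced by a three-valued split that is decided before the right leaf is allocated, which is what allows the separating key to be passed to the allocator up front. A hedged summary (the enum names are illustrative, not kernel definitions):

/* Meaning of 'split' in the new split_leaf():
 *
 *   split == 0  the new item gets a brand-new empty leaf: insert_ptr()
 *               links it at slots[1] (or slots[1] + 1) and the function
 *               returns without copying any items.
 *   split == 1  the common case: copy_for_split() moves items
 *               [mid, nritems) of leaf l into the new right leaf.
 *   split == 2  one split cannot make enough room at the target slot,
 *               so after copy_for_split() control jumps back to 'again'
 *               for exactly one more pass (BUG_ON(num_doubles != 0)).
 */
enum { LEAF_SPLIT_NONE = 0, LEAF_SPLIT_ONE = 1, LEAF_SPLIT_DOUBLE = 2 };
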
@@ -3447,7 +3384,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3447 /* figure out how many keys we can insert in here */ 3384 /* figure out how many keys we can insert in here */
3448 total_data = data_size[0]; 3385 total_data = data_size[0];
3449 for (i = 1; i < nr; i++) { 3386 for (i = 1; i < nr; i++) {
3450 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) 3387 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3451 break; 3388 break;
3452 total_data += data_size[i]; 3389 total_data += data_size[i];
3453 } 3390 }
@@ -3745,9 +3682,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3745 3682
3746/* 3683/*
3747 * a helper function to delete the leaf pointed to by path->slots[1] and 3684 * a helper function to delete the leaf pointed to by path->slots[1] and
3748 * path->nodes[1]. bytenr is the node block pointer, but since the callers 3685 * path->nodes[1].
3749 * already know it, it is faster to have them pass it down than to
3750 * read it out of the node again.
3751 * 3686 *
3752 * This deletes the pointer in path->nodes[1] and frees the leaf 3687 * This deletes the pointer in path->nodes[1] and frees the leaf
3753 * block extent. zero is returned if it all worked out, < 0 otherwise. 3688 * block extent. zero is returned if it all worked out, < 0 otherwise.
@@ -3755,15 +3690,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3755 * The path must have already been setup for deleting the leaf, including 3690 * The path must have already been setup for deleting the leaf, including
3756 * all the proper balancing. path->nodes[1] must be locked. 3691 * all the proper balancing. path->nodes[1] must be locked.
3757 */ 3692 */
3758noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 3693static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3759 struct btrfs_root *root, 3694 struct btrfs_root *root,
3760 struct btrfs_path *path, u64 bytenr) 3695 struct btrfs_path *path,
3696 struct extent_buffer *leaf)
3761{ 3697{
3762 int ret; 3698 int ret;
3763 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3764 u64 parent_start = path->nodes[1]->start;
3765 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3766 3699
3700 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3767 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3701 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3768 if (ret) 3702 if (ret)
3769 return ret; 3703 return ret;
@@ -3774,10 +3708,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3774 */ 3708 */
3775 btrfs_unlock_up_safe(path, 0); 3709 btrfs_unlock_up_safe(path, 0);
3776 3710
3777 ret = btrfs_free_extent(trans, root, bytenr, 3711 ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
3778 btrfs_level_size(root, 0), 3712 0, root->root_key.objectid, 0, 0);
3779 parent_start, parent_owner,
3780 root_gen, 0, 1);
3781 return ret; 3713 return ret;
3782} 3714}
3783/* 3715/*
@@ -3845,7 +3777,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3845 if (leaf == root->node) { 3777 if (leaf == root->node) {
3846 btrfs_set_header_level(leaf, 0); 3778 btrfs_set_header_level(leaf, 0);
3847 } else { 3779 } else {
3848 ret = btrfs_del_leaf(trans, root, path, leaf->start); 3780 ret = btrfs_del_leaf(trans, root, path, leaf);
3849 BUG_ON(ret); 3781 BUG_ON(ret);
3850 } 3782 }
3851 } else { 3783 } else {
@@ -3861,8 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3861 } 3793 }
3862 3794
3863 /* delete the leaf if it is mostly empty */ 3795 /* delete the leaf if it is mostly empty */
3864 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && 3796 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3865 !trans->transaction->delayed_refs.flushing) {
3866 /* push_leaf_left fixes the path. 3797 /* push_leaf_left fixes the path.
3867 * make sure the path still points to our leaf 3798 * make sure the path still points to our leaf
3868 * for possible call to del_ptr below 3799 * for possible call to del_ptr below
@@ -3884,8 +3815,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3884 3815
3885 if (btrfs_header_nritems(leaf) == 0) { 3816 if (btrfs_header_nritems(leaf) == 0) {
3886 path->slots[1] = slot; 3817 path->slots[1] = slot;
3887 ret = btrfs_del_leaf(trans, root, path, 3818 ret = btrfs_del_leaf(trans, root, path, leaf);
3888 leaf->start);
3889 BUG_ON(ret); 3819 BUG_ON(ret);
3890 free_extent_buffer(leaf); 3820 free_extent_buffer(leaf);
3891 } else { 3821 } else {
@@ -4098,10 +4028,9 @@ out:
4098 * calling this function. 4028 * calling this function.
4099 */ 4029 */
4100int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4030int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4101 struct btrfs_key *key, int lowest_level, 4031 struct btrfs_key *key, int level,
4102 int cache_only, u64 min_trans) 4032 int cache_only, u64 min_trans)
4103{ 4033{
4104 int level = lowest_level;
4105 int slot; 4034 int slot;
4106 struct extent_buffer *c; 4035 struct extent_buffer *c;
4107 4036
@@ -4114,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4114 c = path->nodes[level]; 4043 c = path->nodes[level];
4115next: 4044next:
4116 if (slot >= btrfs_header_nritems(c)) { 4045 if (slot >= btrfs_header_nritems(c)) {
4117 level++; 4046 int ret;
4118 if (level == BTRFS_MAX_LEVEL) 4047 int orig_lowest;
4048 struct btrfs_key cur_key;
4049 if (level + 1 >= BTRFS_MAX_LEVEL ||
4050 !path->nodes[level + 1])
4119 return 1; 4051 return 1;
4120 continue; 4052
4053 if (path->locks[level + 1]) {
4054 level++;
4055 continue;
4056 }
4057
4058 slot = btrfs_header_nritems(c) - 1;
4059 if (level == 0)
4060 btrfs_item_key_to_cpu(c, &cur_key, slot);
4061 else
4062 btrfs_node_key_to_cpu(c, &cur_key, slot);
4063
4064 orig_lowest = path->lowest_level;
4065 btrfs_release_path(root, path);
4066 path->lowest_level = level;
4067 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4068 0, 0);
4069 path->lowest_level = orig_lowest;
4070 if (ret < 0)
4071 return ret;
4072
4073 c = path->nodes[level];
4074 slot = path->slots[level];
4075 if (ret == 0)
4076 slot++;
4077 goto next;
4121 } 4078 }
4079
4122 if (level == 0) 4080 if (level == 0)
4123 btrfs_item_key_to_cpu(c, key, slot); 4081 btrfs_item_key_to_cpu(c, key, slot);
4124 else { 4082 else {
@@ -4202,7 +4160,8 @@ again:
4202 * advance the path if there are now more items available. 4160 * advance the path if there are now more items available.
4203 */ 4161 */
4204 if (nritems > 0 && path->slots[0] < nritems - 1) { 4162 if (nritems > 0 && path->slots[0] < nritems - 1) {
4205 path->slots[0]++; 4163 if (ret == 0)
4164 path->slots[0]++;
4206 ret = 0; 4165 ret = 0;
4207 goto done; 4166 goto done;
4208 } 4167 }
@@ -4334,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
4334 path->slots[0]--; 4293 path->slots[0]--;
4335 4294
4336 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4337 if (found_key.type == type)
4338 return 0;
4339 if (found_key.objectid < min_objectid) 4296 if (found_key.objectid < min_objectid)
4340 break; 4297 break;
4298 if (found_key.type == type)
4299 return 0;
4341 if (found_key.objectid == min_objectid && 4300 if (found_key.objectid == min_objectid &&
4342 found_key.type < type) 4301 found_key.type < type)
4343 break; 4302 break;
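
The btrfs_previous_item() reordering just above is a correctness fix rather than a cleanup: the objectid lower bound has to be tested before the type match, or a key whose type matches but whose objectid has already dropped below min_objectid would be reported as a hit. A standalone restatement of the corrected predicate (toy key struct and names, purely illustrative):

#include <stdbool.h>
#include <stdint.h>

struct toy_key { uint64_t objectid; uint8_t type; };

/* Returns true when the backwards walk should stop; *match tells the
 * caller whether it stopped on a valid hit. The bound check comes first. */
static bool previous_item_stop(const struct toy_key *found,
			       uint64_t min_objectid, uint8_t type,
			       bool *match)
{
	*match = false;
	if (found->objectid < min_objectid)
		return true;			/* walked out of range */
	if (found->type == type) {
		*match = true;			/* hit inside the range */
		return true;
	}
	return found->objectid == min_objectid && found->type < type;
}
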
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4414a5d9983a..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,10 +41,10 @@ struct btrfs_ordered_sum;
41 41
42#define BTRFS_MAGIC "_BHRfS_M" 42#define BTRFS_MAGIC "_BHRfS_M"
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#define BTRFS_MAX_LEVEL 8 44#define BTRFS_MAX_LEVEL 8
47 45
46#define BTRFS_COMPAT_EXTENT_TREE_V0
47
48/* 48/*
49 * files bigger than this get some pre-flushing when they are added 49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total 50 * to the ordered operations list. That way we limit the total
@@ -267,7 +267,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
267} 267}
268 268
269#define BTRFS_FSID_SIZE 16 269#define BTRFS_FSID_SIZE 16
270#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) 270#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
271#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
272#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
273#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
274
275#define BTRFS_BACKREF_REV_MAX 256
276#define BTRFS_BACKREF_REV_SHIFT 56
277#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
278 BTRFS_BACKREF_REV_SHIFT)
279
280#define BTRFS_OLD_BACKREF_REV 0
281#define BTRFS_MIXED_BACKREF_REV 1
271 282
272/* 283/*
273 * every tree block (leaf or node) starts with this header. 284 * every tree block (leaf or node) starts with this header.
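
The new BTRFS_BACKREF_REV_* constants carve the top byte out of the 64-bit header flags to version the backref scheme; btrfs_set_header_backref_rev()/btrfs_header_backref_rev() later in this diff do the actual packing. A self-contained check of the bit arithmetic (userspace types, same shift and mask values as above):

#include <assert.h>
#include <stdint.h>

#define BACKREF_REV_MAX		256
#define BACKREF_REV_SHIFT	56
#define BACKREF_REV_MASK	(((uint64_t)BACKREF_REV_MAX - 1) << \
				 BACKREF_REV_SHIFT)
#define MIXED_BACKREF_REV	1

static uint64_t set_backref_rev(uint64_t flags, int rev)
{
	flags &= ~BACKREF_REV_MASK;
	return flags | ((uint64_t)rev << BACKREF_REV_SHIFT);
}

int main(void)
{
	uint64_t flags = 0x3;	/* say, WRITTEN | RELOC */

	flags = set_backref_rev(flags, MIXED_BACKREF_REV);
	assert((int)(flags >> BACKREF_REV_SHIFT) == MIXED_BACKREF_REV);
	assert((flags & 0x3) == 0x3);	/* low flag bits untouched */
	return 0;
}
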
@@ -296,7 +307,6 @@ struct btrfs_header {
296 sizeof(struct btrfs_item) - \ 307 sizeof(struct btrfs_item) - \
297 sizeof(struct btrfs_file_extent_item)) 308 sizeof(struct btrfs_file_extent_item))
298 309
299#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
300 310
301/* 311/*
302 * this is a very generous portion of the super block, giving us 312 * this is a very generous portion of the super block, giving us
@@ -355,9 +365,12 @@ struct btrfs_super_block {
355 * Compat flags that we support. If any incompat flags are set other than the 365 * Compat flags that we support. If any incompat flags are set other than the
356 * ones specified below then we will fail to mount 366 * ones specified below then we will fail to mount
357 */ 367 */
358#define BTRFS_FEATURE_COMPAT_SUPP 0x0 368#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
359#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 369
360#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 370#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
371#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
372#define BTRFS_FEATURE_INCOMPAT_SUPP \
373 BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
361 374
362/* 375/*
363 * A leaf is full of items. offset and size tell us where to find 376 * A leaf is full of items. offset and size tell us where to find
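
MIXED_BACKREF is the first real incompat bit, so the *_SUPP masks switch from 0x0 to genuine masks. The usual consequence at mount time is rejecting any superblock that advertises bits outside the supported mask; a sketch of that check (not the kernel's actual open_ctree() code):

#include <stdint.h>
#include <stdio.h>

#define FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
#define FEATURE_INCOMPAT_SUPP		FEATURE_INCOMPAT_MIXED_BACKREF

/* Returns 0 when every incompat bit set on disk is understood. */
static int check_incompat(uint64_t incompat)
{
	uint64_t unsupported = incompat & ~FEATURE_INCOMPAT_SUPP;

	if (unsupported) {
		fprintf(stderr, "unsupported incompat features: 0x%llx\n",
			(unsigned long long)unsupported);
		return -1;
	}
	return 0;
}
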
@@ -421,23 +434,65 @@ struct btrfs_path {
421 unsigned int keep_locks:1; 434 unsigned int keep_locks:1;
422 unsigned int skip_locking:1; 435 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1; 436 unsigned int leave_spinning:1;
437 unsigned int search_commit_root:1;
424}; 438};
425 439
426/* 440/*
427 * items in the extent btree are used to record the objectid of the 441 * items in the extent btree are used to record the objectid of the
428 * owner of the block and the number of references 442 * owner of the block and the number of references
429 */ 443 */
444
430struct btrfs_extent_item { 445struct btrfs_extent_item {
446 __le64 refs;
447 __le64 generation;
448 __le64 flags;
449} __attribute__ ((__packed__));
450
451struct btrfs_extent_item_v0 {
431 __le32 refs; 452 __le32 refs;
432} __attribute__ ((__packed__)); 453} __attribute__ ((__packed__));
433 454
434struct btrfs_extent_ref { 455#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
456 sizeof(struct btrfs_item))
457
458#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
459#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
460
461/* following flags only apply to tree blocks */
462
463/* use full backrefs for extent pointers in the block */
464#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
465
466struct btrfs_tree_block_info {
467 struct btrfs_disk_key key;
468 u8 level;
469} __attribute__ ((__packed__));
470
471struct btrfs_extent_data_ref {
472 __le64 root;
473 __le64 objectid;
474 __le64 offset;
475 __le32 count;
476} __attribute__ ((__packed__));
477
478struct btrfs_shared_data_ref {
479 __le32 count;
480} __attribute__ ((__packed__));
481
482struct btrfs_extent_inline_ref {
483 u8 type;
484 __le64 offset;
485} __attribute__ ((__packed__));
486
487/* old style backrefs item */
488struct btrfs_extent_ref_v0 {
435 __le64 root; 489 __le64 root;
436 __le64 generation; 490 __le64 generation;
437 __le64 objectid; 491 __le64 objectid;
438 __le32 num_refs; 492 __le32 count;
439} __attribute__ ((__packed__)); 493} __attribute__ ((__packed__));
440 494
495
441/* dev extents record free space on individual devices. The owner 496/* dev extents record free space on individual devices. The owner
442 * field points back to the chunk allocation mapping tree that allocated 497 * field points back to the chunk allocation mapping tree that allocated
443 * the extent. The chunk tree uuid field is a way to double check the owner 498 * the extent. The chunk tree uuid field is a way to double check the owner
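
Since every new on-disk structure above is packed, their byte sizes can be sanity-checked on the host, which helps when reasoning about BTRFS_MAX_EXTENT_ITEM_SIZE and inline backref layout. The structs below are local redefinitions with fixed-width types, for illustration only:

#include <stdint.h>
#include <stdio.h>

struct disk_key { uint64_t objectid; uint8_t type; uint64_t offset; }
	__attribute__((__packed__));
struct extent_item { uint64_t refs, generation, flags; }
	__attribute__((__packed__));
struct tree_block_info { struct disk_key key; uint8_t level; }
	__attribute__((__packed__));
struct extent_data_ref { uint64_t root, objectid, offset; uint32_t count; }
	__attribute__((__packed__));
struct extent_inline_ref { uint8_t type; uint64_t offset; }
	__attribute__((__packed__));

int main(void)
{
	printf("extent_item:       %zu\n", sizeof(struct extent_item));       /* 24 */
	printf("tree_block_info:   %zu\n", sizeof(struct tree_block_info));   /* 18 */
	printf("extent_data_ref:   %zu\n", sizeof(struct extent_data_ref));   /* 28 */
	printf("extent_inline_ref: %zu\n", sizeof(struct extent_inline_ref)); /* 9  */
	return 0;
}
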
@@ -634,6 +689,7 @@ struct btrfs_space_info {
634 struct list_head block_groups; 689 struct list_head block_groups;
635 spinlock_t lock; 690 spinlock_t lock;
636 struct rw_semaphore groups_sem; 691 struct rw_semaphore groups_sem;
692 atomic_t caching_threads;
637}; 693};
638 694
639/* 695/*
@@ -652,6 +708,9 @@ struct btrfs_free_cluster {
652 /* first extent starting offset */ 708 /* first extent starting offset */
653 u64 window_start; 709 u64 window_start;
654 710
711 /* if this cluster simply points at a bitmap in the block group */
712 bool points_to_bitmap;
713
655 struct btrfs_block_group_cache *block_group; 714 struct btrfs_block_group_cache *block_group;
656 /* 715 /*
657 * when a cluster is allocated from a block group, we put the 716 * when a cluster is allocated from a block group, we put the
@@ -661,24 +720,37 @@ struct btrfs_free_cluster {
661 struct list_head block_group_list; 720 struct list_head block_group_list;
662}; 721};
663 722
723enum btrfs_caching_type {
724 BTRFS_CACHE_NO = 0,
725 BTRFS_CACHE_STARTED = 1,
726 BTRFS_CACHE_FINISHED = 2,
727};
728
664struct btrfs_block_group_cache { 729struct btrfs_block_group_cache {
665 struct btrfs_key key; 730 struct btrfs_key key;
666 struct btrfs_block_group_item item; 731 struct btrfs_block_group_item item;
732 struct btrfs_fs_info *fs_info;
667 spinlock_t lock; 733 spinlock_t lock;
668 struct mutex cache_mutex;
669 u64 pinned; 734 u64 pinned;
670 u64 reserved; 735 u64 reserved;
671 u64 flags; 736 u64 flags;
672 int cached; 737 u64 sectorsize;
738 int extents_thresh;
739 int free_extents;
740 int total_bitmaps;
673 int ro; 741 int ro;
674 int dirty; 742 int dirty;
675 743
744 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached;
747
676 struct btrfs_space_info *space_info; 748 struct btrfs_space_info *space_info;
677 749
678 /* free space cache stuff */ 750 /* free space cache stuff */
679 spinlock_t tree_lock; 751 spinlock_t tree_lock;
680 struct rb_root free_space_bytes;
681 struct rb_root free_space_offset; 752 struct rb_root free_space_offset;
753 u64 free_space;
682 754
683 /* block group cache stuff */ 755 /* block group cache stuff */
684 struct rb_node cache_node; 756 struct rb_node cache_node;
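
The new cached field plus the caching_q wait queue turn block group free-space caching into an asynchronous state machine (BTRFS_CACHE_NO -> STARTED -> FINISHED, per the enum above). A much-simplified userspace analogue of a consumer waiting for the FINISHED state, with a pthread condvar standing in for the kernel wait queue (all names illustrative):

#include <pthread.h>

enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FINISHED };

struct toy_block_group {
	enum cache_state cached;
	pthread_mutex_t lock;
	pthread_cond_t caching_q;	/* stands in for wait_queue_head_t */
};

static void wait_for_cache(struct toy_block_group *bg)
{
	pthread_mutex_lock(&bg->lock);
	while (bg->cached != CACHE_FINISHED)
		pthread_cond_wait(&bg->caching_q, &bg->lock);
	pthread_mutex_unlock(&bg->lock);
}
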
@@ -695,12 +767,7 @@ struct btrfs_block_group_cache {
695 struct list_head cluster_list; 767 struct list_head cluster_list;
696}; 768};
697 769
698struct btrfs_leaf_ref_tree { 770struct reloc_control;
699 struct rb_root root;
700 struct list_head list;
701 spinlock_t lock;
702};
703
704struct btrfs_device; 771struct btrfs_device;
705struct btrfs_fs_devices; 772struct btrfs_fs_devices;
706struct btrfs_fs_info { 773struct btrfs_fs_info {
@@ -758,6 +825,7 @@ struct btrfs_fs_info {
758 struct mutex drop_mutex; 825 struct mutex drop_mutex;
759 struct mutex volume_mutex; 826 struct mutex volume_mutex;
760 struct mutex tree_reloc_mutex; 827 struct mutex tree_reloc_mutex;
828 struct rw_semaphore extent_commit_sem;
761 829
762 /* 830 /*
763 * this protects the ordered operations list only while we are 831 * this protects the ordered operations list only while we are
@@ -831,18 +899,11 @@ struct btrfs_fs_info {
831 struct task_struct *cleaner_kthread; 899 struct task_struct *cleaner_kthread;
832 int thread_pool_size; 900 int thread_pool_size;
833 901
834 /* tree relocation relocated fields */
835 struct list_head dead_reloc_roots;
836 struct btrfs_leaf_ref_tree reloc_ref_tree;
837 struct btrfs_leaf_ref_tree shared_ref_tree;
838
839 struct kobject super_kobj; 902 struct kobject super_kobj;
840 struct completion kobj_unregister; 903 struct completion kobj_unregister;
841 int do_barriers; 904 int do_barriers;
842 int closing; 905 int closing;
843 int log_root_recovering; 906 int log_root_recovering;
844 atomic_t throttles;
845 atomic_t throttle_gen;
846 907
847 u64 total_pinned; 908 u64 total_pinned;
848 909
@@ -861,6 +922,8 @@ struct btrfs_fs_info {
861 */ 922 */
862 struct list_head space_info; 923 struct list_head space_info;
863 924
925 struct reloc_control *reloc_ctl;
926
864 spinlock_t delalloc_lock; 927 spinlock_t delalloc_lock;
865 spinlock_t new_trans_lock; 928 spinlock_t new_trans_lock;
866 u64 delalloc_bytes; 929 u64 delalloc_bytes;
@@ -891,7 +954,6 @@ struct btrfs_fs_info {
891 * in ram representation of the tree. extent_root is used for all allocations 954 * in ram representation of the tree. extent_root is used for all allocations
892 * and for the extent tree extent_root root. 955 * and for the extent tree extent_root root.
893 */ 956 */
894struct btrfs_dirty_root;
895struct btrfs_root { 957struct btrfs_root {
896 struct extent_buffer *node; 958 struct extent_buffer *node;
897 959
@@ -899,9 +961,6 @@ struct btrfs_root {
899 spinlock_t node_lock; 961 spinlock_t node_lock;
900 962
901 struct extent_buffer *commit_root; 963 struct extent_buffer *commit_root;
902 struct btrfs_leaf_ref_tree *ref_tree;
903 struct btrfs_leaf_ref_tree ref_tree_struct;
904 struct btrfs_dirty_root *dirty_root;
905 struct btrfs_root *log_root; 964 struct btrfs_root *log_root;
906 struct btrfs_root *reloc_root; 965 struct btrfs_root *reloc_root;
907 966
@@ -952,10 +1011,15 @@ struct btrfs_root {
952 /* the dirty list is only used by non-reference counted roots */ 1011 /* the dirty list is only used by non-reference counted roots */
953 struct list_head dirty_list; 1012 struct list_head dirty_list;
954 1013
1014 struct list_head root_list;
1015
955 spinlock_t list_lock; 1016 spinlock_t list_lock;
956 struct list_head dead_list;
957 struct list_head orphan_list; 1017 struct list_head orphan_list;
958 1018
1019 spinlock_t inode_lock;
1020 /* red-black tree that keeps track of in-memory inodes */
1021 struct rb_root inode_tree;
1022
959 /* 1023 /*
960 * right now this just gets used so that a root has its own devid 1024 * right now this just gets used so that a root has its own devid
961 * for stat. It may be used for more later 1025 * for stat. It may be used for more later
@@ -1017,7 +1081,16 @@ struct btrfs_root {
1017 * are used, and how many references there are to each block 1081 * are used, and how many references there are to each block
1018 */ 1082 */
1019#define BTRFS_EXTENT_ITEM_KEY 168 1083#define BTRFS_EXTENT_ITEM_KEY 168
1020#define BTRFS_EXTENT_REF_KEY 180 1084
1085#define BTRFS_TREE_BLOCK_REF_KEY 176
1086
1087#define BTRFS_EXTENT_DATA_REF_KEY 178
1088
1089#define BTRFS_EXTENT_REF_V0_KEY 180
1090
1091#define BTRFS_SHARED_BLOCK_REF_KEY 182
1092
1093#define BTRFS_SHARED_DATA_REF_KEY 184
1021 1094
1022/* 1095/*
1023 * block groups give us hints into the extent allocation trees. Which 1096 * block groups give us hints into the extent allocation trees. Which
@@ -1043,6 +1116,8 @@ struct btrfs_root {
1043#define BTRFS_MOUNT_COMPRESS (1 << 5) 1116#define BTRFS_MOUNT_COMPRESS (1 << 5)
1044#define BTRFS_MOUNT_NOTREELOG (1 << 6) 1117#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1045#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) 1118#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
1119#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
1120#define BTRFS_MOUNT_NOSSD (1 << 9)
1046 1121
1047#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1122#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1048#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1123#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1056,12 +1131,14 @@ struct btrfs_root {
1056#define BTRFS_INODE_READONLY (1 << 2) 1131#define BTRFS_INODE_READONLY (1 << 2)
1057#define BTRFS_INODE_NOCOMPRESS (1 << 3) 1132#define BTRFS_INODE_NOCOMPRESS (1 << 3)
1058#define BTRFS_INODE_PREALLOC (1 << 4) 1133#define BTRFS_INODE_PREALLOC (1 << 4)
1059#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ 1134#define BTRFS_INODE_SYNC (1 << 5)
1060 ~BTRFS_INODE_##flag) 1135#define BTRFS_INODE_IMMUTABLE (1 << 6)
1061#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ 1136#define BTRFS_INODE_APPEND (1 << 7)
1062 BTRFS_INODE_##flag) 1137#define BTRFS_INODE_NODUMP (1 << 8)
1063#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ 1138#define BTRFS_INODE_NOATIME (1 << 9)
1064 BTRFS_INODE_##flag) 1139#define BTRFS_INODE_DIRSYNC (1 << 10)
1140
1141
1065/* some macros to generate set/get funcs for the struct fields. This 1142/* some macros to generate set/get funcs for the struct fields. This
1066 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1143 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
1067 * one for u8: 1144 * one for u8:
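
With the btrfs_set_flag()/btrfs_clear_flag()/btrfs_test_flag() wrappers gone, callers are expected to use plain bit operations on the widened flag set. A sketch of the replacement pattern with two of the flag values from the hunk above (the flags word here is a stand-in for BTRFS_I(inode)->flags):

#include <stdint.h>

#define INODE_NOCOMPRESS	(1 << 3)
#define INODE_NOATIME		(1 << 9)

static inline void set_noatime(uint32_t *flags)		/* was btrfs_set_flag */
{
	*flags |= INODE_NOATIME;
}

static inline void clear_nocompress(uint32_t *flags)	/* was btrfs_clear_flag */
{
	*flags &= ~INODE_NOCOMPRESS;
}

static inline int noatime_set(uint32_t flags)		/* was btrfs_test_flag */
{
	return !!(flags & INODE_NOATIME);
}
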
@@ -1317,24 +1394,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1317 return (u8 *)((unsigned long)dev + ptr); 1394 return (u8 *)((unsigned long)dev + ptr);
1318} 1395}
1319 1396
1320/* struct btrfs_extent_ref */ 1397BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
1321BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); 1398BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
1322BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); 1399 generation, 64);
1323BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); 1400BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
1324BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); 1401
1402BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
1325 1403
1326BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1327BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1328 generation, 64);
1329BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1330 objectid, 64);
1331BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1332 num_refs, 32);
1333 1404
1334/* struct btrfs_extent_item */ 1405BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
1335BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); 1406
1336BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, 1407static inline void btrfs_tree_block_key(struct extent_buffer *eb,
1337 refs, 32); 1408 struct btrfs_tree_block_info *item,
1409 struct btrfs_disk_key *key)
1410{
1411 read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
1412}
1413
1414static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
1415 struct btrfs_tree_block_info *item,
1416 struct btrfs_disk_key *key)
1417{
1418 write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
1419}
1420
1421BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
1422 root, 64);
1423BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
1424 objectid, 64);
1425BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
1426 offset, 64);
1427BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
1428 count, 32);
1429
1430BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
1431 count, 32);
1432
1433BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
1434 type, 8);
1435BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
1436 offset, 64);
1437
1438static inline u32 btrfs_extent_inline_ref_size(int type)
1439{
1440 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1441 type == BTRFS_SHARED_BLOCK_REF_KEY)
1442 return sizeof(struct btrfs_extent_inline_ref);
1443 if (type == BTRFS_SHARED_DATA_REF_KEY)
1444 return sizeof(struct btrfs_shared_data_ref) +
1445 sizeof(struct btrfs_extent_inline_ref);
1446 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1447 return sizeof(struct btrfs_extent_data_ref) +
1448 offsetof(struct btrfs_extent_inline_ref, offset);
1449 BUG();
1450 return 0;
1451}
1452
1453BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
1454BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
1455 generation, 64);
1456BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
1457BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
1338 1458
1339/* struct btrfs_node */ 1459/* struct btrfs_node */
1340BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); 1460BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
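
btrfs_extent_inline_ref_size() above encodes a subtle layout rule: for EXTENT_DATA_REF the btrfs_extent_data_ref body replaces the inline ref's offset field (hence offsetof rather than sizeof), while the two block-ref types fit entirely inside the 9-byte inline ref. A host-side restatement with the expected byte counts (local packed structs, illustrative only):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct extent_inline_ref { uint8_t type; uint64_t offset; }
	__attribute__((__packed__));
struct extent_data_ref { uint64_t root, objectid, offset; uint32_t count; }
	__attribute__((__packed__));
struct shared_data_ref { uint32_t count; } __attribute__((__packed__));

enum { TREE_BLOCK_REF, SHARED_BLOCK_REF, EXTENT_DATA_REF, SHARED_DATA_REF };

static size_t inline_ref_size(int type)
{
	switch (type) {
	case TREE_BLOCK_REF:
	case SHARED_BLOCK_REF:
		return sizeof(struct extent_inline_ref);	   /* 9  */
	case SHARED_DATA_REF:
		return sizeof(struct shared_data_ref) +
		       sizeof(struct extent_inline_ref);	   /* 13 */
	case EXTENT_DATA_REF:
		return sizeof(struct extent_data_ref) +
		       offsetof(struct extent_inline_ref, offset); /* 29 */
	}
	return 0;
}

int main(void)
{
	assert(inline_ref_size(TREE_BLOCK_REF) == 9);
	assert(inline_ref_size(SHARED_DATA_REF) == 13);
	assert(inline_ref_size(EXTENT_DATA_REF) == 29);
	return 0;
}
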
@@ -1558,6 +1678,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1558 return (flags & flag) == flag; 1678 return (flags & flag) == flag;
1559} 1679}
1560 1680
1681static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
1682{
1683 u64 flags = btrfs_header_flags(eb);
1684 return flags >> BTRFS_BACKREF_REV_SHIFT;
1685}
1686
1687static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
1688 int rev)
1689{
1690 u64 flags = btrfs_header_flags(eb);
1691 flags &= ~BTRFS_BACKREF_REV_MASK;
1692 flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
1693 btrfs_set_header_flags(eb, flags);
1694}
1695
1561static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) 1696static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1562{ 1697{
1563 unsigned long ptr = offsetof(struct btrfs_header, fsid); 1698 unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1790,39 +1925,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
1790int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1925int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1791 struct btrfs_root *root, struct extent_buffer *leaf); 1926 struct btrfs_root *root, struct extent_buffer *leaf);
1792int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1927int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1793 struct btrfs_root *root, u64 objectid, u64 bytenr); 1928 struct btrfs_root *root,
1929 u64 objectid, u64 offset, u64 bytenr);
1794int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1930int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1795struct btrfs_block_group_cache *btrfs_lookup_block_group( 1931struct btrfs_block_group_cache *btrfs_lookup_block_group(
1796 struct btrfs_fs_info *info, 1932 struct btrfs_fs_info *info,
1797 u64 bytenr); 1933 u64 bytenr);
1934void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1798u64 btrfs_find_block_group(struct btrfs_root *root, 1935u64 btrfs_find_block_group(struct btrfs_root *root,
1799 u64 search_start, u64 search_hint, int owner); 1936 u64 search_start, u64 search_hint, int owner);
1800struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 1937struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1801 struct btrfs_root *root, 1938 struct btrfs_root *root, u32 blocksize,
1802 u32 blocksize, u64 parent, 1939 u64 parent, u64 root_objectid,
1803 u64 root_objectid, 1940 struct btrfs_disk_key *key, int level,
1804 u64 ref_generation, 1941 u64 hint, u64 empty_size);
1805 int level,
1806 u64 hint,
1807 u64 empty_size);
1808struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1942struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1809 struct btrfs_root *root, 1943 struct btrfs_root *root,
1810 u64 bytenr, u32 blocksize, 1944 u64 bytenr, u32 blocksize,
1811 int level); 1945 int level);
1812int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1946int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
1813 struct btrfs_root *root, 1947 struct btrfs_root *root,
1814 u64 num_bytes, u64 parent, u64 min_bytes, 1948 u64 root_objectid, u64 owner,
1815 u64 root_objectid, u64 ref_generation, 1949 u64 offset, struct btrfs_key *ins);
1816 u64 owner, u64 empty_size, u64 hint_byte, 1950int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
1817 u64 search_end, struct btrfs_key *ins, u64 data); 1951 struct btrfs_root *root,
1818int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 1952 u64 root_objectid, u64 owner, u64 offset,
1819 struct btrfs_root *root, u64 parent, 1953 struct btrfs_key *ins);
1820 u64 root_objectid, u64 ref_generation,
1821 u64 owner, struct btrfs_key *ins);
1822int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1823 struct btrfs_root *root, u64 parent,
1824 u64 root_objectid, u64 ref_generation,
1825 u64 owner, struct btrfs_key *ins);
1826int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 1954int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1827 struct btrfs_root *root, 1955 struct btrfs_root *root,
1828 u64 num_bytes, u64 min_alloc_size, 1956 u64 num_bytes, u64 min_alloc_size,
@@ -1830,18 +1958,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1830 u64 search_end, struct btrfs_key *ins, 1958 u64 search_end, struct btrfs_key *ins,
1831 u64 data); 1959 u64 data);
1832int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1960int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1833 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1961 struct extent_buffer *buf, int full_backref);
1834 u32 *nr_extents); 1962int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1835int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1963 struct extent_buffer *buf, int full_backref);
1836 struct extent_buffer *buf, u32 nr_extents); 1964int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1837int btrfs_update_ref(struct btrfs_trans_handle *trans, 1965 struct btrfs_root *root,
1838 struct btrfs_root *root, struct extent_buffer *orig_buf, 1966 u64 bytenr, u64 num_bytes, u64 flags,
1839 struct extent_buffer *buf, int start_slot, int nr); 1967 int is_data);
1840int btrfs_free_extent(struct btrfs_trans_handle *trans, 1968int btrfs_free_extent(struct btrfs_trans_handle *trans,
1841 struct btrfs_root *root, 1969 struct btrfs_root *root,
1842 u64 bytenr, u64 num_bytes, u64 parent, 1970 u64 bytenr, u64 num_bytes, u64 parent,
1843 u64 root_objectid, u64 ref_generation, 1971 u64 root_objectid, u64 owner, u64 offset);
1844 u64 owner_objectid, int pin); 1972
1845int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 1973int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1846int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 1974int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root, 1975 struct btrfs_root *root,
@@ -1849,13 +1977,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1849int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1977int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root, 1978 struct btrfs_root *root,
1851 u64 bytenr, u64 num_bytes, u64 parent, 1979 u64 bytenr, u64 num_bytes, u64 parent,
1852 u64 root_objectid, u64 ref_generation, 1980 u64 root_objectid, u64 owner, u64 offset);
1853 u64 owner_objectid); 1981
1854int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1855 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1856 u64 orig_parent, u64 parent,
1857 u64 root_objectid, u64 ref_generation,
1858 u64 owner_objectid);
1859int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 1982int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1860 struct btrfs_root *root); 1983 struct btrfs_root *root);
1861int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); 1984int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1867,16 +1990,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1867 u64 size); 1990 u64 size);
1868int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 1991int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root, u64 group_start); 1992 struct btrfs_root *root, u64 group_start);
1870int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 1993int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
1871int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 1994 struct btrfs_block_group_cache *group);
1872 struct btrfs_root *root); 1995
1873int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1874int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1875 struct btrfs_root *root,
1876 struct extent_buffer *buf, u64 orig_start);
1877int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1878int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1879int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1880u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1996u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1881void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 1997void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1882void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 1998void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1890,14 +2006,14 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1890 u64 bytes); 2006 u64 bytes);
1891void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2007void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1892 u64 bytes); 2008 u64 bytes);
2009void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
1893/* ctree.c */ 2010/* ctree.c */
2011int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2012 int level, int *slot);
2013int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
1894int btrfs_previous_item(struct btrfs_root *root, 2014int btrfs_previous_item(struct btrfs_root *root,
1895 struct btrfs_path *path, u64 min_objectid, 2015 struct btrfs_path *path, u64 min_objectid,
1896 int type); 2016 int type);
1897int btrfs_merge_path(struct btrfs_trans_handle *trans,
1898 struct btrfs_root *root,
1899 struct btrfs_key *node_keys,
1900 u64 *nodes, int lowest_level);
1901int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2017int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1902 struct btrfs_root *root, struct btrfs_path *path, 2018 struct btrfs_root *root, struct btrfs_path *path,
1903 struct btrfs_key *new_key); 2019 struct btrfs_key *new_key);
@@ -1918,6 +2034,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
1918 struct btrfs_root *root, 2034 struct btrfs_root *root,
1919 struct extent_buffer *buf, 2035 struct extent_buffer *buf,
1920 struct extent_buffer **cow_ret, u64 new_root_objectid); 2036 struct extent_buffer **cow_ret, u64 new_root_objectid);
2037int btrfs_block_can_be_shared(struct btrfs_root *root,
2038 struct extent_buffer *buf);
1921int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root 2039int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1922 *root, struct btrfs_path *path, u32 data_size); 2040 *root, struct btrfs_path *path, u32 data_size);
1923int btrfs_truncate_item(struct btrfs_trans_handle *trans, 2041int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1944,9 +2062,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1944 2062
1945int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2063int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1946 struct btrfs_path *path, int slot, int nr); 2064 struct btrfs_path *path, int slot, int nr);
1947int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1948 struct btrfs_root *root,
1949 struct btrfs_path *path, u64 bytenr);
1950static inline int btrfs_del_item(struct btrfs_trans_handle *trans, 2065static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1951 struct btrfs_root *root, 2066 struct btrfs_root *root,
1952 struct btrfs_path *path) 2067 struct btrfs_path *path)
@@ -1978,8 +2093,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1978int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2093int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1979int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2094int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1980int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2095int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1981int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 2096int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
1982 *root);
1983int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2097int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1984 struct btrfs_root *root, 2098 struct btrfs_root *root,
1985 struct extent_buffer *node, 2099 struct extent_buffer *node,
@@ -2005,8 +2119,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2005 btrfs_root_item *item, struct btrfs_key *key); 2119 btrfs_root_item *item, struct btrfs_key *key);
2006int btrfs_search_root(struct btrfs_root *root, u64 search_start, 2120int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2007 u64 *found_objectid); 2121 u64 *found_objectid);
2008int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 2122int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2009 struct btrfs_root *latest_root); 2123int btrfs_set_root_node(struct btrfs_root_item *item,
2124 struct extent_buffer *node);
2010/* dir-item.c */ 2125/* dir-item.c */
2011int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2126int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2012 struct btrfs_root *root, const char *name, 2127 struct btrfs_root *root, const char *name,
@@ -2139,7 +2254,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2139int btrfs_readpage(struct file *file, struct page *page); 2254int btrfs_readpage(struct file *file, struct page *page);
2140void btrfs_delete_inode(struct inode *inode); 2255void btrfs_delete_inode(struct inode *inode);
2141void btrfs_put_inode(struct inode *inode); 2256void btrfs_put_inode(struct inode *inode);
2142void btrfs_read_locked_inode(struct inode *inode);
2143int btrfs_write_inode(struct inode *inode, int wait); 2257int btrfs_write_inode(struct inode *inode, int wait);
2144void btrfs_dirty_inode(struct inode *inode); 2258void btrfs_dirty_inode(struct inode *inode);
2145struct inode *btrfs_alloc_inode(struct super_block *sb); 2259struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2147,12 +2261,8 @@ void btrfs_destroy_inode(struct inode *inode);
2147int btrfs_init_cachep(void); 2261int btrfs_init_cachep(void);
2148void btrfs_destroy_cachep(void); 2262void btrfs_destroy_cachep(void);
2149long btrfs_ioctl_trans_end(struct file *file); 2263long btrfs_ioctl_trans_end(struct file *file);
2150struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2151 struct btrfs_root *root, int wait);
2152struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2153 struct btrfs_root *root);
2154struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2264struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2155 struct btrfs_root *root, int *is_new); 2265 struct btrfs_root *root);
2156int btrfs_commit_write(struct file *file, struct page *page, 2266int btrfs_commit_write(struct file *file, struct page *page,
2157 unsigned from, unsigned to); 2267 unsigned from, unsigned to);
2158struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2268struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2168,6 +2278,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);
2168 2278
2169/* ioctl.c */ 2279/* ioctl.c */
2170long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2280long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2281void btrfs_update_iflags(struct inode *inode);
2282void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2171 2283
2172/* file.c */ 2284/* file.c */
2173int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2285int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
@@ -2205,8 +2317,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2205int btrfs_sync_fs(struct super_block *sb, int wait); 2317int btrfs_sync_fs(struct super_block *sb, int wait);
2206 2318
2207/* acl.c */ 2319/* acl.c */
2320#ifdef CONFIG_FS_POSIX_ACL
2208int btrfs_check_acl(struct inode *inode, int mask); 2321int btrfs_check_acl(struct inode *inode, int mask);
2322#else
2323#define btrfs_check_acl NULL
2324#endif
2209int btrfs_init_acl(struct inode *inode, struct inode *dir); 2325int btrfs_init_acl(struct inode *inode, struct inode *dir);
2210int btrfs_acl_chmod(struct inode *inode); 2326int btrfs_acl_chmod(struct inode *inode);
2211 2327
2328/* relocation.c */
2329int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
2330int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
2331 struct btrfs_root *root);
2332int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2333 struct btrfs_root *root);
2334int btrfs_recover_relocation(struct btrfs_root *root);
2335int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2212#endif 2336#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index d6c01c096a40..84e6781413b1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -29,27 +29,87 @@
29 * add extents in the middle of btrfs_search_slot, and it allows 29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead 30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree. 31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */ 32 */
37 33
38/* 34/*
39 * entries in the rb tree are ordered by the byte number of the extent 35 * compare two delayed tree backrefs with same bytenr and type
40 * and by the byte number of the parent block. 36 */
37static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
38 struct btrfs_delayed_tree_ref *ref1)
39{
40 if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
41 if (ref1->root < ref2->root)
42 return -1;
43 if (ref1->root > ref2->root)
44 return 1;
45 } else {
46 if (ref1->parent < ref2->parent)
47 return -1;
48 if (ref1->parent > ref2->parent)
49 return 1;
50 }
51 return 0;
52}
53
54/*
55 * compare two delayed data backrefs with same bytenr and type
41 */ 56 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref, 57static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
43 u64 bytenr, u64 parent) 58 struct btrfs_delayed_data_ref *ref1)
44{ 59{
45 if (bytenr < ref->bytenr) 60 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
61 if (ref1->root < ref2->root)
62 return -1;
63 if (ref1->root > ref2->root)
64 return 1;
65 if (ref1->objectid < ref2->objectid)
66 return -1;
67 if (ref1->objectid > ref2->objectid)
68 return 1;
69 if (ref1->offset < ref2->offset)
70 return -1;
71 if (ref1->offset > ref2->offset)
72 return 1;
73 } else {
74 if (ref1->parent < ref2->parent)
75 return -1;
76 if (ref1->parent > ref2->parent)
77 return 1;
78 }
79 return 0;
80}
81
82/*
83 * entries in the rb tree are ordered by the byte number of the extent,
84 * the type of the delayed backref and the content of the backref.
85 */
86static int comp_entry(struct btrfs_delayed_ref_node *ref2,
87 struct btrfs_delayed_ref_node *ref1)
88{
89 if (ref1->bytenr < ref2->bytenr)
46 return -1; 90 return -1;
47 if (bytenr > ref->bytenr) 91 if (ref1->bytenr > ref2->bytenr)
48 return 1; 92 return 1;
49 if (parent < ref->parent) 93 if (ref1->is_head && ref2->is_head)
94 return 0;
95 if (ref2->is_head)
50 return -1; 96 return -1;
51 if (parent > ref->parent) 97 if (ref1->is_head)
52 return 1; 98 return 1;
99 if (ref1->type < ref2->type)
100 return -1;
101 if (ref1->type > ref2->type)
102 return 1;
103 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
104 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
105 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
106 btrfs_delayed_node_to_tree_ref(ref1));
107 } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
108 ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
109 return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
110 btrfs_delayed_node_to_data_ref(ref1));
111 }
112 BUG();
53 return 0; 113 return 0;
54} 114}
55 115
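
The rewritten comp_entry() yields a total order in which, for a given bytenr, the head node sorts after every plain ref node; that is what lets find_ref_head() further down walk right past same-bytenr refs. A simplified, self-contained demonstration (toy struct; 176 and 178 are the TREE_BLOCK_REF/EXTENT_DATA_REF key values from ctree.h earlier in this diff):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_ref { uint64_t bytenr; int is_head; int type; };

/* Same precedence as comp_entry(): bytenr, then head-ness (head sorts
 * last), then type; per-type content comparison is omitted here. */
static int toy_cmp(const void *a, const void *b)
{
	const struct toy_ref *r1 = a, *r2 = b;

	if (r1->bytenr != r2->bytenr)
		return r1->bytenr < r2->bytenr ? -1 : 1;
	if (r1->is_head != r2->is_head)
		return r1->is_head ? 1 : -1;
	return r1->type - r2->type;
}

int main(void)
{
	struct toy_ref refs[] = {
		{ 4096, 1, 0 },		/* head */
		{ 4096, 0, 178 },	/* extent data ref */
		{ 4096, 0, 176 },	/* tree block ref */
	};

	qsort(refs, 3, sizeof(refs[0]), toy_cmp);
	assert(refs[2].is_head);	/* head lands after its refs */
	return 0;
}
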
@@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref,
  * inserted.
  */
 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
-                          u64 bytenr, u64 parent,
                           struct rb_node *node)
 {
     struct rb_node **p = &root->rb_node;
     struct rb_node *parent_node = NULL;
     struct btrfs_delayed_ref_node *entry;
+    struct btrfs_delayed_ref_node *ins;
     int cmp;
 
+    ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
     while (*p) {
         parent_node = *p;
         entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
                  rb_node);
 
-        cmp = comp_entry(entry, bytenr, parent);
+        cmp = comp_entry(entry, ins);
         if (cmp < 0)
             p = &(*p)->rb_left;
         else if (cmp > 0)
@@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
             return entry;
     }
 
-    entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
     rb_link_node(node, parent_node, p);
     rb_insert_color(node, root);
     return NULL;
 }
 
 /*
- * find an entry based on (bytenr,parent). This returns the delayed
- * ref if it was able to find one, or NULL if nothing was in that spot
+ * find a head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot
  */
-static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-                  u64 bytenr, u64 parent,
+static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
+                  u64 bytenr,
                   struct btrfs_delayed_ref_node **last)
 {
     struct rb_node *n = root->rb_node;
@@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
         if (last)
             *last = entry;
 
-        cmp = comp_entry(entry, bytenr, parent);
+        if (bytenr < entry->bytenr)
+            cmp = -1;
+        else if (bytenr > entry->bytenr)
+            cmp = 1;
+        else if (!btrfs_delayed_ref_is_head(entry))
+            cmp = 1;
+        else
+            cmp = 0;
+
         if (cmp < 0)
             n = n->rb_left;
         else if (cmp > 0)
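The search above is deliberately biased: when the bytenr matches but the entry is not a head, the comparison is forced to 1 so the walk keeps descending right until it reaches the head, which the new ordering places last among entries sharing a bytenr. The *last out-parameter records the final node visited, so callers such as btrfs_find_ref_cluster get a usable cursor even on a miss. A flattened, hypothetical model of the same idea over a sorted array:

    #include <stdio.h>

    /* model: entries sorted by key, with the "head" last among equals */
    struct ent { int key; int is_head; };

    static const struct ent *find_head(const struct ent *a, int n, int key,
                                       const struct ent **last)
    {
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
            int mid = (lo + hi) / 2;
            int cmp;

            if (last)
                *last = &a[mid];
            if (key < a[mid].key)
                cmp = -1;
            else if (key > a[mid].key)
                cmp = 1;
            else if (!a[mid].is_head)
                cmp = 1;   /* same key, not the head: keep searching right */
            else
                cmp = 0;

            if (cmp < 0)
                hi = mid - 1;
            else if (cmp > 0)
                lo = mid + 1;
            else
                return &a[mid];
        }
        return NULL;   /* *last is still a useful cursor on a miss */
    }

    int main(void)
    {
        struct ent a[] = { {1, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 1} };
        const struct ent *last = NULL;
        const struct ent *h = find_head(a, 5, 2, &last);

        printf("found head: %d\n", h != NULL && h->is_head);
        return 0;
    }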
@@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
         node = rb_first(&delayed_refs->root);
     } else {
         ref = NULL;
-        tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+        find_ref_head(&delayed_refs->root, start, &ref);
         if (ref) {
             struct btrfs_delayed_ref_node *tmp;
 
@@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
     delayed_refs = &trans->transaction->delayed_refs;
     spin_lock(&delayed_refs->lock);
 
-    ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
     if (ref) {
         prev_node = rb_prev(&ref->rb_node);
         if (!prev_node)
@@ -250,25 +318,28 @@ out:
 }
 
 /*
- * helper function to lookup reference count
+ * helper function to lookup reference count and flags of extent.
  *
  * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. This way you
- * can check to see what the reference count would be if all of the
- * delayed refs are processed.
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be once all of
+ * the delayed refs are processed.
  */
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                 struct btrfs_root *root, u64 bytenr,
-                u64 num_bytes, u32 *refs)
+                u64 num_bytes, u64 *refs, u64 *flags)
 {
     struct btrfs_delayed_ref_node *ref;
     struct btrfs_delayed_ref_head *head;
     struct btrfs_delayed_ref_root *delayed_refs;
     struct btrfs_path *path;
-    struct extent_buffer *leaf;
     struct btrfs_extent_item *ei;
+    struct extent_buffer *leaf;
     struct btrfs_key key;
-    u32 num_refs;
+    u32 item_size;
+    u64 num_refs;
+    u64 extent_flags;
     int ret;
 
     path = btrfs_alloc_path();
@@ -287,37 +358,60 @@ again:
 
     if (ret == 0) {
         leaf = path->nodes[0];
-        ei = btrfs_item_ptr(leaf, path->slots[0],
-                    struct btrfs_extent_item);
-        num_refs = btrfs_extent_refs(leaf, ei);
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (item_size >= sizeof(*ei)) {
+            ei = btrfs_item_ptr(leaf, path->slots[0],
+                        struct btrfs_extent_item);
+            num_refs = btrfs_extent_refs(leaf, ei);
+            extent_flags = btrfs_extent_flags(leaf, ei);
+        } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+            struct btrfs_extent_item_v0 *ei0;
+            BUG_ON(item_size != sizeof(*ei0));
+            ei0 = btrfs_item_ptr(leaf, path->slots[0],
+                         struct btrfs_extent_item_v0);
+            num_refs = btrfs_extent_refs_v0(leaf, ei0);
+            /* FIXME: this isn't correct for data */
+            extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+            BUG();
+#endif
+        }
+        BUG_ON(num_refs == 0);
     } else {
         num_refs = 0;
+        extent_flags = 0;
         ret = 0;
     }
 
     spin_lock(&delayed_refs->lock);
-    ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
     if (ref) {
         head = btrfs_delayed_node_to_head(ref);
-        if (mutex_trylock(&head->mutex)) {
-            num_refs += ref->ref_mod;
-            mutex_unlock(&head->mutex);
-            *refs = num_refs;
-            goto out;
-        }
+        if (!mutex_trylock(&head->mutex)) {
+            atomic_inc(&ref->refs);
+            spin_unlock(&delayed_refs->lock);
 
-        atomic_inc(&ref->refs);
-        spin_unlock(&delayed_refs->lock);
+            btrfs_release_path(root->fs_info->extent_root, path);
 
-        btrfs_release_path(root->fs_info->extent_root, path);
+            mutex_lock(&head->mutex);
+            mutex_unlock(&head->mutex);
+            btrfs_put_delayed_ref(ref);
+            goto again;
+        }
+        if (head->extent_op && head->extent_op->update_flags)
+            extent_flags |= head->extent_op->flags_to_set;
+        else
+            BUG_ON(num_refs == 0);
 
-        mutex_lock(&head->mutex);
+        num_refs += ref->ref_mod;
         mutex_unlock(&head->mutex);
-        btrfs_put_delayed_ref(ref);
-        goto again;
-    } else {
-        *refs = num_refs;
     }
+    WARN_ON(num_refs == 0);
+    if (refs)
+        *refs = num_refs;
+    if (flags)
+        *flags = extent_flags;
 out:
     spin_unlock(&delayed_refs->lock);
     btrfs_free_path(path);
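The rewritten lookup inverts the old trylock logic: if the head's mutex cannot be taken while the delayed-refs spinlock is held, the code pins the head with a reference count, drops the spinlock, blocks on the mutex purely to wait out whoever is processing the head, and then retries from scratch. A userspace sketch of that trylock-or-retry idiom (simplified: the reference-count pinning is omitted, and a plain mutex stands in for the kernel spinlock):

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t mutex;  /* held while the object is being processed */
        int value;
    };

    static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;

    static int read_value(struct obj *o)
    {
        int v;

    again:
        pthread_mutex_lock(&index_lock);
        if (pthread_mutex_trylock(&o->mutex) != 0) {
            /* someone is processing it: drop the index lock ... */
            pthread_mutex_unlock(&index_lock);
            /* ... wait for them to finish ... */
            pthread_mutex_lock(&o->mutex);
            pthread_mutex_unlock(&o->mutex);
            /* ... and retry from the top */
            goto again;
        }
        v = o->value;
        pthread_mutex_unlock(&o->mutex);
        pthread_mutex_unlock(&index_lock);
        return v;
    }

    int main(void)
    {
        struct obj o = { PTHREAD_MUTEX_INITIALIZER, 42 };
        printf("%d\n", read_value(&o));
        return 0;
    }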
@@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
                 struct btrfs_delayed_ref_node *existing,
                 struct btrfs_delayed_ref_node *update)
 {
-    struct btrfs_delayed_ref *existing_ref;
-    struct btrfs_delayed_ref *ref;
-
-    existing_ref = btrfs_delayed_node_to_ref(existing);
-    ref = btrfs_delayed_node_to_ref(update);
-
-    if (ref->pin)
-        existing_ref->pin = 1;
-
-    if (ref->action != existing_ref->action) {
+    if (update->action != existing->action) {
         /*
          * this is effectively undoing either an add or a
          * drop. We decrement the ref_mod, and if it goes
@@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
             delayed_refs->num_entries--;
             if (trans->delayed_ref_updates)
                 trans->delayed_ref_updates--;
+        } else {
+            WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+                existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
         }
     } else {
-        if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
-            /* if we're adding refs, make sure all the
-             * details match up. The extent could
-             * have been totally freed and reallocated
-             * by a different owner before the delayed
-             * ref entries were removed.
-             */
-            existing_ref->owner_objectid = ref->owner_objectid;
-            existing_ref->generation = ref->generation;
-            existing_ref->root = ref->root;
-            existing->num_bytes = update->num_bytes;
-        }
+        WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+            existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
         /*
          * the action on the existing ref matches
          * the action on the ref we're trying to add.
@@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
     existing_ref = btrfs_delayed_node_to_head(existing);
     ref = btrfs_delayed_node_to_head(update);
+    BUG_ON(existing_ref->is_data != ref->is_data);
 
     if (ref->must_insert_reserved) {
         /* if the extent was freed and then
@@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
     }
 
+    if (ref->extent_op) {
+        if (!existing_ref->extent_op) {
+            existing_ref->extent_op = ref->extent_op;
+        } else {
+            if (ref->extent_op->update_key) {
+                memcpy(&existing_ref->extent_op->key,
+                       &ref->extent_op->key,
+                       sizeof(ref->extent_op->key));
+                existing_ref->extent_op->update_key = 1;
+            }
+            if (ref->extent_op->update_flags) {
+                existing_ref->extent_op->flags_to_set |=
+                    ref->extent_op->flags_to_set;
+                existing_ref->extent_op->update_flags = 1;
+            }
+            kfree(ref->extent_op);
+        }
+    }
     /*
      * update the reference mod on the head to reflect this new operation
      */
@@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 }
 
 /*
- * helper function to actually insert a delayed ref into the rbtree.
+ * helper function to actually insert a head node into the rbtree.
  * this does all the dirty work in terms of maintaining the correct
- * overall modification count in the head node and properly dealing
- * with updating existing nodes as new modifications are queued.
+ * overall modification count.
  */
-static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
             struct btrfs_delayed_ref_node *ref,
-            u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-            u64 ref_generation, u64 owner_objectid, int action,
-            int pin)
+            u64 bytenr, u64 num_bytes,
+            int action, int is_data)
 {
     struct btrfs_delayed_ref_node *existing;
-    struct btrfs_delayed_ref *full_ref;
     struct btrfs_delayed_ref_head *head_ref = NULL;
     struct btrfs_delayed_ref_root *delayed_refs;
     int count_mod = 1;
@@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
      * the head node stores the sum of all the mods, so dropping a ref
      * should drop the sum in the head node by one.
      */
-    if (parent == (u64)-1) {
-        if (action == BTRFS_DROP_DELAYED_REF)
-            count_mod = -1;
-        else if (action == BTRFS_UPDATE_DELAYED_HEAD)
-            count_mod = 0;
-    }
+    if (action == BTRFS_UPDATE_DELAYED_HEAD)
+        count_mod = 0;
+    else if (action == BTRFS_DROP_DELAYED_REF)
+        count_mod = -1;
 
     /*
      * BTRFS_ADD_DELAYED_EXTENT means that we need to update
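With the old parent == (u64)-1 sentinel gone, the action alone now selects the head's contribution to the running ref_mod sum. A small worked example of that accounting, using local stand-in enum values rather than the kernel constants:

    #include <stdio.h>

    enum { ADD_DELAYED_REF, DROP_DELAYED_REF,
           ADD_DELAYED_EXTENT, UPDATE_DELAYED_HEAD };

    /* contribution of one queued operation to the head's ref_mod sum */
    static int count_mod(int action)
    {
        if (action == UPDATE_DELAYED_HEAD)
            return 0;   /* flag/key updates don't change the count */
        if (action == DROP_DELAYED_REF)
            return -1;
        return 1;       /* ADD_DELAYED_REF and ADD_DELAYED_EXTENT */
    }

    int main(void)
    {
        int actions[] = { ADD_DELAYED_EXTENT, ADD_DELAYED_REF,
                          UPDATE_DELAYED_HEAD, DROP_DELAYED_REF };
        int i, sum = 0;

        for (i = 0; i < 4; i++)
            sum += count_mod(actions[i]);
        printf("pending ref_mod for this extent: %d\n", sum);  /* 1 */
        return 0;
    }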
@@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
      * Once we record must_insert_reserved, switch the action to
      * BTRFS_ADD_DELAYED_REF because other special casing is not required.
      */
-    if (action == BTRFS_ADD_DELAYED_EXTENT) {
+    if (action == BTRFS_ADD_DELAYED_EXTENT)
         must_insert_reserved = 1;
-        action = BTRFS_ADD_DELAYED_REF;
-    } else {
+    else
         must_insert_reserved = 0;
-    }
-
 
     delayed_refs = &trans->transaction->delayed_refs;
 
     /* first set the basic ref node struct up */
     atomic_set(&ref->refs, 1);
     ref->bytenr = bytenr;
-    ref->parent = parent;
+    ref->num_bytes = num_bytes;
     ref->ref_mod = count_mod;
+    ref->type = 0;
+    ref->action = 0;
+    ref->is_head = 1;
     ref->in_tree = 1;
+
+    head_ref = btrfs_delayed_node_to_head(ref);
+    head_ref->must_insert_reserved = must_insert_reserved;
+    head_ref->is_data = is_data;
+
+    INIT_LIST_HEAD(&head_ref->cluster);
+    mutex_init(&head_ref->mutex);
+
+    existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+    if (existing) {
+        update_existing_head_ref(existing, ref);
+        /*
+         * we've updated the existing ref, free the newly
+         * allocated ref
+         */
+        kfree(ref);
+    } else {
+        delayed_refs->num_heads++;
+        delayed_refs->num_heads_ready++;
+        delayed_refs->num_entries++;
+        trans->delayed_ref_updates++;
+    }
+    return 0;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+             struct btrfs_delayed_ref_node *ref,
+             u64 bytenr, u64 num_bytes, u64 parent,
+             u64 ref_root, int level, int action)
+{
+    struct btrfs_delayed_ref_node *existing;
+    struct btrfs_delayed_tree_ref *full_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+
+    if (action == BTRFS_ADD_DELAYED_EXTENT)
+        action = BTRFS_ADD_DELAYED_REF;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+
+    /* first set the basic ref node struct up */
+    atomic_set(&ref->refs, 1);
+    ref->bytenr = bytenr;
     ref->num_bytes = num_bytes;
+    ref->ref_mod = 1;
+    ref->action = action;
+    ref->is_head = 0;
+    ref->in_tree = 1;
 
-    if (btrfs_delayed_ref_is_head(ref)) {
-        head_ref = btrfs_delayed_node_to_head(ref);
-        head_ref->must_insert_reserved = must_insert_reserved;
-        INIT_LIST_HEAD(&head_ref->cluster);
-        mutex_init(&head_ref->mutex);
+    full_ref = btrfs_delayed_node_to_tree_ref(ref);
+    if (parent) {
+        full_ref->parent = parent;
+        ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
     } else {
-        full_ref = btrfs_delayed_node_to_ref(ref);
         full_ref->root = ref_root;
-        full_ref->generation = ref_generation;
-        full_ref->owner_objectid = owner_objectid;
-        full_ref->pin = pin;
-        full_ref->action = action;
+        ref->type = BTRFS_TREE_BLOCK_REF_KEY;
     }
+    full_ref->level = level;
 
-    existing = tree_insert(&delayed_refs->root, bytenr,
-                   parent, &ref->rb_node);
+    existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
     if (existing) {
-        if (btrfs_delayed_ref_is_head(ref))
-            update_existing_head_ref(existing, ref);
-        else
-            update_existing_ref(trans, delayed_refs, existing, ref);
+        update_existing_ref(trans, delayed_refs, existing, ref);
+        /*
+         * we've updated the existing ref, free the newly
+         * allocated ref
+         */
+        kfree(ref);
+    } else {
+        delayed_refs->num_entries++;
+        trans->delayed_ref_updates++;
+    }
+    return 0;
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+             struct btrfs_delayed_ref_node *ref,
+             u64 bytenr, u64 num_bytes, u64 parent,
+             u64 ref_root, u64 owner, u64 offset,
+             int action)
+{
+    struct btrfs_delayed_ref_node *existing;
+    struct btrfs_delayed_data_ref *full_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+
+    if (action == BTRFS_ADD_DELAYED_EXTENT)
+        action = BTRFS_ADD_DELAYED_REF;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+
+    /* first set the basic ref node struct up */
+    atomic_set(&ref->refs, 1);
+    ref->bytenr = bytenr;
+    ref->num_bytes = num_bytes;
+    ref->ref_mod = 1;
+    ref->action = action;
+    ref->is_head = 0;
+    ref->in_tree = 1;
+
+    full_ref = btrfs_delayed_node_to_data_ref(ref);
+    if (parent) {
+        full_ref->parent = parent;
+        ref->type = BTRFS_SHARED_DATA_REF_KEY;
+    } else {
+        full_ref->root = ref_root;
+        ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+    }
+    full_ref->objectid = owner;
+    full_ref->offset = offset;
 
+    existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+    if (existing) {
+        update_existing_ref(trans, delayed_refs, existing, ref);
         /*
          * we've updated the existing ref, free the newly
          * allocated ref
          */
         kfree(ref);
     } else {
-        if (btrfs_delayed_ref_is_head(ref)) {
-            delayed_refs->num_heads++;
-            delayed_refs->num_heads_ready++;
-        }
         delayed_refs->num_entries++;
         trans->delayed_ref_updates++;
     }
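Both insert helpers share one convention: a non-zero parent selects the "shared" backref flavor keyed by the parent block, otherwise the ref is keyed by the owning root. A minimal sketch of that type selection, with simplified local types and enum values standing in for the kernel's key constants:

    #include <stdint.h>
    #include <stdio.h>

    enum ref_type { TREE_BLOCK_REF, SHARED_BLOCK_REF };

    struct tree_ref {
        enum ref_type type;
        union {         /* only one of the two is meaningful, per type */
            uint64_t root;
            uint64_t parent;
        };
    };

    static void init_tree_ref(struct tree_ref *r, uint64_t parent, uint64_t root)
    {
        if (parent) {
            r->parent = parent;
            r->type = SHARED_BLOCK_REF;
        } else {
            r->root = root;
            r->type = TREE_BLOCK_REF;
        }
    }

    int main(void)
    {
        struct tree_ref a, b;

        init_tree_ref(&a, 8192, 0);  /* shared: keyed by parent block */
        init_tree_ref(&b, 0, 5);     /* keyed by owning root */
        printf("%d %d\n", a.type == SHARED_BLOCK_REF, b.type == TREE_BLOCK_REF);
        return 0;
    }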
@@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 }
 
 /*
- * add a delayed ref to the tree. This does all of the accounting required
+ * add a delayed tree ref. This does all of the accounting required
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-        u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-        u64 ref_generation, u64 owner_objectid, int action,
-        int pin)
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+               u64 bytenr, u64 num_bytes, u64 parent,
+               u64 ref_root, int level, int action,
+               struct btrfs_delayed_extent_op *extent_op)
 {
-    struct btrfs_delayed_ref *ref;
+    struct btrfs_delayed_tree_ref *ref;
     struct btrfs_delayed_ref_head *head_ref;
     struct btrfs_delayed_ref_root *delayed_refs;
     int ret;
 
+    BUG_ON(extent_op && extent_op->is_data);
     ref = kmalloc(sizeof(*ref), GFP_NOFS);
     if (!ref)
         return -ENOMEM;
 
+    head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+    if (!head_ref) {
+        kfree(ref);
+        return -ENOMEM;
+    }
+
+    head_ref->extent_op = extent_op;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+    spin_lock(&delayed_refs->lock);
+
     /*
-     * the parent = 0 case comes from cases where we don't actually
-     * know the parent yet. It will get updated later via a add/drop
-     * pair.
+     * insert both the head node and the new ref without dropping
+     * the spin lock
      */
-    if (parent == 0)
-        parent = bytenr;
+    ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+                   action, 0);
+    BUG_ON(ret);
+
+    ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
+                   parent, ref_root, level, action);
+    BUG_ON(ret);
+    spin_unlock(&delayed_refs->lock);
+    return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+               u64 bytenr, u64 num_bytes,
+               u64 parent, u64 ref_root,
+               u64 owner, u64 offset, int action,
+               struct btrfs_delayed_extent_op *extent_op)
+{
+    struct btrfs_delayed_data_ref *ref;
+    struct btrfs_delayed_ref_head *head_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+    int ret;
+
+    BUG_ON(extent_op && !extent_op->is_data);
+    ref = kmalloc(sizeof(*ref), GFP_NOFS);
+    if (!ref)
+        return -ENOMEM;
 
     head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
     if (!head_ref) {
         kfree(ref);
         return -ENOMEM;
     }
+
+    head_ref->extent_op = extent_op;
+
     delayed_refs = &trans->transaction->delayed_refs;
     spin_lock(&delayed_refs->lock);
 
@@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
      * insert both the head node and the new ref without dropping
      * the spin lock
      */
-    ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
-                      (u64)-1, 0, 0, 0, action, pin);
+    ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+                   action, 1);
     BUG_ON(ret);
 
-    ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
-                      parent, ref_root, ref_generation,
-                      owner_objectid, action, pin);
+    ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
+                   parent, ref_root, owner, offset, action);
+    BUG_ON(ret);
+    spin_unlock(&delayed_refs->lock);
+    return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                u64 bytenr, u64 num_bytes,
+                struct btrfs_delayed_extent_op *extent_op)
+{
+    struct btrfs_delayed_ref_head *head_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+    int ret;
+
+    head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+    if (!head_ref)
+        return -ENOMEM;
+
+    head_ref->extent_op = extent_op;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+    spin_lock(&delayed_refs->lock);
+
+    ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+                   extent_op->is_data);
     BUG_ON(ret);
 
     spin_unlock(&delayed_refs->lock);
     return 0;
 }
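As a usage illustration, a hypothetical caller fragment (not code from this patch; the surrounding declarations of trans, bytenr, num_bytes and ret are assumed) that queues a flags-only update through the new entry point:

    /* hypothetical fragment: queue a flags-only update for an extent */
    struct btrfs_delayed_extent_op *op;

    op = kmalloc(sizeof(*op), GFP_NOFS);
    if (!op)
        return -ENOMEM;
    op->update_flags = 1;           /* only the flags are touched */
    op->update_key = 0;
    op->is_data = 0;                /* this extent holds metadata */
    op->flags_to_set = BTRFS_BLOCK_FLAG_FULL_BACKREF;

    ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, op);

Ownership of op passes to the delayed-ref code on success; if a head already exists for the extent, update_existing_head_ref() merges the requested key and flag updates and frees the duplicate.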
@@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
     struct btrfs_delayed_ref_root *delayed_refs;
 
     delayed_refs = &trans->transaction->delayed_refs;
-    ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
     if (ref)
         return btrfs_delayed_node_to_head(ref);
     return NULL;
@@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
  *
  * It is the same as doing a ref add and delete in two separate calls.
  */
+#if 0
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
               u64 bytenr, u64 num_bytes, u64 orig_parent,
               u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
     spin_unlock(&delayed_refs->lock);
     return 0;
 }
+#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 3bec2ff0b15c..f6fc67ddad36 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node {
     /* the starting bytenr of the extent */
     u64 bytenr;
 
-    /* the parent our backref will point to */
-    u64 parent;
-
     /* the size of the extent */
     u64 num_bytes;
 
@@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node {
      */
     int ref_mod;
 
+    unsigned int action:8;
+    unsigned int type:8;
     /* is this node still in the rbtree? */
+    unsigned int is_head:1;
     unsigned int in_tree:1;
 };
 
+struct btrfs_delayed_extent_op {
+    struct btrfs_disk_key key;
+    u64 flags_to_set;
+    unsigned int update_key:1;
+    unsigned int update_flags:1;
+    unsigned int is_data:1;
+};
+
 /*
  * the head refs are used to hold a lock on a given extent, which allows us
  * to make sure that only one process is running the delayed refs
@@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head {
 
     struct list_head cluster;
 
+    struct btrfs_delayed_extent_op *extent_op;
     /*
      * when a new extent is allocated, it is just reserved in memory
      * The actual extent isn't inserted into the extent allocation tree
@@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head {
      * the free has happened.
      */
     unsigned int must_insert_reserved:1;
+    unsigned int is_data:1;
 };
 
-struct btrfs_delayed_ref {
+struct btrfs_delayed_tree_ref {
     struct btrfs_delayed_ref_node node;
+    union {
+        u64 root;
+        u64 parent;
+    };
+    int level;
+};
 
-    /* the root objectid our ref will point to */
-    u64 root;
-
-    /* the generation for the backref */
-    u64 generation;
-
-    /* owner_objectid of the backref */
-    u64 owner_objectid;
-
-    /* operation done by this entry in the rbtree */
-    u8 action;
-
-    /* if pin == 1, when the extent is freed it will be pinned until
-     * transaction commit
-     */
-    unsigned int pin:1;
+struct btrfs_delayed_data_ref {
+    struct btrfs_delayed_ref_node node;
+    union {
+        u64 root;
+        u64 parent;
+    };
+    u64 objectid;
+    u64 offset;
 };
 
 struct btrfs_delayed_ref_root {
@@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
     }
 }
 
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-              u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-              u64 ref_generation, u64 owner_objectid, int action,
-              int pin);
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+                   u64 bytenr, u64 num_bytes, u64 parent,
+                   u64 ref_root, int level, int action,
+                   struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+                   u64 bytenr, u64 num_bytes,
+                   u64 parent, u64 ref_root,
+                   u64 owner, u64 offset, int action,
+                   struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                u64 bytenr, u64 num_bytes,
+                struct btrfs_delayed_extent_op *extent_op);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                 struct btrfs_root *root, u64 bytenr,
-                u64 num_bytes, u32 *refs);
+                u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                  u64 bytenr, u64 num_bytes, u64 orig_parent,
                  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
  */
 static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
 {
-    return node->parent == (u64)-1;
+    return node->is_head;
 }
 
 /*
  * helper functions to cast a node into its container
  */
-static inline struct btrfs_delayed_ref *
-btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
 {
     WARN_ON(btrfs_delayed_ref_is_head(node));
-    return container_of(node, struct btrfs_delayed_ref, node);
+    return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
 
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+    WARN_ON(btrfs_delayed_ref_is_head(node));
+    return container_of(node, struct btrfs_delayed_data_ref, node);
 }
 
 static inline struct btrfs_delayed_ref_head *
@@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
 {
     WARN_ON(!btrfs_delayed_ref_is_head(node));
     return container_of(node, struct btrfs_delayed_ref_head, node);
-
 }
 #endif
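All three cast helpers lean on the container_of() pattern: an embedded struct member's address is walked back to the enclosing structure. A self-contained userspace re-implementation of the same trick:

    #include <stddef.h>
    #include <stdio.h>

    /* userspace version of the pattern the btrfs_delayed_node_to_*()
     * helpers rely on */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct node { int dummy; };
    struct tree_ref { long level; struct node node; };

    int main(void)
    {
        struct tree_ref r = { 3, { 0 } };
        struct node *n = &r.node;        /* what the rb-tree hands back */
        struct tree_ref *back = container_of(n, struct tree_ref, node);

        printf("level=%ld\n", back->level);  /* 3 */
        return 0;
    }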
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b80c23..e83be2e4602c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/crc32c.h>
 #include "compat.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -36,13 +36,14 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
+static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
+
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete. This is used during reads to verify checksums, and it is used
@@ -172,7 +173,7 @@ out:
 
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-    return btrfs_crc32c(seed, data, len);
+    return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, char *result)
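This hunk drops btrfs's private crc32c.h wrapper in favour of the kernel's shared <linux/crc32c.h> helper, so btrfs_csum_data() becomes a thin pass-through. A minimal sketch of the call shape (kernel context assumed; this is an illustration, not code from the patch):

    #include <linux/crc32c.h>

    /* checksum a metadata buffer, seeding with 0 or a running crc */
    static u32 csum_block(u32 seed, const char *data, size_t len)
    {
        return crc32c(seed, data, len);
    }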
@@ -884,7 +885,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
     root->node = NULL;
     root->commit_root = NULL;
-    root->ref_tree = NULL;
     root->sectorsize = sectorsize;
     root->nodesize = nodesize;
     root->leafsize = leafsize;
@@ -899,12 +899,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     root->last_inode_alloc = 0;
     root->name = NULL;
     root->in_sysfs = 0;
+    root->inode_tree.rb_node = NULL;
 
     INIT_LIST_HEAD(&root->dirty_list);
     INIT_LIST_HEAD(&root->orphan_list);
-    INIT_LIST_HEAD(&root->dead_list);
+    INIT_LIST_HEAD(&root->root_list);
     spin_lock_init(&root->node_lock);
     spin_lock_init(&root->list_lock);
+    spin_lock_init(&root->inode_lock);
     mutex_init(&root->objectid_mutex);
     mutex_init(&root->log_mutex);
     init_waitqueue_head(&root->log_writer_wait);
@@ -918,9 +920,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     extent_io_tree_init(&root->dirty_log_pages,
                 fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-    btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
-    root->ref_tree = &root->ref_tree_struct;
-
     memset(&root->root_key, 0, sizeof(root->root_key));
     memset(&root->root_item, 0, sizeof(root->root_item));
     memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -959,6 +958,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
     blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
     root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                      blocksize, generation);
+    root->commit_root = btrfs_root_node(root);
     BUG_ON(!root->node);
     return 0;
 }
@@ -1025,20 +1025,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
      */
     root->ref_cows = 0;
 
-    leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                      0, BTRFS_TREE_LOG_OBJECTID,
-                      trans->transid, 0, 0, 0);
+    leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+                      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
     if (IS_ERR(leaf)) {
         kfree(root);
         return ERR_CAST(leaf);
     }
 
+    memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+    btrfs_set_header_bytenr(leaf, leaf->start);
+    btrfs_set_header_generation(leaf, trans->transid);
+    btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+    btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
     root->node = leaf;
-    btrfs_set_header_nritems(root->node, 0);
-    btrfs_set_header_level(root->node, 0);
-    btrfs_set_header_bytenr(root->node, root->node->start);
-    btrfs_set_header_generation(root->node, trans->transid);
-    btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
 
     write_extent_buffer(root->node, root->fs_info->fsid,
                 (unsigned long)btrfs_header_fsid(root->node),
@@ -1081,8 +1080,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
     inode_item->nbytes = cpu_to_le64(root->leafsize);
     inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-    btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
-    btrfs_set_root_generation(&log_root->root_item, trans->transid);
+    btrfs_set_root_node(&log_root->root_item, log_root->node);
 
     WARN_ON(root->log_root);
     root->log_root = log_root;
@@ -1144,6 +1142,7 @@ out:
     blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
     root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                      blocksize, generation);
+    root->commit_root = btrfs_root_node(root);
     BUG_ON(!root->node);
 insert:
     if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1210,7 +1209,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
     }
     if (!(fs_info->sb->s_flags & MS_RDONLY)) {
         ret = btrfs_find_dead_roots(fs_info->tree_root,
-                        root->root_key.objectid, root);
+                        root->root_key.objectid);
         BUG_ON(ret);
         btrfs_orphan_cleanup(root);
     }
@@ -1345,12 +1344,25 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
     free_extent_map(em);
 }
 
+/*
+ * If this fails, caller must call bdi_destroy() to get rid of the
+ * bdi again.
+ */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-    bdi_init(bdi);
+    int err;
+
+    bdi->capabilities = BDI_CAP_MAP_COPY;
+    err = bdi_init(bdi);
+    if (err)
+        return err;
+
+    err = bdi_register(bdi, NULL, "btrfs-%d",
+               atomic_inc_return(&btrfs_bdi_num));
+    if (err)
+        return err;
+
     bdi->ra_pages = default_backing_dev_info.ra_pages;
-    bdi->state = 0;
-    bdi->capabilities = default_backing_dev_info.capabilities;
     bdi->unplug_io_fn = btrfs_unplug_io_fn;
     bdi->unplug_io_data = info;
     bdi->congested_fn = btrfs_congested_fn;
@@ -1569,12 +1581,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     atomic_set(&fs_info->async_delalloc_pages, 0);
     atomic_set(&fs_info->async_submit_draining, 0);
     atomic_set(&fs_info->nr_async_bios, 0);
-    atomic_set(&fs_info->throttles, 0);
-    atomic_set(&fs_info->throttle_gen, 0);
     fs_info->sb = sb;
     fs_info->max_extent = (u64)-1;
     fs_info->max_inline = 8192 * 1024;
-    setup_bdi(fs_info, &fs_info->bdi);
+    if (setup_bdi(fs_info, &fs_info->bdi))
+        goto fail_bdi;
     fs_info->btree_inode = new_inode(sb);
     fs_info->btree_inode->i_ino = 1;
     fs_info->btree_inode->i_nlink = 1;
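The setup_bdi() contract spelled out above ("if this fails, caller must call bdi_destroy()") is the standard goto-based unwind open_ctree() uses throughout: each successfully acquired resource gets a label, and later failures fall through the labels in reverse acquisition order. A self-contained sketch of the idiom:

    #include <stdio.h>

    static int acquire_a(void) { return 0; }
    static int acquire_b(void) { return -1; }  /* pretend this fails */
    static void release_a(void) { puts("release a"); }

    static int setup(void)
    {
        int err;

        err = acquire_a();
        if (err)
            goto fail;
        err = acquire_b();
        if (err)
            goto fail_a;    /* a was acquired; b was not */
        return 0;

    fail_a:
        release_a();
    fail:
        return err;
    }

    int main(void) { return setup() ? 1 : 0; }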
@@ -1598,6 +1609,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
     fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
+    RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
     extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
                 fs_info->btree_inode->i_mapping,
                 GFP_NOFS);
@@ -1613,10 +1625,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 fs_info->btree_inode->i_mapping, GFP_NOFS);
     fs_info->do_barriers = 1;
 
-    INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
-    btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
-    btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
-
     BTRFS_I(fs_info->btree_inode)->root = tree_root;
     memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
            sizeof(struct btrfs_key));
@@ -1631,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     mutex_init(&fs_info->cleaner_mutex);
     mutex_init(&fs_info->volume_mutex);
     mutex_init(&fs_info->tree_reloc_mutex);
+    init_rwsem(&fs_info->extent_commit_sem);
 
     btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
     btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1674,6 +1683,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         goto fail_iput;
     }
 
+    features = btrfs_super_incompat_flags(disk_super);
+    if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+        btrfs_set_super_incompat_flags(disk_super, features);
+    }
+
     features = btrfs_super_compat_ro_flags(disk_super) &
         ~BTRFS_FEATURE_COMPAT_RO_SUPP;
     if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1771,7 +1786,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     if (ret) {
         printk(KERN_WARNING "btrfs: failed to read the system "
                "array on %s\n", sb->s_id);
-        goto fail_sys_array;
+        goto fail_sb_buffer;
     }
 
     blocksize = btrfs_level_size(tree_root,
@@ -1785,6 +1800,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                        btrfs_super_chunk_root(disk_super),
                        blocksize, generation);
     BUG_ON(!chunk_root->node);
+    if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+        printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+               sb->s_id);
+        goto fail_chunk_root;
+    }
+    btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+    chunk_root->commit_root = btrfs_root_node(chunk_root);
 
     read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
        (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,7 +1832,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                       blocksize, generation);
     if (!tree_root->node)
         goto fail_chunk_root;
-
+    if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+        printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+               sb->s_id);
+        goto fail_tree_root;
+    }
+    btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+    tree_root->commit_root = btrfs_root_node(tree_root);
 
     ret = find_and_setup_root(tree_root, fs_info,
                   BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1820,14 +1848,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
     ret = find_and_setup_root(tree_root, fs_info,
                   BTRFS_DEV_TREE_OBJECTID, dev_root);
-    dev_root->track_dirty = 1;
     if (ret)
         goto fail_extent_root;
+    dev_root->track_dirty = 1;
 
     ret = find_and_setup_root(tree_root, fs_info,
                   BTRFS_CSUM_TREE_OBJECTID, csum_root);
     if (ret)
-        goto fail_extent_root;
+        goto fail_dev_root;
 
     csum_root->track_dirty = 1;
 
@@ -1849,6 +1877,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     if (IS_ERR(fs_info->transaction_kthread))
         goto fail_cleaner;
 
+    if (!btrfs_test_opt(tree_root, SSD) &&
+        !btrfs_test_opt(tree_root, NOSSD) &&
+        !fs_info->fs_devices->rotating) {
+        printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+               "mode\n");
+        btrfs_set_opt(fs_info->mount_opt, SSD);
+    }
+
     if (btrfs_super_log_root(disk_super) != 0) {
         u64 bytenr = btrfs_super_log_root(disk_super);
 
@@ -1881,7 +1917,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     }
 
     if (!(sb->s_flags & MS_RDONLY)) {
-        ret = btrfs_cleanup_reloc_trees(tree_root);
+        ret = btrfs_recover_relocation(tree_root);
         BUG_ON(ret);
     }
 
@@ -1892,6 +1928,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
     if (!fs_info->fs_root)
         goto fail_trans_kthread;
+
     return tree_root;
 
 fail_trans_kthread:
@@ -1908,14 +1945,19 @@ fail_cleaner:
 
 fail_csum_root:
     free_extent_buffer(csum_root->node);
+    free_extent_buffer(csum_root->commit_root);
+fail_dev_root:
+    free_extent_buffer(dev_root->node);
+    free_extent_buffer(dev_root->commit_root);
 fail_extent_root:
     free_extent_buffer(extent_root->node);
+    free_extent_buffer(extent_root->commit_root);
 fail_tree_root:
     free_extent_buffer(tree_root->node);
+    free_extent_buffer(tree_root->commit_root);
 fail_chunk_root:
     free_extent_buffer(chunk_root->node);
-fail_sys_array:
-    free_extent_buffer(dev_root->node);
+    free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
     btrfs_stop_workers(&fs_info->fixup_workers);
     btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -1931,8 +1973,8 @@ fail_iput:
 
     btrfs_close_devices(fs_info->fs_devices);
     btrfs_mapping_tree_free(&fs_info->mapping_tree);
+fail_bdi:
     bdi_destroy(&fs_info->bdi);
-
 fail:
     kfree(extent_root);
     kfree(tree_root);
@@ -2005,6 +2047,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
     return latest;
 }
 
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1. When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
 static int write_dev_supers(struct btrfs_device *device,
                 struct btrfs_super_block *sb,
                 int do_barriers, int wait, int max_mirrors)
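The new comment describes a two-phase contract: one pass submits the superblock writes and leaves the buffer heads pinned, a second pass waits for completion and drops the pinned references. A hypothetical caller fragment (dev, sb, do_barriers, max_mirrors and ret assumed in scope; illustration only, not code from the patch):

    /* phase 1: submit all supers for this device, keep the bh's pinned */
    ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
    if (!ret)
        /* phase 2: same arguments, now wait and release the pins */
        ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);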
@@ -2040,12 +2093,16 @@ static int write_dev_supers(struct btrfs_device *device,
             bh = __find_get_block(device->bdev, bytenr / 4096,
                           BTRFS_SUPER_INFO_SIZE);
             BUG_ON(!bh);
-            brelse(bh);
             wait_on_buffer(bh);
-            if (buffer_uptodate(bh)) {
-                brelse(bh);
-                continue;
-            }
+            if (!buffer_uptodate(bh))
+                errors++;
+
+            /* drop our reference */
+            brelse(bh);
+
+            /* drop the reference from the wait == 0 run */
+            brelse(bh);
+            continue;
         } else {
             btrfs_set_super_bytenr(sb, bytenr);
 
@@ -2056,12 +2113,18 @@ static int write_dev_supers(struct btrfs_device *device,
                        BTRFS_CSUM_SIZE);
             btrfs_csum_final(crc, sb->csum);
 
+            /*
+             * one reference for us, and we leave it for the
+             * caller
+             */
             bh = __getblk(device->bdev, bytenr / 4096,
                       BTRFS_SUPER_INFO_SIZE);
             memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
-            set_buffer_uptodate(bh);
+            /* one reference for submit_bh */
             get_bh(bh);
+
+            set_buffer_uptodate(bh);
             lock_buffer(bh);
             bh->b_end_io = btrfs_end_buffer_write_sync;
         }
@@ -2073,6 +2136,7 @@ static int write_dev_supers(struct btrfs_device *device,
                    device->name);
             set_buffer_uptodate(bh);
             device->barriers = 0;
+            /* one reference for submit_bh */
             get_bh(bh);
             lock_buffer(bh);
             ret = submit_bh(WRITE_SYNC, bh);
@@ -2081,22 +2145,15 @@ static int write_dev_supers(struct btrfs_device *device,
             ret = submit_bh(WRITE_SYNC, bh);
         }
 
-        if (!ret && wait) {
-            wait_on_buffer(bh);
-            if (!buffer_uptodate(bh))
-                errors++;
-        } else if (ret) {
+        if (ret)
             errors++;
-        }
-        if (wait)
-            brelse(bh);
     }
     return errors < i ? 0 : -1;
 }
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-    struct list_head *head = &root->fs_info->fs_devices->devices;
+    struct list_head *head;
     struct btrfs_device *dev;
     struct btrfs_super_block *sb;
     struct btrfs_dev_item *dev_item;
@@ -2111,6 +2168,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
     sb = &root->fs_info->super_for_commit;
     dev_item = &sb->dev_item;
+
+    mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+    head = &root->fs_info->fs_devices->devices;
     list_for_each_entry(dev, head, dev_list) {
         if (!dev->bdev) {
             total_errors++;
@@ -2154,6 +2214,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
         if (ret)
             total_errors++;
     }
+    mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
     if (total_errors > max_errors) {
         printk(KERN_ERR "btrfs: %d errors while writing supers\n",
                total_errors);
@@ -2173,6 +2234,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
+    WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
     radix_tree_delete(&fs_info->fs_roots_radix,
               (unsigned long)root->root_key.objectid);
     if (root->anon_super.s_dev) {
@@ -2219,10 +2281,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
                          ARRAY_SIZE(gang));
         if (!ret)
             break;
+
+        root_objectid = gang[ret - 1]->root_key.objectid + 1;
         for (i = 0; i < ret; i++) {
             root_objectid = gang[i]->root_key.objectid;
             ret = btrfs_find_dead_roots(fs_info->tree_root,
-                            root_objectid, gang[i]);
+                            root_objectid);
             BUG_ON(ret);
             btrfs_orphan_cleanup(gang[i]);
         }
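The added line advances the lookup cursor past the last key returned by the gang lookup before the next iteration, the standard pagination pattern for radix-tree scans. A self-contained model of the loop shape:

    #include <stdio.h>

    /* toy stand-in for a gang lookup: return up to max keys >= first */
    static int gang_lookup(const int *keys, int nkeys, int first,
                           int *out, int max)
    {
        int n = 0, i;

        for (i = 0; i < nkeys && n < max; i++)
            if (keys[i] >= first)
                out[n++] = keys[i];
        return n;
    }

    int main(void)
    {
        int keys[] = { 3, 5, 9, 12, 20 };
        int got[2], first = 0, n;

        while ((n = gang_lookup(keys, 5, first, got, 2)) > 0) {
            first = got[n - 1] + 1;  /* cursor past the last result */
            printf("batch ends at %d\n", got[n - 1]);
        }
        return 0;
    }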
@@ -2269,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
         printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
     }
 
+    fs_info->closing = 2;
+    smp_mb();
+
     if (fs_info->delalloc_bytes) {
         printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                (unsigned long long)fs_info->delalloc_bytes);
@@ -2278,22 +2345,19 @@ int close_ctree(struct btrfs_root *root)
2278 (unsigned long long)fs_info->total_ref_cache_size); 2345 (unsigned long long)fs_info->total_ref_cache_size);
2279 } 2346 }
2280 2347
2281 if (fs_info->extent_root->node) 2348 free_extent_buffer(fs_info->extent_root->node);
2282 free_extent_buffer(fs_info->extent_root->node); 2349 free_extent_buffer(fs_info->extent_root->commit_root);
2283 2350 free_extent_buffer(fs_info->tree_root->node);
2284 if (fs_info->tree_root->node) 2351 free_extent_buffer(fs_info->tree_root->commit_root);
2285 free_extent_buffer(fs_info->tree_root->node); 2352 free_extent_buffer(root->fs_info->chunk_root->node);
2286 2353 free_extent_buffer(root->fs_info->chunk_root->commit_root);
2287 if (root->fs_info->chunk_root->node) 2354 free_extent_buffer(root->fs_info->dev_root->node);
2288 free_extent_buffer(root->fs_info->chunk_root->node); 2355 free_extent_buffer(root->fs_info->dev_root->commit_root);
2289 2356 free_extent_buffer(root->fs_info->csum_root->node);
2290 if (root->fs_info->dev_root->node) 2357 free_extent_buffer(root->fs_info->csum_root->commit_root);
2291 free_extent_buffer(root->fs_info->dev_root->node);
2292
2293 if (root->fs_info->csum_root->node)
2294 free_extent_buffer(root->fs_info->csum_root->node);
2295 2358
2296 btrfs_free_block_groups(root->fs_info); 2359 btrfs_free_block_groups(root->fs_info);
2360 btrfs_free_pinned_extents(root->fs_info);
2297 2361
2298 del_fs_roots(fs_info); 2362 del_fs_roots(fs_info);
2299 2363
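The closing = 2 store plus smp_mb() added above pairs with the smp_mb()-then-test in the block group caching kthread later in this diff (the closing > 1 check), telling the scanner to bail out during unmount. A rough sketch of that one-writer/many-readers flag handshake, using the real smp_mb() primitive but invented names:

#include <linux/kernel.h>

static int demo_closing;

/* unmount side: publish the flag before waiting on the workers */
static void demo_begin_close(void)
{
	demo_closing = 2;
	smp_mb();	/* order the store before anything we do next */
}

/* worker side: re-check on every loop iteration */
static int demo_should_abort(void)
{
	smp_mb();	/* pairs with the barrier in demo_begin_close() */
	return demo_closing > 1;
}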
@@ -2373,17 +2437,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2373 * looks as though older kernels can get into trouble with 2437 * looks as though older kernels can get into trouble with
2374 * this code; they end up stuck in balance_dirty_pages forever 2438 * this code; they end up stuck in balance_dirty_pages forever
2375 */ 2439 */
2376 struct extent_io_tree *tree;
2377 u64 num_dirty; 2440 u64 num_dirty;
2378 u64 start = 0;
2379 unsigned long thresh = 32 * 1024 * 1024; 2441 unsigned long thresh = 32 * 1024 * 1024;
2380 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2381 2442
2382 if (current->flags & PF_MEMALLOC) 2443 if (current->flags & PF_MEMALLOC)
2383 return; 2444 return;
2384 2445
2385 num_dirty = count_range_bits(tree, &start, (u64)-1, 2446 num_dirty = root->fs_info->dirty_metadata_bytes;
2386 thresh, EXTENT_DIRTY); 2447
2387 if (num_dirty > thresh) { 2448 if (num_dirty > thresh) {
2388 balance_dirty_pages_ratelimited_nr( 2449 balance_dirty_pages_ratelimited_nr(
2389 root->fs_info->btree_inode->i_mapping, 1); 2450 root->fs_info->btree_inode->i_mapping, 1);
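The hunk above swaps an O(tree) scan (count_range_bits over the btree inode's io_tree) for an O(1) read of a counter that is maintained as metadata pages are dirtied and cleaned. A tiny userspace-flavoured sketch of the idea, all names invented:

#include <stdint.h>
#include <stdio.h>

#define DEMO_THRESH (32UL * 1024 * 1024)	/* same 32MB threshold */

/* bumped when a metadata page is dirtied, decremented on writeback */
static uint64_t demo_dirty_metadata_bytes;

static void demo_balance_dirty(void)
{
	/* constant-time check instead of walking extent state records */
	if (demo_dirty_metadata_bytes > DEMO_THRESH)
		printf("throttle: start background writeback\n");
}

int main(void)
{
	demo_dirty_metadata_bytes = 48UL * 1024 * 1024;
	demo_balance_dirty();
	return 0;
}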
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 85315d2c90de..9596b40caa4e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0; 79 key.offset = 0;
80 80
81 inode = btrfs_iget(sb, &key, root, NULL); 81 inode = btrfs_iget(sb, &key, root);
82 if (IS_ERR(inode)) 82 if (IS_ERR(inode))
83 return (void *)inode; 83 return (void *)inode;
84 84
@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0; 193 key.offset = 0;
194 194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
196} 196}
197 197
198const struct export_operations btrfs_export_ops = { 198const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 35af93355063..72a2b9c28e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,57 +21,54 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
24#include "compat.h" 25#include "compat.h"
25#include "hash.h" 26#include "hash.h"
26#include "crc32c.h"
27#include "ctree.h" 27#include "ctree.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "print-tree.h" 29#include "print-tree.h"
30#include "transaction.h" 30#include "transaction.h"
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h"
34#include "free-space-cache.h" 33#include "free-space-cache.h"
35 34
36#define PENDING_EXTENT_INSERT 0
37#define PENDING_EXTENT_DELETE 1
38#define PENDING_BACKREF_UPDATE 2
39
40struct pending_extent_op {
41 int type;
42 u64 bytenr;
43 u64 num_bytes;
44 u64 parent;
45 u64 orig_parent;
46 u64 generation;
47 u64 orig_generation;
48 int level;
49 struct list_head list;
50 int del;
51};
52
53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, u64 parent,
55 u64 root_objectid, u64 ref_generation,
56 u64 owner, struct btrfs_key *ins,
57 int ref_mod);
58static int update_reserved_extents(struct btrfs_root *root, 35static int update_reserved_extents(struct btrfs_root *root,
59 u64 bytenr, u64 num, int reserve); 36 u64 bytenr, u64 num, int reserve);
60static int update_block_group(struct btrfs_trans_handle *trans, 37static int update_block_group(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, 38 struct btrfs_root *root,
62 u64 bytenr, u64 num_bytes, int alloc, 39 u64 bytenr, u64 num_bytes, int alloc,
63 int mark_free); 40 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root, 42 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation, 44 u64 root_objectid, u64 owner_objectid,
68 u64 owner_objectid, int pin, 45 u64 owner_offset, int refs_to_drop,
69 int ref_to_drop); 46 struct btrfs_delayed_extent_op *extra_op);
47static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
48 struct extent_buffer *leaf,
49 struct btrfs_extent_item *ei);
50static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root,
52 u64 parent, u64 root_objectid,
53 u64 flags, u64 owner, u64 offset,
54 struct btrfs_key *ins, int ref_mod);
55static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root,
57 u64 parent, u64 root_objectid,
58 u64 flags, struct btrfs_disk_key *key,
59 int level, struct btrfs_key *ins);
70 60
71static int do_chunk_alloc(struct btrfs_trans_handle *trans, 61static int do_chunk_alloc(struct btrfs_trans_handle *trans,
72 struct btrfs_root *extent_root, u64 alloc_bytes, 62 struct btrfs_root *extent_root, u64 alloc_bytes,
73 u64 flags, int force); 63 u64 flags, int force);
74 64
65static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache)
67{
68 smp_mb();
69 return cache->cached == BTRFS_CACHE_FINISHED;
70}
71
75static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 72static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
76{ 73{
77 return (cache->flags & bits) == bits; 74 return (cache->flags & bits) == bits;
@@ -157,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
157} 154}
158 155
159/* 156/*
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because a block group was still caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{
164 u64 start, end, last = 0;
165 int ret;
166
167 while (1) {
168 ret = find_first_extent_bit(&info->pinned_extents, last,
169 &start, &end,
170 EXTENT_LOCKED|EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 clear_extent_bits(&info->pinned_extents, start, end,
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
176 last = end+1;
177 }
178}
179
180static int remove_sb_from_cache(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache)
182{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr;
185 u64 *logical;
186 int stripe_len;
187 int i, nr, ret;
188
189 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
190 bytenr = btrfs_sb_offset(i);
191 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
192 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret);
195 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents,
197 logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS);
199 }
200 kfree(logical);
201 }
202
203 return 0;
204}
205
206/*
160 * this is only called by cache_block_group, since we could have freed extents, 207 * this is only called by cache_block_group, since we could have freed extents,
161 * we need to check the pinned_extents for any extents that can't be used yet 208 * we need to check the pinned_extents for any extents that can't be used yet
162 * since their free space will be released as soon as the transaction commits. 209 * since their free space will be released as soon as the transaction commits.
163 */ 210 */
164static int add_new_free_space(struct btrfs_block_group_cache *block_group, 211static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
165 struct btrfs_fs_info *info, u64 start, u64 end) 212 struct btrfs_fs_info *info, u64 start, u64 end)
166{ 213{
167 u64 extent_start, extent_end, size; 214 u64 extent_start, extent_end, size, total_added = 0;
168 int ret; 215 int ret;
169 216
170 while (start < end) { 217 while (start < end) {
171 ret = find_first_extent_bit(&info->pinned_extents, start, 218 ret = find_first_extent_bit(&info->pinned_extents, start,
172 &extent_start, &extent_end, 219 &extent_start, &extent_end,
173 EXTENT_DIRTY); 220 EXTENT_DIRTY|EXTENT_LOCKED);
174 if (ret) 221 if (ret)
175 break; 222 break;
176 223
@@ -178,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
178 start = extent_end + 1; 225 start = extent_end + 1;
179 } else if (extent_start > start && extent_start < end) { 226 } else if (extent_start > start && extent_start < end) {
180 size = extent_start - start; 227 size = extent_start - start;
228 total_added += size;
181 ret = btrfs_add_free_space(block_group, start, 229 ret = btrfs_add_free_space(block_group, start,
182 size); 230 size);
183 BUG_ON(ret); 231 BUG_ON(ret);
@@ -189,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
189 237
190 if (start < end) { 238 if (start < end) {
191 size = end - start; 239 size = end - start;
240 total_added += size;
192 ret = btrfs_add_free_space(block_group, start, size); 241 ret = btrfs_add_free_space(block_group, start, size);
193 BUG_ON(ret); 242 BUG_ON(ret);
194 } 243 }
195 244
196 return 0; 245 return total_added;
197}
198
199static int remove_sb_from_cache(struct btrfs_root *root,
200 struct btrfs_block_group_cache *cache)
201{
202 u64 bytenr;
203 u64 *logical;
204 int stripe_len;
205 int i, nr, ret;
206
207 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
208 bytenr = btrfs_sb_offset(i);
209 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
210 cache->key.objectid, bytenr, 0,
211 &logical, &nr, &stripe_len);
212 BUG_ON(ret);
213 while (nr--) {
214 btrfs_remove_free_space(cache, logical[nr],
215 stripe_len);
216 }
217 kfree(logical);
218 }
219 return 0;
220} 246}
221 247
222static int cache_block_group(struct btrfs_root *root, 248static int caching_kthread(void *data)
223 struct btrfs_block_group_cache *block_group)
224{ 249{
250 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0;
225 struct btrfs_path *path; 253 struct btrfs_path *path;
226 int ret = 0; 254 int ret = 0;
227 struct btrfs_key key; 255 struct btrfs_key key;
228 struct extent_buffer *leaf; 256 struct extent_buffer *leaf;
229 int slot; 257 int slot;
230 u64 last; 258 u64 total_found = 0;
231 259
232 if (!block_group) 260 BUG_ON(!fs_info);
233 return 0;
234
235 root = root->fs_info->extent_root;
236
237 if (block_group->cached)
238 return 0;
239 261
240 path = btrfs_alloc_path(); 262 path = btrfs_alloc_path();
241 if (!path) 263 if (!path)
242 return -ENOMEM; 264 return -ENOMEM;
243 265
244 path->reada = 2; 266 atomic_inc(&block_group->space_info->caching_threads);
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
245 /* 268 /*
246 * we get into deadlocks with paths held by callers of this function. 269 * We don't want to deadlock with somebody trying to allocate a new
247 * since the alloc_mutex is protecting things right now, just 270 * extent for the extent root while also trying to search the extent
248 * skip the locking here 271 * root to add free space. So we skip locking and search the commit
272 * root, since its read-only
249 */ 273 */
250 path->skip_locking = 1; 274 path->skip_locking = 1;
251 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 275 path->search_commit_root = 1;
276 path->reada = 2;
277
252 key.objectid = last; 278 key.objectid = last;
253 key.offset = 0; 279 key.offset = 0;
254 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
255 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 281again:
282 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem);
284
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
256 if (ret < 0) 286 if (ret < 0)
257 goto err; 287 goto err;
258 288
259 while (1) { 289 while (1) {
290 smp_mb();
291 if (block_group->fs_info->closing > 1) {
292 last = (u64)-1;
293 break;
294 }
295
260 leaf = path->nodes[0]; 296 leaf = path->nodes[0];
261 slot = path->slots[0]; 297 slot = path->slots[0];
262 if (slot >= btrfs_header_nritems(leaf)) { 298 if (slot >= btrfs_header_nritems(leaf)) {
263 ret = btrfs_next_leaf(root, path); 299 ret = btrfs_next_leaf(fs_info->extent_root, path);
264 if (ret < 0) 300 if (ret < 0)
265 goto err; 301 goto err;
266 if (ret == 0) 302 else if (ret)
267 continue;
268 else
269 break; 303 break;
304
305 if (need_resched() ||
306 btrfs_transaction_in_commit(fs_info)) {
307 leaf = path->nodes[0];
308
309 /* this shouldn't happen, but if the
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1);
323 goto again;
324 }
325
326 continue;
270 } 327 }
271 btrfs_item_key_to_cpu(leaf, &key, slot); 328 btrfs_item_key_to_cpu(leaf, &key, slot);
272 if (key.objectid < block_group->key.objectid) 329 if (key.objectid < block_group->key.objectid)
@@ -277,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root,
277 break; 334 break;
278 335
279 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
280 add_new_free_space(block_group, root->fs_info, last, 337 total_found += add_new_free_space(block_group,
281 key.objectid); 338 fs_info, last,
282 339 key.objectid);
283 last = key.objectid + key.offset; 340 last = key.objectid + key.offset;
284 } 341 }
342
343 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0;
345 wake_up(&block_group->caching_q);
346 }
285next: 347next:
286 path->slots[0]++; 348 path->slots[0]++;
287 } 349 }
350 ret = 0;
288 351
289 add_new_free_space(block_group, root->fs_info, last, 352 total_found += add_new_free_space(block_group, fs_info, last,
290 block_group->key.objectid + 353 block_group->key.objectid +
291 block_group->key.offset); 354 block_group->key.offset);
355
356 spin_lock(&block_group->lock);
357 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock);
292 359
293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
295 ret = 0;
296err: 360err:
297 btrfs_free_path(path); 361 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365
366 return 0;
367}
368
369static int cache_block_group(struct btrfs_block_group_cache *cache)
370{
371 struct task_struct *tsk;
372 int ret = 0;
373
374 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock);
377 return ret;
378 }
379 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock);
381
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
383 cache->key.objectid);
384 if (IS_ERR(tsk)) {
385 ret = PTR_ERR(tsk);
386 printk(KERN_ERR "error running thread %d\n", ret);
387 BUG();
388 }
389
298 return ret; 390 return ret;
299} 391}
300 392
@@ -453,199 +545,968 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
453 * maintenance. This is actually the same as #2, but with a slightly 545 * maintenance. This is actually the same as #2, but with a slightly
454 * different use case. 546 * different use case.
455 * 547 *
548 * There are two kinds of back refs. Implicit back refs are optimized
549 * for pointers in non-shared tree blocks. For a given pointer in a block,
550 * back refs of this kind provide information about the block's owner tree
551 * and the pointer's key. This information allows us to find the block by
552 * b-tree searching. Full back refs are for pointers in tree blocks not
553 * referenced by their owner trees. The location of the tree block is
554 * recorded in the back refs. Full back refs are actually generic and can
555 * be used in all cases where implicit back refs are used. Their major
556 * shortcoming is the overhead: every time a tree block gets COWed, we
557 * have to update the back ref entries for all pointers in it.
558 *
559 * For a newly allocated tree block, we use implicit back refs for
560 * pointers in it. This means most tree related operations only involve
561 * implicit back refs. For a tree block created in an old transaction, the
562 * only way to drop a reference to it is to COW it. So we can detect the
563 * event that a tree block loses its owner tree's reference and do the
564 * back refs conversion.
565 *
566 * When a tree block is COW'd through a tree, there are four cases:
567 *
568 * The reference count of the block is one and the tree is the block's
569 * owner tree. Nothing to do in this case.
570 *
571 * The reference count of the block is one and the tree is not the
572 * block's owner tree. In this case, full back refs are used for pointers
573 * in the block. Remove these full back refs and add implicit back refs
574 * for every pointer in the new block.
575 *
576 * The reference count of the block is greater than one and the tree is
577 * the block's owner tree. In this case, implicit back refs are used for
578 * pointers in the block. Add full back refs for every pointer in the
579 * block and increase the lower level extents' reference counts. The
580 * original implicit back refs are carried over to the new block.
581 *
582 * The reference count of the block is greater than one and the tree is
583 * not the block's owner tree. Add implicit back refs for every pointer in
584 * the new block and increase the lower level extents' reference counts.
585 *
586 * Back Reference Key composing:
587 *
588 * The key objectid corresponds to the first byte in the extent,
589 * The key type is used to differentiate between types of back refs.
590 * There are different meanings of the key offset for different types
591 * of back refs.
592 *
456 * File extents can be referenced by: 593 * File extents can be referenced by:
457 * 594 *
458 * - multiple snapshots, subvolumes, or different generations in one subvol 595 * - multiple snapshots, subvolumes, or different generations in one subvol
459 * - different files inside a single subvolume 596 * - different files inside a single subvolume
460 * - different offsets inside a file (bookend extents in file.c) 597 * - different offsets inside a file (bookend extents in file.c)
461 * 598 *
462 * The extent ref structure has fields for: 599 * The extent ref structure for the implicit back refs has fields for:
463 * 600 *
464 * - Objectid of the subvolume root 601 * - Objectid of the subvolume root
465 * - Generation number of the tree holding the reference
466 * - objectid of the file holding the reference 602 * - objectid of the file holding the reference
467 * - number of references held by the parent node (always 1 for tree blocks) 603 * - original offset in the file
604 * - how many bookend extents
468 * 605 *
469 * A btree leaf may hold multiple references to a file extent. In most cases, 606 * The key offset for the implicit back refs is a hash of the first
470 * these references are from same file and the corresponding offsets inside 607 * three fields.
471 * the file are close together.
472 * 608 *
473 * When a file extent is allocated, the fields are filled in: 609 * The extent ref structure for the full back refs has a field for:
474 * (root_key.objectid, trans->transid, inode objectid, 1)
475 * 610 *
476 * When a leaf is cow'd new references are added for every file extent found 611 * - number of pointers in the tree leaf
477 * in the leaf. It looks similar to the create case, but trans->transid will
478 * be different when the block is cow'd.
479 * 612 *
480 * (root_key.objectid, trans->transid, inode objectid, 613 * The key offset for the implicit back refs is the first byte of
481 * number of references in the leaf) 614 * the tree leaf
482 * 615 *
483 * When a file extent is removed either during snapshot deletion or 616 * When a file extent is allocated, the implicit back refs are used.
484 * file truncation, we find the corresponding back reference and check 617 * The fields are filled in:
485 * the following fields:
486 * 618 *
487 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), 619 * (root_key.objectid, inode objectid, offset in file, 1)
488 * inode objectid)
489 * 620 *
490 * Btree extents can be referenced by: 621 * When a file extent is removed during file truncation, we find the
491 * 622 * corresponding implicit back refs and check the following fields:
492 * - Different subvolumes
493 * - Different generations of the same subvolume
494 *
495 * When a tree block is created, back references are inserted:
496 *
497 * (root->root_key.objectid, trans->transid, level, 1)
498 *
499 * When a tree block is cow'd, new back references are added for all the
500 * blocks it points to. If the tree block isn't in reference counted root,
501 * the old back references are removed. These new back references are of
502 * the form (trans->transid will have increased since creation):
503 * 623 *
504 * (root->root_key.objectid, trans->transid, level, 1) 624 * (btrfs_header_owner(leaf), inode objectid, offset in file)
505 * 625 *
506 * When a backref is being deleted, the following fields are checked: 626 * Btree extents can be referenced by:
507 * 627 *
508 * if backref was for a tree root: 628 * - Different subvolumes
509 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
510 * else
511 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
512 * 629 *
513 * Back Reference Key composing: 630 * Both the implicit back refs and the full back refs for tree blocks
631 * only consist of a key. The key offset for the implicit back refs is
632 * the objectid of the block's owner tree. The key offset for the full
633 * back refs is the first byte of the parent block.
514 * 634 *
515 * The key objectid corresponds to the first byte in the extent, the key 635 * When implicit back refs are used, information about the lowest key and
516 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first 636 * level of the tree block is required. This information is stored in
517 * byte of parent extent. If an extent is a tree root, the key offset is set 637 * the tree block info structure.
518 * to the key objectid.
519 */ 638 */
520 639
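To make the key layouts described above concrete, here is a hedged sketch of the four back-ref keys for a single extent at bytenr, using the key types and the hash helper that appear in the new code below (all variables are placeholders):

struct btrfs_key key;

/* implicit data ref: keyed by a hash of (root, inode, file offset) */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_DATA_REF_KEY;
key.offset = hash_extent_data_ref(root_objectid, inode_objectid, file_offset);

/* full (shared) data ref: keyed by the referencing leaf's bytenr */
key.objectid = bytenr;
key.type = BTRFS_SHARED_DATA_REF_KEY;
key.offset = parent_bytenr;

/* implicit tree block ref: keyed by the owner root's objectid */
key.objectid = bytenr;
key.type = BTRFS_TREE_BLOCK_REF_KEY;
key.offset = root_objectid;

/* full (shared) tree block ref: keyed by the parent node's bytenr */
key.objectid = bytenr;
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
key.offset = parent_bytenr;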
521static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, 640#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
522 struct btrfs_root *root, 641static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
523 struct btrfs_path *path, 642 struct btrfs_root *root,
524 u64 bytenr, u64 parent, 643 struct btrfs_path *path,
525 u64 ref_root, u64 ref_generation, 644 u64 owner, u32 extra_size)
526 u64 owner_objectid, int del)
527{ 645{
646 struct btrfs_extent_item *item;
647 struct btrfs_extent_item_v0 *ei0;
648 struct btrfs_extent_ref_v0 *ref0;
649 struct btrfs_tree_block_info *bi;
650 struct extent_buffer *leaf;
528 struct btrfs_key key; 651 struct btrfs_key key;
529 struct btrfs_extent_ref *ref; 652 struct btrfs_key found_key;
653 u32 new_size = sizeof(*item);
654 u64 refs;
655 int ret;
656
657 leaf = path->nodes[0];
658 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
659
660 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
661 ei0 = btrfs_item_ptr(leaf, path->slots[0],
662 struct btrfs_extent_item_v0);
663 refs = btrfs_extent_refs_v0(leaf, ei0);
664
665 if (owner == (u64)-1) {
666 while (1) {
667 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
668 ret = btrfs_next_leaf(root, path);
669 if (ret < 0)
670 return ret;
671 BUG_ON(ret > 0);
672 leaf = path->nodes[0];
673 }
674 btrfs_item_key_to_cpu(leaf, &found_key,
675 path->slots[0]);
676 BUG_ON(key.objectid != found_key.objectid);
677 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
678 path->slots[0]++;
679 continue;
680 }
681 ref0 = btrfs_item_ptr(leaf, path->slots[0],
682 struct btrfs_extent_ref_v0);
683 owner = btrfs_ref_objectid_v0(leaf, ref0);
684 break;
685 }
686 }
687 btrfs_release_path(root, path);
688
689 if (owner < BTRFS_FIRST_FREE_OBJECTID)
690 new_size += sizeof(*bi);
691
692 new_size -= sizeof(*ei0);
693 ret = btrfs_search_slot(trans, root, &key, path,
694 new_size + extra_size, 1);
695 if (ret < 0)
696 return ret;
697 BUG_ON(ret);
698
699 ret = btrfs_extend_item(trans, root, path, new_size);
700 BUG_ON(ret);
701
702 leaf = path->nodes[0];
703 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
704 btrfs_set_extent_refs(leaf, item, refs);
705 /* FIXME: get real generation */
706 btrfs_set_extent_generation(leaf, item, 0);
707 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
708 btrfs_set_extent_flags(leaf, item,
709 BTRFS_EXTENT_FLAG_TREE_BLOCK |
710 BTRFS_BLOCK_FLAG_FULL_BACKREF);
711 bi = (struct btrfs_tree_block_info *)(item + 1);
712 /* FIXME: get first key of the block */
713 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
714 btrfs_set_tree_block_level(leaf, bi, (int)owner);
715 } else {
716 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
717 }
718 btrfs_mark_buffer_dirty(leaf);
719 return 0;
720}
721#endif
722
723static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
724{
725 u32 high_crc = ~(u32)0;
726 u32 low_crc = ~(u32)0;
727 __le64 lenum;
728
729 lenum = cpu_to_le64(root_objectid);
730 high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
731 lenum = cpu_to_le64(owner);
732 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
733 lenum = cpu_to_le64(offset);
734 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
735
736 return ((u64)high_crc << 31) ^ (u64)low_crc;
737}
738
739static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
740 struct btrfs_extent_data_ref *ref)
741{
742 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
743 btrfs_extent_data_ref_objectid(leaf, ref),
744 btrfs_extent_data_ref_offset(leaf, ref));
745}
746
747static int match_extent_data_ref(struct extent_buffer *leaf,
748 struct btrfs_extent_data_ref *ref,
749 u64 root_objectid, u64 owner, u64 offset)
750{
751 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
752 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
753 btrfs_extent_data_ref_offset(leaf, ref) != offset)
754 return 0;
755 return 1;
756}
757
758static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
759 struct btrfs_root *root,
760 struct btrfs_path *path,
761 u64 bytenr, u64 parent,
762 u64 root_objectid,
763 u64 owner, u64 offset)
764{
765 struct btrfs_key key;
766 struct btrfs_extent_data_ref *ref;
530 struct extent_buffer *leaf; 767 struct extent_buffer *leaf;
531 u64 ref_objectid; 768 u32 nritems;
532 int ret; 769 int ret;
770 int recow;
771 int err = -ENOENT;
533 772
534 key.objectid = bytenr; 773 key.objectid = bytenr;
535 key.type = BTRFS_EXTENT_REF_KEY; 774 if (parent) {
536 key.offset = parent; 775 key.type = BTRFS_SHARED_DATA_REF_KEY;
776 key.offset = parent;
777 } else {
778 key.type = BTRFS_EXTENT_DATA_REF_KEY;
779 key.offset = hash_extent_data_ref(root_objectid,
780 owner, offset);
781 }
782again:
783 recow = 0;
784 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
785 if (ret < 0) {
786 err = ret;
787 goto fail;
788 }
537 789
538 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); 790 if (parent) {
539 if (ret < 0) 791 if (!ret)
540 goto out; 792 return 0;
541 if (ret > 0) { 793#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
542 ret = -ENOENT; 794 key.type = BTRFS_EXTENT_REF_V0_KEY;
543 goto out; 795 btrfs_release_path(root, path);
796 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
797 if (ret < 0) {
798 err = ret;
799 goto fail;
800 }
801 if (!ret)
802 return 0;
803#endif
804 goto fail;
544 } 805 }
545 806
546 leaf = path->nodes[0]; 807 leaf = path->nodes[0];
547 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 808 nritems = btrfs_header_nritems(leaf);
548 ref_objectid = btrfs_ref_objectid(leaf, ref); 809 while (1) {
549 if (btrfs_ref_root(leaf, ref) != ref_root || 810 if (path->slots[0] >= nritems) {
550 btrfs_ref_generation(leaf, ref) != ref_generation || 811 ret = btrfs_next_leaf(root, path);
551 (ref_objectid != owner_objectid && 812 if (ret < 0)
552 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { 813 err = ret;
553 ret = -EIO; 814 if (ret)
554 WARN_ON(1); 815 goto fail;
555 goto out; 816
817 leaf = path->nodes[0];
818 nritems = btrfs_header_nritems(leaf);
819 recow = 1;
820 }
821
822 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
823 if (key.objectid != bytenr ||
824 key.type != BTRFS_EXTENT_DATA_REF_KEY)
825 goto fail;
826
827 ref = btrfs_item_ptr(leaf, path->slots[0],
828 struct btrfs_extent_data_ref);
829
830 if (match_extent_data_ref(leaf, ref, root_objectid,
831 owner, offset)) {
832 if (recow) {
833 btrfs_release_path(root, path);
834 goto again;
835 }
836 err = 0;
837 break;
838 }
839 path->slots[0]++;
556 } 840 }
557 ret = 0; 841fail:
558out: 842 return err;
559 return ret;
560} 843}
561 844
562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 845static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
563 struct btrfs_root *root, 846 struct btrfs_root *root,
564 struct btrfs_path *path, 847 struct btrfs_path *path,
565 u64 bytenr, u64 parent, 848 u64 bytenr, u64 parent,
566 u64 ref_root, u64 ref_generation, 849 u64 root_objectid, u64 owner,
567 u64 owner_objectid, 850 u64 offset, int refs_to_add)
568 int refs_to_add)
569{ 851{
570 struct btrfs_key key; 852 struct btrfs_key key;
571 struct extent_buffer *leaf; 853 struct extent_buffer *leaf;
572 struct btrfs_extent_ref *ref; 854 u32 size;
573 u32 num_refs; 855 u32 num_refs;
574 int ret; 856 int ret;
575 857
576 key.objectid = bytenr; 858 key.objectid = bytenr;
577 key.type = BTRFS_EXTENT_REF_KEY; 859 if (parent) {
578 key.offset = parent; 860 key.type = BTRFS_SHARED_DATA_REF_KEY;
861 key.offset = parent;
862 size = sizeof(struct btrfs_shared_data_ref);
863 } else {
864 key.type = BTRFS_EXTENT_DATA_REF_KEY;
865 key.offset = hash_extent_data_ref(root_objectid,
866 owner, offset);
867 size = sizeof(struct btrfs_extent_data_ref);
868 }
579 869
580 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); 870 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
581 if (ret == 0) { 871 if (ret && ret != -EEXIST)
582 leaf = path->nodes[0]; 872 goto fail;
583 ref = btrfs_item_ptr(leaf, path->slots[0], 873
584 struct btrfs_extent_ref); 874 leaf = path->nodes[0];
585 btrfs_set_ref_root(leaf, ref, ref_root); 875 if (parent) {
586 btrfs_set_ref_generation(leaf, ref, ref_generation); 876 struct btrfs_shared_data_ref *ref;
587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
589 } else if (ret == -EEXIST) {
590 u64 existing_owner;
591
592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
593 leaf = path->nodes[0];
594 ref = btrfs_item_ptr(leaf, path->slots[0], 877 ref = btrfs_item_ptr(leaf, path->slots[0],
595 struct btrfs_extent_ref); 878 struct btrfs_shared_data_ref);
596 if (btrfs_ref_root(leaf, ref) != ref_root || 879 if (ret == 0) {
597 btrfs_ref_generation(leaf, ref) != ref_generation) { 880 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
598 ret = -EIO; 881 } else {
599 WARN_ON(1); 882 num_refs = btrfs_shared_data_ref_count(leaf, ref);
600 goto out; 883 num_refs += refs_to_add;
884 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
601 } 885 }
886 } else {
887 struct btrfs_extent_data_ref *ref;
888 while (ret == -EEXIST) {
889 ref = btrfs_item_ptr(leaf, path->slots[0],
890 struct btrfs_extent_data_ref);
891 if (match_extent_data_ref(leaf, ref, root_objectid,
892 owner, offset))
893 break;
894 btrfs_release_path(root, path);
895 key.offset++;
896 ret = btrfs_insert_empty_item(trans, root, path, &key,
897 size);
898 if (ret && ret != -EEXIST)
899 goto fail;
602 900
603 num_refs = btrfs_ref_num_refs(leaf, ref); 901 leaf = path->nodes[0];
604 BUG_ON(num_refs == 0); 902 }
605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); 903 ref = btrfs_item_ptr(leaf, path->slots[0],
606 904 struct btrfs_extent_data_ref);
607 existing_owner = btrfs_ref_objectid(leaf, ref); 905 if (ret == 0) {
608 if (existing_owner != owner_objectid && 906 btrfs_set_extent_data_ref_root(leaf, ref,
609 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { 907 root_objectid);
610 btrfs_set_ref_objectid(leaf, ref, 908 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
611 BTRFS_MULTIPLE_OBJECTIDS); 909 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
910 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
911 } else {
912 num_refs = btrfs_extent_data_ref_count(leaf, ref);
913 num_refs += refs_to_add;
914 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
612 } 915 }
613 ret = 0;
614 } else {
615 goto out;
616 } 916 }
617 btrfs_unlock_up_safe(path, 1); 917 btrfs_mark_buffer_dirty(leaf);
618 btrfs_mark_buffer_dirty(path->nodes[0]); 918 ret = 0;
619out: 919fail:
620 btrfs_release_path(root, path); 920 btrfs_release_path(root, path);
621 return ret; 921 return ret;
622} 922}
623 923
624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 924static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root, 925 struct btrfs_root *root,
626 struct btrfs_path *path, 926 struct btrfs_path *path,
627 int refs_to_drop) 927 int refs_to_drop)
628{ 928{
929 struct btrfs_key key;
930 struct btrfs_extent_data_ref *ref1 = NULL;
931 struct btrfs_shared_data_ref *ref2 = NULL;
629 struct extent_buffer *leaf; 932 struct extent_buffer *leaf;
630 struct btrfs_extent_ref *ref; 933 u32 num_refs = 0;
631 u32 num_refs;
632 int ret = 0; 934 int ret = 0;
633 935
634 leaf = path->nodes[0]; 936 leaf = path->nodes[0];
635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 937 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
636 num_refs = btrfs_ref_num_refs(leaf, ref); 938
939 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
940 ref1 = btrfs_item_ptr(leaf, path->slots[0],
941 struct btrfs_extent_data_ref);
942 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
943 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
944 ref2 = btrfs_item_ptr(leaf, path->slots[0],
945 struct btrfs_shared_data_ref);
946 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
947#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
948 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
949 struct btrfs_extent_ref_v0 *ref0;
950 ref0 = btrfs_item_ptr(leaf, path->slots[0],
951 struct btrfs_extent_ref_v0);
952 num_refs = btrfs_ref_count_v0(leaf, ref0);
953#endif
954 } else {
955 BUG();
956 }
957
637 BUG_ON(num_refs < refs_to_drop); 958 BUG_ON(num_refs < refs_to_drop);
638 num_refs -= refs_to_drop; 959 num_refs -= refs_to_drop;
960
639 if (num_refs == 0) { 961 if (num_refs == 0) {
640 ret = btrfs_del_item(trans, root, path); 962 ret = btrfs_del_item(trans, root, path);
641 } else { 963 } else {
642 btrfs_set_ref_num_refs(leaf, ref, num_refs); 964 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
965 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
966 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
967 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
968#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
969 else {
970 struct btrfs_extent_ref_v0 *ref0;
971 ref0 = btrfs_item_ptr(leaf, path->slots[0],
972 struct btrfs_extent_ref_v0);
973 btrfs_set_ref_count_v0(leaf, ref0, num_refs);
974 }
975#endif
643 btrfs_mark_buffer_dirty(leaf); 976 btrfs_mark_buffer_dirty(leaf);
644 } 977 }
978 return ret;
979}
980
981static noinline u32 extent_data_ref_count(struct btrfs_root *root,
982 struct btrfs_path *path,
983 struct btrfs_extent_inline_ref *iref)
984{
985 struct btrfs_key key;
986 struct extent_buffer *leaf;
987 struct btrfs_extent_data_ref *ref1;
988 struct btrfs_shared_data_ref *ref2;
989 u32 num_refs = 0;
990
991 leaf = path->nodes[0];
992 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
993 if (iref) {
994 if (btrfs_extent_inline_ref_type(leaf, iref) ==
995 BTRFS_EXTENT_DATA_REF_KEY) {
996 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
997 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
998 } else {
999 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1000 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1001 }
1002 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1003 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1004 struct btrfs_extent_data_ref);
1005 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1006 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1007 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1008 struct btrfs_shared_data_ref);
1009 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1010#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1011 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1012 struct btrfs_extent_ref_v0 *ref0;
1013 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1014 struct btrfs_extent_ref_v0);
1015 num_refs = btrfs_ref_count_v0(leaf, ref0);
1016#endif
1017 } else {
1018 WARN_ON(1);
1019 }
1020 return num_refs;
1021}
1022
1023static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1024 struct btrfs_root *root,
1025 struct btrfs_path *path,
1026 u64 bytenr, u64 parent,
1027 u64 root_objectid)
1028{
1029 struct btrfs_key key;
1030 int ret;
1031
1032 key.objectid = bytenr;
1033 if (parent) {
1034 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1035 key.offset = parent;
1036 } else {
1037 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1038 key.offset = root_objectid;
1039 }
1040
1041 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1042 if (ret > 0)
1043 ret = -ENOENT;
1044#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1045 if (ret == -ENOENT && parent) {
1046 btrfs_release_path(root, path);
1047 key.type = BTRFS_EXTENT_REF_V0_KEY;
1048 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1049 if (ret > 0)
1050 ret = -ENOENT;
1051 }
1052#endif
1053 return ret;
1054}
1055
1056static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1057 struct btrfs_root *root,
1058 struct btrfs_path *path,
1059 u64 bytenr, u64 parent,
1060 u64 root_objectid)
1061{
1062 struct btrfs_key key;
1063 int ret;
1064
1065 key.objectid = bytenr;
1066 if (parent) {
1067 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1068 key.offset = parent;
1069 } else {
1070 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1071 key.offset = root_objectid;
1072 }
1073
1074 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
645 btrfs_release_path(root, path); 1075 btrfs_release_path(root, path);
646 return ret; 1076 return ret;
647} 1077}
648 1078
1079static inline int extent_ref_type(u64 parent, u64 owner)
1080{
1081 int type;
1082 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1083 if (parent > 0)
1084 type = BTRFS_SHARED_BLOCK_REF_KEY;
1085 else
1086 type = BTRFS_TREE_BLOCK_REF_KEY;
1087 } else {
1088 if (parent > 0)
1089 type = BTRFS_SHARED_DATA_REF_KEY;
1090 else
1091 type = BTRFS_EXTENT_DATA_REF_KEY;
1092 }
1093 return type;
1094}
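Spelled out, extent_ref_type() is this decision table (owner below BTRFS_FIRST_FREE_OBJECTID means a tree block, since owner then carries the block's level):

	owner < FIRST_FREE	parent != 0	resulting key type
	yes			yes		BTRFS_SHARED_BLOCK_REF_KEY
	yes			no		BTRFS_TREE_BLOCK_REF_KEY
	no			yes		BTRFS_SHARED_DATA_REF_KEY
	no			no		BTRFS_EXTENT_DATA_REF_KEY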
1095
1096static int find_next_key(struct btrfs_path *path, int level,
1097 struct btrfs_key *key)
1098
1099{
1100 for (; level < BTRFS_MAX_LEVEL; level++) {
1101 if (!path->nodes[level])
1102 break;
1103 if (path->slots[level] + 1 >=
1104 btrfs_header_nritems(path->nodes[level]))
1105 continue;
1106 if (level == 0)
1107 btrfs_item_key_to_cpu(path->nodes[level], key,
1108 path->slots[level] + 1);
1109 else
1110 btrfs_node_key_to_cpu(path->nodes[level], key,
1111 path->slots[level] + 1);
1112 return 0;
1113 }
1114 return 1;
1115}
1116
1117/*
1118 * Look for an inline back ref. If the back ref is found, *ref_ret is set
1119 * to the address of the inline back ref, and 0 is returned.
1120 *
1121 * If the back ref isn't found, *ref_ret is set to the address where it
1122 * should be inserted, and -ENOENT is returned.
1123 *
1124 * If insert is true and there are too many inline back refs, the path
1125 * points to the extent item, and -EAGAIN is returned.
1126 *
1127 * NOTE: inline back refs are ordered in the same way that back ref
1128 * items in the tree are ordered.
1129 */
1130static noinline_for_stack
1131int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1132 struct btrfs_root *root,
1133 struct btrfs_path *path,
1134 struct btrfs_extent_inline_ref **ref_ret,
1135 u64 bytenr, u64 num_bytes,
1136 u64 parent, u64 root_objectid,
1137 u64 owner, u64 offset, int insert)
1138{
1139 struct btrfs_key key;
1140 struct extent_buffer *leaf;
1141 struct btrfs_extent_item *ei;
1142 struct btrfs_extent_inline_ref *iref;
1143 u64 flags;
1144 u64 item_size;
1145 unsigned long ptr;
1146 unsigned long end;
1147 int extra_size;
1148 int type;
1149 int want;
1150 int ret;
1151 int err = 0;
1152
1153 key.objectid = bytenr;
1154 key.type = BTRFS_EXTENT_ITEM_KEY;
1155 key.offset = num_bytes;
1156
1157 want = extent_ref_type(parent, owner);
1158 if (insert) {
1159 extra_size = btrfs_extent_inline_ref_size(want);
1160 path->keep_locks = 1;
1161 } else
1162 extra_size = -1;
1163 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1164 if (ret < 0) {
1165 err = ret;
1166 goto out;
1167 }
1168 BUG_ON(ret);
1169
1170 leaf = path->nodes[0];
1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1172#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1173 if (item_size < sizeof(*ei)) {
1174 if (!insert) {
1175 err = -ENOENT;
1176 goto out;
1177 }
1178 ret = convert_extent_item_v0(trans, root, path, owner,
1179 extra_size);
1180 if (ret < 0) {
1181 err = ret;
1182 goto out;
1183 }
1184 leaf = path->nodes[0];
1185 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1186 }
1187#endif
1188 BUG_ON(item_size < sizeof(*ei));
1189
1190 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1191 flags = btrfs_extent_flags(leaf, ei);
1192
1193 ptr = (unsigned long)(ei + 1);
1194 end = (unsigned long)ei + item_size;
1195
1196 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1197 ptr += sizeof(struct btrfs_tree_block_info);
1198 BUG_ON(ptr > end);
1199 } else {
1200 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1201 }
1202
1203 err = -ENOENT;
1204 while (1) {
1205 if (ptr >= end) {
1206 WARN_ON(ptr > end);
1207 break;
1208 }
1209 iref = (struct btrfs_extent_inline_ref *)ptr;
1210 type = btrfs_extent_inline_ref_type(leaf, iref);
1211 if (want < type)
1212 break;
1213 if (want > type) {
1214 ptr += btrfs_extent_inline_ref_size(type);
1215 continue;
1216 }
1217
1218 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1219 struct btrfs_extent_data_ref *dref;
1220 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1221 if (match_extent_data_ref(leaf, dref, root_objectid,
1222 owner, offset)) {
1223 err = 0;
1224 break;
1225 }
1226 if (hash_extent_data_ref_item(leaf, dref) <
1227 hash_extent_data_ref(root_objectid, owner, offset))
1228 break;
1229 } else {
1230 u64 ref_offset;
1231 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1232 if (parent > 0) {
1233 if (parent == ref_offset) {
1234 err = 0;
1235 break;
1236 }
1237 if (ref_offset < parent)
1238 break;
1239 } else {
1240 if (root_objectid == ref_offset) {
1241 err = 0;
1242 break;
1243 }
1244 if (ref_offset < root_objectid)
1245 break;
1246 }
1247 }
1248 ptr += btrfs_extent_inline_ref_size(type);
1249 }
1250 if (err == -ENOENT && insert) {
1251 if (item_size + extra_size >=
1252 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1253 err = -EAGAIN;
1254 goto out;
1255 }
1256 /*
1257 * To add a new inline back ref, we have to make sure
1258 * there is no corresponding back ref item.
1259 * For simplicity, we just do not add a new inline back
1260 * ref if there is any kind of item for this block.
1261 */
1262 if (find_next_key(path, 0, &key) == 0 &&
1263 key.objectid == bytenr &&
1264 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1265 err = -EAGAIN;
1266 goto out;
1267 }
1268 }
1269 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1270out:
1271 if (insert) {
1272 path->keep_locks = 0;
1273 btrfs_unlock_up_safe(path, 1);
1274 }
1275 return err;
1276}
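A hedged sketch of how callers are expected to dispatch on the three return values documented above (compare insert_inline_extent_backref below, which handles 0 and -ENOENT, while its callers treat -EAGAIN as "fall back to a separate keyed back-ref item"):

ret = lookup_inline_extent_backref(trans, root, path, &iref, bytenr,
				   num_bytes, parent, root_objectid,
				   owner, offset, 1);
if (ret == 0) {
	/* found: bump the refcount inside the existing inline ref */
	ret = update_inline_extent_backref(trans, root, path, iref,
					   refs_to_add, extent_op);
} else if (ret == -ENOENT) {
	/* not found: iref points at the insertion position */
	ret = setup_inline_extent_backref(trans, root, path, iref,
					  parent, root_objectid, owner,
					  offset, refs_to_add, extent_op);
} else if (ret == -EAGAIN) {
	/* extent item full: insert a standalone back-ref item instead */
}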
1277
1278/*
1279 * helper to add new inline back ref
1280 */
1281static noinline_for_stack
1282int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1283 struct btrfs_root *root,
1284 struct btrfs_path *path,
1285 struct btrfs_extent_inline_ref *iref,
1286 u64 parent, u64 root_objectid,
1287 u64 owner, u64 offset, int refs_to_add,
1288 struct btrfs_delayed_extent_op *extent_op)
1289{
1290 struct extent_buffer *leaf;
1291 struct btrfs_extent_item *ei;
1292 unsigned long ptr;
1293 unsigned long end;
1294 unsigned long item_offset;
1295 u64 refs;
1296 int size;
1297 int type;
1298 int ret;
1299
1300 leaf = path->nodes[0];
1301 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1302 item_offset = (unsigned long)iref - (unsigned long)ei;
1303
1304 type = extent_ref_type(parent, owner);
1305 size = btrfs_extent_inline_ref_size(type);
1306
1307 ret = btrfs_extend_item(trans, root, path, size);
1308 BUG_ON(ret);
1309
1310 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1311 refs = btrfs_extent_refs(leaf, ei);
1312 refs += refs_to_add;
1313 btrfs_set_extent_refs(leaf, ei, refs);
1314 if (extent_op)
1315 __run_delayed_extent_op(extent_op, leaf, ei);
1316
1317 ptr = (unsigned long)ei + item_offset;
1318 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1319 if (ptr < end - size)
1320 memmove_extent_buffer(leaf, ptr + size, ptr,
1321 end - size - ptr);
1322
1323 iref = (struct btrfs_extent_inline_ref *)ptr;
1324 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1325 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1326 struct btrfs_extent_data_ref *dref;
1327 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1328 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1329 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1330 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1331 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1332 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1333 struct btrfs_shared_data_ref *sref;
1334 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1335 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1336 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1337 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1338 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1339 } else {
1340 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1341 }
1342 btrfs_mark_buffer_dirty(leaf);
1343 return 0;
1344}
1345
1346static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1347 struct btrfs_root *root,
1348 struct btrfs_path *path,
1349 struct btrfs_extent_inline_ref **ref_ret,
1350 u64 bytenr, u64 num_bytes, u64 parent,
1351 u64 root_objectid, u64 owner, u64 offset)
1352{
1353 int ret;
1354
1355 ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1356 bytenr, num_bytes, parent,
1357 root_objectid, owner, offset, 0);
1358 if (ret != -ENOENT)
1359 return ret;
1360
1361 btrfs_release_path(root, path);
1362 *ref_ret = NULL;
1363
1364 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1365 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1366 root_objectid);
1367 } else {
1368 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1369 root_objectid, owner, offset);
1370 }
1371 return ret;
1372}
1373
1374/*
1375 * helper to update/remove inline back ref
1376 */
1377static noinline_for_stack
1378int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1379 struct btrfs_root *root,
1380 struct btrfs_path *path,
1381 struct btrfs_extent_inline_ref *iref,
1382 int refs_to_mod,
1383 struct btrfs_delayed_extent_op *extent_op)
1384{
1385 struct extent_buffer *leaf;
1386 struct btrfs_extent_item *ei;
1387 struct btrfs_extent_data_ref *dref = NULL;
1388 struct btrfs_shared_data_ref *sref = NULL;
1389 unsigned long ptr;
1390 unsigned long end;
1391 u32 item_size;
1392 int size;
1393 int type;
1394 int ret;
1395 u64 refs;
1396
1397 leaf = path->nodes[0];
1398 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1399 refs = btrfs_extent_refs(leaf, ei);
1400 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1401 refs += refs_to_mod;
1402 btrfs_set_extent_refs(leaf, ei, refs);
1403 if (extent_op)
1404 __run_delayed_extent_op(extent_op, leaf, ei);
1405
1406 type = btrfs_extent_inline_ref_type(leaf, iref);
1407
1408 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1409 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1410 refs = btrfs_extent_data_ref_count(leaf, dref);
1411 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1412 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1413 refs = btrfs_shared_data_ref_count(leaf, sref);
1414 } else {
1415 refs = 1;
1416 BUG_ON(refs_to_mod != -1);
1417 }
1418
1419 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1420 refs += refs_to_mod;
1421
1422 if (refs > 0) {
1423 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1424 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1425 else
1426 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1427 } else {
1428 size = btrfs_extent_inline_ref_size(type);
1429 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1430 ptr = (unsigned long)iref;
1431 end = (unsigned long)ei + item_size;
1432 if (ptr + size < end)
1433 memmove_extent_buffer(leaf, ptr, ptr + size,
1434 end - ptr - size);
1435 item_size -= size;
1436 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1437 BUG_ON(ret);
1438 }
1439 btrfs_mark_buffer_dirty(leaf);
1440 return 0;
1441}
1442
1443static noinline_for_stack
1444int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1445 struct btrfs_root *root,
1446 struct btrfs_path *path,
1447 u64 bytenr, u64 num_bytes, u64 parent,
1448 u64 root_objectid, u64 owner,
1449 u64 offset, int refs_to_add,
1450 struct btrfs_delayed_extent_op *extent_op)
1451{
1452 struct btrfs_extent_inline_ref *iref;
1453 int ret;
1454
1455 ret = lookup_inline_extent_backref(trans, root, path, &iref,
1456 bytenr, num_bytes, parent,
1457 root_objectid, owner, offset, 1);
1458 if (ret == 0) {
1459 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1460 ret = update_inline_extent_backref(trans, root, path, iref,
1461 refs_to_add, extent_op);
1462 } else if (ret == -ENOENT) {
1463 ret = setup_inline_extent_backref(trans, root, path, iref,
1464 parent, root_objectid,
1465 owner, offset, refs_to_add,
1466 extent_op);
1467 }
1468 return ret;
1469}
1470
1471static int insert_extent_backref(struct btrfs_trans_handle *trans,
1472 struct btrfs_root *root,
1473 struct btrfs_path *path,
1474 u64 bytenr, u64 parent, u64 root_objectid,
1475 u64 owner, u64 offset, int refs_to_add)
1476{
1477 int ret;
1478 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1479 BUG_ON(refs_to_add != 1);
1480 ret = insert_tree_block_ref(trans, root, path, bytenr,
1481 parent, root_objectid);
1482 } else {
1483 ret = insert_extent_data_ref(trans, root, path, bytenr,
1484 parent, root_objectid,
1485 owner, offset, refs_to_add);
1486 }
1487 return ret;
1488}
1489
1490static int remove_extent_backref(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root,
1492 struct btrfs_path *path,
1493 struct btrfs_extent_inline_ref *iref,
1494 int refs_to_drop, int is_data)
1495{
1496 int ret;
1497
1498 BUG_ON(!is_data && refs_to_drop != 1);
1499 if (iref) {
1500 ret = update_inline_extent_backref(trans, root, path, iref,
1501 -refs_to_drop, NULL);
1502 } else if (is_data) {
1503 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1504 } else {
1505 ret = btrfs_del_item(trans, root, path);
1506 }
1507 return ret;
1508}
1509
649#ifdef BIO_RW_DISCARD 1510#ifdef BIO_RW_DISCARD
650static void btrfs_issue_discard(struct block_device *bdev, 1511static void btrfs_issue_discard(struct block_device *bdev,
651 u64 start, u64 len) 1512 u64 start, u64 len)
@@ -686,71 +1547,40 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
686#endif 1547#endif
687} 1548}
688 1549
689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1550int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
690 struct btrfs_root *root, u64 bytenr, 1551 struct btrfs_root *root,
691 u64 num_bytes, 1552 u64 bytenr, u64 num_bytes, u64 parent,
692 u64 orig_parent, u64 parent, 1553 u64 root_objectid, u64 owner, u64 offset)
693 u64 orig_root, u64 ref_root,
694 u64 orig_generation, u64 ref_generation,
695 u64 owner_objectid)
696{ 1554{
697 int ret; 1555 int ret;
698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; 1556 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1557 root_objectid == BTRFS_TREE_LOG_OBJECTID);
699 1558
700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, 1559 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
701 orig_parent, parent, orig_root, 1560 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
702 ref_root, orig_generation, 1561 parent, root_objectid, (int)owner,
703 ref_generation, owner_objectid, pin); 1562 BTRFS_ADD_DELAYED_REF, NULL);
704 BUG_ON(ret); 1563 } else {
1564 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1565 parent, root_objectid, owner, offset,
1566 BTRFS_ADD_DELAYED_REF, NULL);
1567 }
705 return ret; 1568 return ret;
706} 1569}
707 1570
708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
709 struct btrfs_root *root, u64 bytenr,
710 u64 num_bytes, u64 orig_parent, u64 parent,
711 u64 ref_root, u64 ref_generation,
712 u64 owner_objectid)
713{
714 int ret;
715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
717 return 0;
718
719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
720 orig_parent, parent, ref_root,
721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
723 return ret;
724}
725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1571static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
726 struct btrfs_root *root, u64 bytenr, 1572 struct btrfs_root *root,
727 u64 num_bytes, 1573 u64 bytenr, u64 num_bytes,
728 u64 orig_parent, u64 parent, 1574 u64 parent, u64 root_objectid,
729 u64 orig_root, u64 ref_root, 1575 u64 owner, u64 offset, int refs_to_add,
730 u64 orig_generation, u64 ref_generation, 1576 struct btrfs_delayed_extent_op *extent_op)
731 u64 owner_objectid)
732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{ 1577{
748 struct btrfs_path *path; 1578 struct btrfs_path *path;
749 int ret; 1579 struct extent_buffer *leaf;
750 struct btrfs_key key;
751 struct extent_buffer *l;
752 struct btrfs_extent_item *item; 1580 struct btrfs_extent_item *item;
753 u32 refs; 1581 u64 refs;
1582 int ret;
1583 int err = 0;
754 1584
755 path = btrfs_alloc_path(); 1585 path = btrfs_alloc_path();
756 if (!path) 1586 if (!path)
@@ -758,43 +1588,27 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
758 1588
759 path->reada = 1; 1589 path->reada = 1;
760 path->leave_spinning = 1; 1590 path->leave_spinning = 1;
761 key.objectid = bytenr; 1591 /* this will set up the path even if it fails to insert the back ref */
762 key.type = BTRFS_EXTENT_ITEM_KEY; 1592 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
763 key.offset = num_bytes; 1593 path, bytenr, num_bytes, parent,
764 1594 root_objectid, owner, offset,
765 /* first find the extent item and update its reference count */ 1595 refs_to_add, extent_op);
766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 1596 if (ret == 0)
767 path, 0, 1); 1597 goto out;
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
770 return ret;
771 }
772
773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
778 l = path->nodes[0];
779 1598
780 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 1599 if (ret != -EAGAIN) {
781 if (key.objectid != bytenr) { 1600 err = ret;
782 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); 1601 goto out;
783 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
784 (unsigned long long)bytenr,
785 (unsigned long long)key.objectid);
786 BUG();
787 } 1602 }
788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
789
790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791 1603
792 refs = btrfs_extent_refs(l, item); 1604 leaf = path->nodes[0];
793 btrfs_set_extent_refs(l, item, refs + refs_to_add); 1605 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
794 btrfs_unlock_up_safe(path, 1); 1606 refs = btrfs_extent_refs(leaf, item);
795 1607 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
796 btrfs_mark_buffer_dirty(path->nodes[0]); 1608 if (extent_op)
1609 __run_delayed_extent_op(extent_op, leaf, item);
797 1610
1611 btrfs_mark_buffer_dirty(leaf);
798 btrfs_release_path(root->fs_info->extent_root, path); 1612 btrfs_release_path(root->fs_info->extent_root, path);
799 1613
800 path->reada = 1; 1614 path->reada = 1;
@@ -802,56 +1616,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
802 1616
803 /* now insert the actual backref */ 1617 /* now insert the actual backref */
804 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1618 ret = insert_extent_backref(trans, root->fs_info->extent_root,
805 path, bytenr, parent, 1619 path, bytenr, parent, root_objectid,
806 ref_root, ref_generation, 1620 owner, offset, refs_to_add);
807 owner_objectid, refs_to_add);
808 BUG_ON(ret); 1621 BUG_ON(ret);
1622out:
809 btrfs_free_path(path); 1623 btrfs_free_path(path);
810 return 0; 1624 return err;
811} 1625}
812 1626
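The new __btrfs_inc_extent_ref() above leans on a contract with insert_inline_extent_backref(): 0 means the reference was folded into the extent item and the job is done; -EAGAIN means it did not fit inline but the path is now positioned on the extent item, so the caller bumps the refcount there and then inserts a standalone backref item. A sketch of that try-inline-then-fallback shape, with a hypothetical try_insert_inline() in place of the real helper:

    #include <errno.h>
    #include <stdio.h>

    /* 0: the ref fit inline; -EAGAIN: caller must bump the refcount and
     * add a standalone backref item (models insert_inline_extent_backref) */
    static int try_insert_inline(int room_left)
    {
            return room_left > 0 ? 0 : -EAGAIN;
    }

    static int add_ref(int room_left, unsigned *refs)
    {
            int ret = try_insert_inline(room_left);
            if (ret == 0)
                    return 0;       /* handled entirely inline */
            if (ret != -EAGAIN)
                    return ret;     /* hard error */
            *refs += 1;             /* bump the count on the extent item... */
            printf("insert standalone backref, refs now %u\n", *refs);
            return 0;               /* ...then insert the separate item */
    }

    int main(void)
    {
            unsigned refs = 1;
            return add_ref(0, &refs);   /* no inline room: fallback path */
    }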
813int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1627static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
814 struct btrfs_root *root, 1628 struct btrfs_root *root,
815 u64 bytenr, u64 num_bytes, u64 parent, 1629 struct btrfs_delayed_ref_node *node,
816 u64 ref_root, u64 ref_generation, 1630 struct btrfs_delayed_extent_op *extent_op,
817 u64 owner_objectid) 1631 int insert_reserved)
818{ 1632{
819 int ret; 1633 int ret = 0;
820 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 1634 struct btrfs_delayed_data_ref *ref;
821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 1635 struct btrfs_key ins;
822 return 0; 1636 u64 parent = 0;
1637 u64 ref_root = 0;
1638 u64 flags = 0;
823 1639
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, 1640 ins.objectid = node->bytenr;
825 0, ref_root, 0, ref_generation, 1641 ins.offset = node->num_bytes;
826 owner_objectid); 1642 ins.type = BTRFS_EXTENT_ITEM_KEY;
1643
1644 ref = btrfs_delayed_node_to_data_ref(node);
1645 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1646 parent = ref->parent;
1647 else
1648 ref_root = ref->root;
1649
1650 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1651 if (extent_op) {
1652 BUG_ON(extent_op->update_key);
1653 flags |= extent_op->flags_to_set;
1654 }
1655 ret = alloc_reserved_file_extent(trans, root,
1656 parent, ref_root, flags,
1657 ref->objectid, ref->offset,
1658 &ins, node->ref_mod);
1659 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1660 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1661 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1662 node->num_bytes, parent,
1663 ref_root, ref->objectid,
1664 ref->offset, node->ref_mod,
1665 extent_op);
1666 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1667 ret = __btrfs_free_extent(trans, root, node->bytenr,
1668 node->num_bytes, parent,
1669 ref_root, ref->objectid,
1670 ref->offset, node->ref_mod,
1671 extent_op);
1672 } else {
1673 BUG();
1674 }
827 return ret; 1675 return ret;
828} 1676}
829 1677
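run_delayed_data_ref() above reduces to a three-way decision table: a first-time insertion of a reserved extent, a plain reference bump, or a reference drop. The sketch below mirrors just that table; the handlers are print placeholders for alloc_reserved_file_extent(), __btrfs_inc_extent_ref() and __btrfs_free_extent():

    #include <stdio.h>

    enum action { ADD_DELAYED_REF, DROP_DELAYED_REF };

    static void run_data_ref(enum action a, int insert_reserved)
    {
            if (a == ADD_DELAYED_REF && insert_reserved)
                    puts("alloc_reserved_file_extent: create the extent item");
            else if (a == ADD_DELAYED_REF)
                    puts("__btrfs_inc_extent_ref: add one more reference");
            else if (a == DROP_DELAYED_REF)
                    puts("__btrfs_free_extent: drop one reference");
            else
                    puts("BUG: unknown action");
    }

    int main(void)
    {
            run_data_ref(ADD_DELAYED_REF, 1);   /* newly reserved extent */
            run_data_ref(ADD_DELAYED_REF, 0);   /* existing extent */
            run_data_ref(DROP_DELAYED_REF, 0);
            return 0;
    }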
830static int drop_delayed_ref(struct btrfs_trans_handle *trans, 1678static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
831 struct btrfs_root *root, 1679 struct extent_buffer *leaf,
832 struct btrfs_delayed_ref_node *node) 1680 struct btrfs_extent_item *ei)
1681{
1682 u64 flags = btrfs_extent_flags(leaf, ei);
1683 if (extent_op->update_flags) {
1684 flags |= extent_op->flags_to_set;
1685 btrfs_set_extent_flags(leaf, ei, flags);
1686 }
1687
1688 if (extent_op->update_key) {
1689 struct btrfs_tree_block_info *bi;
1690 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1691 bi = (struct btrfs_tree_block_info *)(ei + 1);
1692 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1693 }
1694}
1695
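__run_delayed_extent_op() applies at most two independent edits carried by the op: OR extra flags into the extent item, and/or rewrite the key stored in the tree-block info that follows it. A compact model of that record and its additive flag semantics (field names follow btrfs_delayed_extent_op, but the struct here is a simplification):

    #include <stdint.h>
    #include <stdio.h>

    struct extent_op {                 /* models btrfs_delayed_extent_op */
            uint64_t flags_to_set;
            unsigned update_flags:1;
            unsigned update_key:1;
    };

    static void run_op(const struct extent_op *op, uint64_t *flags)
    {
            if (op->update_flags)
                    *flags |= op->flags_to_set;  /* additive, never clears */
            if (op->update_key)
                    puts("rewrite the tree_block_info key");
    }

    int main(void)
    {
            uint64_t flags = 0x1;
            struct extent_op op = { .flags_to_set = 0x2, .update_flags = 1 };
            run_op(&op, &flags);
            printf("flags=0x%llx\n", (unsigned long long)flags);  /* 0x3 */
            return 0;
    }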
1696static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_delayed_ref_node *node,
1699 struct btrfs_delayed_extent_op *extent_op)
1700{
1701 struct btrfs_key key;
1702 struct btrfs_path *path;
1703 struct btrfs_extent_item *ei;
1704 struct extent_buffer *leaf;
1705 u32 item_size;
1706 int ret;
1707 int err = 0;
1708
1709 path = btrfs_alloc_path();
1710 if (!path)
1711 return -ENOMEM;
1712
1713 key.objectid = node->bytenr;
1714 key.type = BTRFS_EXTENT_ITEM_KEY;
1715 key.offset = node->num_bytes;
1716
1717 path->reada = 1;
1718 path->leave_spinning = 1;
1719 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1720 path, 0, 1);
1721 if (ret < 0) {
1722 err = ret;
1723 goto out;
1724 }
1725 if (ret > 0) {
1726 err = -EIO;
1727 goto out;
1728 }
1729
1730 leaf = path->nodes[0];
1731 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1732#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1733 if (item_size < sizeof(*ei)) {
1734 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1735 path, (u64)-1, 0);
1736 if (ret < 0) {
1737 err = ret;
1738 goto out;
1739 }
1740 leaf = path->nodes[0];
1741 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1742 }
1743#endif
1744 BUG_ON(item_size < sizeof(*ei));
1745 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1746 __run_delayed_extent_op(extent_op, leaf, ei);
1747
1748 btrfs_mark_buffer_dirty(leaf);
1749out:
1750 btrfs_free_path(path);
1751 return err;
1752}
1753
1754static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1755 struct btrfs_root *root,
1756 struct btrfs_delayed_ref_node *node,
1757 struct btrfs_delayed_extent_op *extent_op,
1758 int insert_reserved)
833{ 1759{
834 int ret = 0; 1760 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); 1761 struct btrfs_delayed_tree_ref *ref;
1762 struct btrfs_key ins;
1763 u64 parent = 0;
1764 u64 ref_root = 0;
836 1765
837 BUG_ON(node->ref_mod == 0); 1766 ins.objectid = node->bytenr;
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, 1767 ins.offset = node->num_bytes;
839 node->parent, ref->root, ref->generation, 1768 ins.type = BTRFS_EXTENT_ITEM_KEY;
840 ref->owner_objectid, ref->pin, node->ref_mod);
841 1769
1770 ref = btrfs_delayed_node_to_tree_ref(node);
1771 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1772 parent = ref->parent;
1773 else
1774 ref_root = ref->root;
1775
1776 BUG_ON(node->ref_mod != 1);
1777 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1778 BUG_ON(!extent_op || !extent_op->update_flags ||
1779 !extent_op->update_key);
1780 ret = alloc_reserved_tree_block(trans, root,
1781 parent, ref_root,
1782 extent_op->flags_to_set,
1783 &extent_op->key,
1784 ref->level, &ins);
1785 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1786 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1787 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1788 node->num_bytes, parent, ref_root,
1789 ref->level, 0, 1, extent_op);
1790 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1791 ret = __btrfs_free_extent(trans, root, node->bytenr,
1792 node->num_bytes, parent, ref_root,
1793 ref->level, 0, 1, extent_op);
1794 } else {
1795 BUG();
1796 }
842 return ret; 1797 return ret;
843} 1798}
844 1799
1800
845/* helper function to actually process a single delayed ref entry */ 1801/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1802static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root, 1803 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node, 1804 struct btrfs_delayed_ref_node *node,
849 int insert_reserved) 1805 struct btrfs_delayed_extent_op *extent_op,
1806 int insert_reserved)
850{ 1807{
851 int ret; 1808 int ret;
852 struct btrfs_delayed_ref *ref; 1809 if (btrfs_delayed_ref_is_head(node)) {
853
854 if (node->parent == (u64)-1) {
855 struct btrfs_delayed_ref_head *head; 1810 struct btrfs_delayed_ref_head *head;
856 /* 1811 /*
857 * we've hit the end of the chain and we were supposed 1812 * we've hit the end of the chain and we were supposed
@@ -859,44 +1814,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
859 * deleted before we ever needed to insert it, so all 1814 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting 1815 * we have to do is clean up the accounting
861 */ 1816 */
1817 BUG_ON(extent_op);
1818 head = btrfs_delayed_node_to_head(node);
862 if (insert_reserved) { 1819 if (insert_reserved) {
1820 if (head->is_data) {
1821 ret = btrfs_del_csums(trans, root,
1822 node->bytenr,
1823 node->num_bytes);
1824 BUG_ON(ret);
1825 }
1826 btrfs_update_pinned_extents(root, node->bytenr,
1827 node->num_bytes, 1);
863 update_reserved_extents(root, node->bytenr, 1828 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0); 1829 node->num_bytes, 0);
865 } 1830 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex); 1831 mutex_unlock(&head->mutex);
868 return 0; 1832 return 0;
869 } 1833 }
870 1834
871 ref = btrfs_delayed_node_to_ref(node); 1835 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
872 if (ref->action == BTRFS_ADD_DELAYED_REF) { 1836 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
873 if (insert_reserved) { 1837 ret = run_delayed_tree_ref(trans, root, node, extent_op,
874 struct btrfs_key ins; 1838 insert_reserved);
875 1839 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
876 ins.objectid = node->bytenr; 1840 node->type == BTRFS_SHARED_DATA_REF_KEY)
877 ins.offset = node->num_bytes; 1841 ret = run_delayed_data_ref(trans, root, node, extent_op,
878 ins.type = BTRFS_EXTENT_ITEM_KEY; 1842 insert_reserved);
879 1843 else
880 /* record the full extent allocation */ 1844 BUG();
881 ret = __btrfs_alloc_reserved_extent(trans, root, 1845 return ret;
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
898 }
899 return 0;
900} 1846}
901 1847
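After this rewrite, run_one_delayed_ref() has exactly three exits: ref heads (end-of-chain accounting cleanup), tree-block refs, and data refs, selected by the node's key type. Schematically, assuming the four key types partition into metadata and data pairs as the kernel constants do:

    #include <stdio.h>

    enum key_type {
            TREE_BLOCK_REF, SHARED_BLOCK_REF,   /* metadata refs */
            EXTENT_DATA_REF, SHARED_DATA_REF,   /* data refs */
    };

    static void run_one(int is_head, enum key_type t)
    {
            if (is_head) {
                    puts("head: clean up accounting, unlock, done");
                    return;
            }
            if (t == TREE_BLOCK_REF || t == SHARED_BLOCK_REF)
                    puts("run_delayed_tree_ref");
            else if (t == EXTENT_DATA_REF || t == SHARED_DATA_REF)
                    puts("run_delayed_data_ref");
            else
                    puts("BUG");
    }

    int main(void)
    {
            run_one(1, TREE_BLOCK_REF);
            run_one(0, EXTENT_DATA_REF);
            return 0;
    }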
902static noinline struct btrfs_delayed_ref_node * 1848static noinline struct btrfs_delayed_ref_node *
@@ -919,7 +1865,7 @@ again:
919 rb_node); 1865 rb_node);
920 if (ref->bytenr != head->node.bytenr) 1866 if (ref->bytenr != head->node.bytenr)
921 break; 1867 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action) 1868 if (ref->action == action)
923 return ref; 1869 return ref;
924 node = rb_prev(node); 1870 node = rb_prev(node);
925 } 1871 }
@@ -937,6 +1883,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
937 struct btrfs_delayed_ref_root *delayed_refs; 1883 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref; 1884 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL; 1885 struct btrfs_delayed_ref_head *locked_ref = NULL;
1886 struct btrfs_delayed_extent_op *extent_op;
940 int ret; 1887 int ret;
941 int count = 0; 1888 int count = 0;
942 int must_insert_reserved = 0; 1889 int must_insert_reserved = 0;
@@ -975,6 +1922,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
975 must_insert_reserved = locked_ref->must_insert_reserved; 1922 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0; 1923 locked_ref->must_insert_reserved = 0;
977 1924
1925 extent_op = locked_ref->extent_op;
1926 locked_ref->extent_op = NULL;
1927
978 /* 1928 /*
979 * locked_ref is the head node, so we have to go one 1929 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates 1930 * node back for any delayed ref updates
@@ -986,6 +1936,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
986 * so that any accounting fixes can happen 1936 * so that any accounting fixes can happen
987 */ 1937 */
988 ref = &locked_ref->node; 1938 ref = &locked_ref->node;
1939
1940 if (extent_op && must_insert_reserved) {
1941 kfree(extent_op);
1942 extent_op = NULL;
1943 }
1944
1945 if (extent_op) {
1946 spin_unlock(&delayed_refs->lock);
1947
1948 ret = run_delayed_extent_op(trans, root,
1949 ref, extent_op);
1950 BUG_ON(ret);
1951 kfree(extent_op);
1952
1953 cond_resched();
1954 spin_lock(&delayed_refs->lock);
1955 continue;
1956 }
1957
989 list_del_init(&locked_ref->cluster); 1958 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL; 1959 locked_ref = NULL;
991 } 1960 }
@@ -993,14 +1962,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
993 ref->in_tree = 0; 1962 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root); 1963 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--; 1964 delayed_refs->num_entries--;
1965
996 spin_unlock(&delayed_refs->lock); 1966 spin_unlock(&delayed_refs->lock);
997 1967
998 ret = run_one_delayed_ref(trans, root, ref, 1968 ret = run_one_delayed_ref(trans, root, ref, extent_op,
999 must_insert_reserved); 1969 must_insert_reserved);
1000 BUG_ON(ret); 1970 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002 1971
1972 btrfs_put_delayed_ref(ref);
1973 kfree(extent_op);
1003 count++; 1974 count++;
1975
1004 cond_resched(); 1976 cond_resched();
1005 spin_lock(&delayed_refs->lock); 1977 spin_lock(&delayed_refs->lock);
1006 } 1978 }
@@ -1095,25 +2067,112 @@ out:
1095 return 0; 2067 return 0;
1096} 2068}
1097 2069
1098int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2070int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1099 struct btrfs_root *root, u64 objectid, u64 bytenr) 2071 struct btrfs_root *root,
2072 u64 bytenr, u64 num_bytes, u64 flags,
2073 int is_data)
2074{
2075 struct btrfs_delayed_extent_op *extent_op;
2076 int ret;
2077
2078 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2079 if (!extent_op)
2080 return -ENOMEM;
2081
2082 extent_op->flags_to_set = flags;
2083 extent_op->update_flags = 1;
2084 extent_op->update_key = 0;
2085 extent_op->is_data = is_data ? 1 : 0;
2086
2087 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2088 if (ret)
2089 kfree(extent_op);
2090 return ret;
2091}
2092
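btrfs_set_disk_extent_flags() above is the standard ownership handoff for a queued operation: allocate, fill in, hand to the queue, and free only if the queue refused it. The same shape in userspace, with malloc/free and a hypothetical queue_op() standing in for btrfs_add_delayed_extent_op():

    #include <stdlib.h>
    #include <string.h>

    struct op { unsigned long flags; int update_flags; };

    /* hypothetical queue; takes ownership of 'o' when it returns 0 */
    static int queue_op(struct op *o) { (void)o; return 0; }

    static int set_flags(unsigned long flags)
    {
            struct op *o = malloc(sizeof(*o));
            if (!o)
                    return -1;          /* -ENOMEM in the kernel */
            memset(o, 0, sizeof(*o));
            o->flags = flags;
            o->update_flags = 1;
            int ret = queue_op(o);
            if (ret)
                    free(o);            /* the queue didn't take it */
            return ret;                 /* on success the consumer frees it */
    }

    int main(void)
    {
            return set_flags(0x4);
    }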
2093static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root,
2095 struct btrfs_path *path,
2096 u64 objectid, u64 offset, u64 bytenr)
2097{
2098 struct btrfs_delayed_ref_head *head;
2099 struct btrfs_delayed_ref_node *ref;
2100 struct btrfs_delayed_data_ref *data_ref;
2101 struct btrfs_delayed_ref_root *delayed_refs;
2102 struct rb_node *node;
2103 int ret = 0;
2104
2105 ret = -ENOENT;
2106 delayed_refs = &trans->transaction->delayed_refs;
2107 spin_lock(&delayed_refs->lock);
2108 head = btrfs_find_delayed_ref_head(trans, bytenr);
2109 if (!head)
2110 goto out;
2111
2112 if (!mutex_trylock(&head->mutex)) {
2113 atomic_inc(&head->node.refs);
2114 spin_unlock(&delayed_refs->lock);
2115
2116 btrfs_release_path(root->fs_info->extent_root, path);
2117
2118 mutex_lock(&head->mutex);
2119 mutex_unlock(&head->mutex);
2120 btrfs_put_delayed_ref(&head->node);
2121 return -EAGAIN;
2122 }
2123
2124 node = rb_prev(&head->node.rb_node);
2125 if (!node)
2126 goto out_unlock;
2127
2128 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2129
2130 if (ref->bytenr != bytenr)
2131 goto out_unlock;
2132
2133 ret = 1;
2134 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2135 goto out_unlock;
2136
2137 data_ref = btrfs_delayed_node_to_data_ref(ref);
2138
2139 node = rb_prev(node);
2140 if (node) {
2141 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2142 if (ref->bytenr == bytenr)
2143 goto out_unlock;
2144 }
2145
2146 if (data_ref->root != root->root_key.objectid ||
2147 data_ref->objectid != objectid || data_ref->offset != offset)
2148 goto out_unlock;
2149
2150 ret = 0;
2151out_unlock:
2152 mutex_unlock(&head->mutex);
2153out:
2154 spin_unlock(&delayed_refs->lock);
2155 return ret;
2156}
2157
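check_delayed_ref() cannot sleep on head->mutex while holding delayed_refs->lock, so on contention it pins the head, drops the spinlock, blocks on the mutex only to wait out the current holder, and returns -EAGAIN so the caller redoes the whole lookup. A pthread sketch of the same trylock-then-retry shape (simplified: no refcounting on the head object):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* 0 on a clean look, -EAGAIN when the caller must retry from scratch */
    static int check_head(void)
    {
            if (pthread_mutex_trylock(&head_mutex) != 0) {
                    /* kernel: take a ref, drop the spinlock, then block on
                     * the mutex purely to wait out the current holder */
                    pthread_mutex_lock(&head_mutex);
                    pthread_mutex_unlock(&head_mutex);
                    return -EAGAIN;     /* state may have changed */
            }
            puts("inspect the delayed refs under head_mutex");
            pthread_mutex_unlock(&head_mutex);
            return 0;
    }

    int main(void)
    {
            while (check_head() == -EAGAIN)
                    ;
            return 0;
    }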
2158static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2159 struct btrfs_root *root,
2160 struct btrfs_path *path,
2161 u64 objectid, u64 offset, u64 bytenr)
1100{ 2162{
1101 struct btrfs_root *extent_root = root->fs_info->extent_root; 2163 struct btrfs_root *extent_root = root->fs_info->extent_root;
1102 struct btrfs_path *path;
1103 struct extent_buffer *leaf; 2164 struct extent_buffer *leaf;
1104 struct btrfs_extent_ref *ref_item; 2165 struct btrfs_extent_data_ref *ref;
2166 struct btrfs_extent_inline_ref *iref;
2167 struct btrfs_extent_item *ei;
1105 struct btrfs_key key; 2168 struct btrfs_key key;
1106 struct btrfs_key found_key; 2169 u32 item_size;
1107 u64 ref_root;
1108 u64 last_snapshot;
1109 u32 nritems;
1110 int ret; 2170 int ret;
1111 2171
1112 key.objectid = bytenr; 2172 key.objectid = bytenr;
1113 key.offset = (u64)-1; 2173 key.offset = (u64)-1;
1114 key.type = BTRFS_EXTENT_ITEM_KEY; 2174 key.type = BTRFS_EXTENT_ITEM_KEY;
1115 2175
1116 path = btrfs_alloc_path();
1117 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2176 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1118 if (ret < 0) 2177 if (ret < 0)
1119 goto out; 2178 goto out;
@@ -1125,55 +2184,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1125 2184
1126 path->slots[0]--; 2185 path->slots[0]--;
1127 leaf = path->nodes[0]; 2186 leaf = path->nodes[0];
1128 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2187 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1129 2188
1130 if (found_key.objectid != bytenr || 2189 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
1131 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1132 goto out; 2190 goto out;
1133 2191
1134 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2192 ret = 1;
1135 while (1) { 2193 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1136 leaf = path->nodes[0]; 2194#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1137 nritems = btrfs_header_nritems(leaf); 2195 if (item_size < sizeof(*ei)) {
1138 if (path->slots[0] >= nritems) { 2196 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
1139 ret = btrfs_next_leaf(extent_root, path); 2197 goto out;
1140 if (ret < 0) 2198 }
1141 goto out; 2199#endif
1142 if (ret == 0) 2200 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1143 continue;
1144 break;
1145 }
1146 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1147 if (found_key.objectid != bytenr)
1148 break;
1149 2201
1150 if (found_key.type != BTRFS_EXTENT_REF_KEY) { 2202 if (item_size != sizeof(*ei) +
1151 path->slots[0]++; 2203 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
1152 continue; 2204 goto out;
1153 }
1154 2205
1155 ref_item = btrfs_item_ptr(leaf, path->slots[0], 2206 if (btrfs_extent_generation(leaf, ei) <=
1156 struct btrfs_extent_ref); 2207 btrfs_root_last_snapshot(&root->root_item))
1157 ref_root = btrfs_ref_root(leaf, ref_item); 2208 goto out;
1158 if ((ref_root != root->root_key.objectid && 2209
1159 ref_root != BTRFS_TREE_LOG_OBJECTID) || 2210 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
1160 objectid != btrfs_ref_objectid(leaf, ref_item)) { 2211 if (btrfs_extent_inline_ref_type(leaf, iref) !=
1161 ret = 1; 2212 BTRFS_EXTENT_DATA_REF_KEY)
1162 goto out; 2213 goto out;
1163 } 2214
1164 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { 2215 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
1165 ret = 1; 2216 if (btrfs_extent_refs(leaf, ei) !=
2217 btrfs_extent_data_ref_count(leaf, ref) ||
2218 btrfs_extent_data_ref_root(leaf, ref) !=
2219 root->root_key.objectid ||
2220 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2221 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2222 goto out;
2223
2224 ret = 0;
2225out:
2226 return ret;
2227}
2228
2229int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2230 struct btrfs_root *root,
2231 u64 objectid, u64 offset, u64 bytenr)
2232{
2233 struct btrfs_path *path;
2234 int ret;
2235 int ret2;
2236
2237 path = btrfs_alloc_path();
2238 if (!path)
2239 return -ENOENT;
2240
2241 do {
2242 ret = check_committed_ref(trans, root, path, objectid,
2243 offset, bytenr);
2244 if (ret && ret != -ENOENT)
1166 goto out; 2245 goto out;
1167 }
1168 2246
1169 path->slots[0]++; 2247 ret2 = check_delayed_ref(trans, root, path, objectid,
2248 offset, bytenr);
2249 } while (ret2 == -EAGAIN);
2250
2251 if (ret2 && ret2 != -ENOENT) {
2252 ret = ret2;
2253 goto out;
1170 } 2254 }
1171 ret = 0; 2255
2256 if (ret != -ENOENT || ret2 != -ENOENT)
2257 ret = 0;
1172out: 2258out:
1173 btrfs_free_path(path); 2259 btrfs_free_path(path);
1174 return ret; 2260 return ret;
1175} 2261}
1176 2262
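btrfs_cross_ref_exist() merges the two probes above: check_committed_ref() looks at the on-disk extent item, check_delayed_ref() at the in-memory queue. -ENOENT from a probe means "nothing found here", -EAGAIN reruns both, any other nonzero value short-circuits as shared-or-error, and 0 is returned only when at least one probe positively cleared the extent. A model of that result merging, with hypothetical probes wired to exercise the retry:

    #include <errno.h>
    #include <stdio.h>

    /* hypothetical probes: >0 shared, 0 only this root, -ENOENT unknown */
    static int committed_probe(void) { return -ENOENT; }
    static int delayed_probe(int *calls)
    {
            return (*calls)++ == 0 ? -EAGAIN : 0;  /* busy once, then clear */
    }

    static int cross_ref_exist(void)
    {
            int calls = 0, ret, ret2;
            do {
                    ret = committed_probe();
                    if (ret && ret != -ENOENT)
                            return ret;         /* shared, or a hard error */
                    ret2 = delayed_probe(&calls);
            } while (ret2 == -EAGAIN);          /* head was busy: redo both */

            if (ret2 && ret2 != -ENOENT)
                    return ret2;
            return (ret != -ENOENT || ret2 != -ENOENT) ? 0 : -ENOENT;
    }

    int main(void)
    {
            printf("cross_ref_exist() = %d\n", cross_ref_exist());  /* 0 */
            return 0;
    }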
2263#if 0
1177int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2264int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1178 struct extent_buffer *buf, u32 nr_extents) 2265 struct extent_buffer *buf, u32 nr_extents)
1179{ 2266{
@@ -1291,62 +2378,44 @@ static int refsort_cmp(const void *a_void, const void *b_void)
1291 return 1; 2378 return 1;
1292 return 0; 2379 return 0;
1293} 2380}
2381#endif
1294 2382
1295 2383static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
1296noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1297 struct btrfs_root *root, 2384 struct btrfs_root *root,
1298 struct extent_buffer *orig_buf, 2385 struct extent_buffer *buf,
1299 struct extent_buffer *buf, u32 *nr_extents) 2386 int full_backref, int inc)
1300{ 2387{
1301 u64 bytenr; 2388 u64 bytenr;
2389 u64 num_bytes;
2390 u64 parent;
1302 u64 ref_root; 2391 u64 ref_root;
1303 u64 orig_root;
1304 u64 ref_generation;
1305 u64 orig_generation;
1306 struct refsort *sorted;
1307 u32 nritems; 2392 u32 nritems;
1308 u32 nr_file_extents = 0;
1309 struct btrfs_key key; 2393 struct btrfs_key key;
1310 struct btrfs_file_extent_item *fi; 2394 struct btrfs_file_extent_item *fi;
1311 int i; 2395 int i;
1312 int level; 2396 int level;
1313 int ret = 0; 2397 int ret = 0;
1314 int faili = 0;
1315 int refi = 0;
1316 int slot;
1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2398 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1318 u64, u64, u64, u64, u64, u64, u64, u64, u64); 2399 u64, u64, u64, u64, u64, u64);
1319 2400
1320 ref_root = btrfs_header_owner(buf); 2401 ref_root = btrfs_header_owner(buf);
1321 ref_generation = btrfs_header_generation(buf);
1322 orig_root = btrfs_header_owner(orig_buf);
1323 orig_generation = btrfs_header_generation(orig_buf);
1324
1325 nritems = btrfs_header_nritems(buf); 2402 nritems = btrfs_header_nritems(buf);
1326 level = btrfs_header_level(buf); 2403 level = btrfs_header_level(buf);
1327 2404
1328 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); 2405 if (!root->ref_cows && level == 0)
1329 BUG_ON(!sorted); 2406 return 0;
2407
2408 if (inc)
2409 process_func = btrfs_inc_extent_ref;
2410 else
2411 process_func = btrfs_free_extent;
1330 2412
1331 if (root->ref_cows) { 2413 if (full_backref)
1332 process_func = __btrfs_inc_extent_ref; 2414 parent = buf->start;
1333 } else { 2415 else
1334 if (level == 0 && 2416 parent = 0;
1335 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1336 goto out;
1337 if (level != 0 &&
1338 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1339 goto out;
1340 process_func = __btrfs_update_extent_ref;
1341 }
1342 2417
1343 /*
1344 * we make two passes through the items. In the first pass we
1345 * only record the byte number and slot. Then we sort based on
1346 * byte number and do the actual work based on the sorted results
1347 */
1348 for (i = 0; i < nritems; i++) { 2418 for (i = 0; i < nritems; i++) {
1349 cond_resched();
1350 if (level == 0) { 2419 if (level == 0) {
1351 btrfs_item_key_to_cpu(buf, &key, i); 2420 btrfs_item_key_to_cpu(buf, &key, i);
1352 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2421 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -1360,151 +2429,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1360 if (bytenr == 0) 2429 if (bytenr == 0)
1361 continue; 2430 continue;
1362 2431
1363 nr_file_extents++; 2432 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
1364 sorted[refi].bytenr = bytenr; 2433 key.offset -= btrfs_file_extent_offset(buf, fi);
1365 sorted[refi].slot = i; 2434 ret = process_func(trans, root, bytenr, num_bytes,
1366 refi++; 2435 parent, ref_root, key.objectid,
1367 } else { 2436 key.offset);
1368 bytenr = btrfs_node_blockptr(buf, i); 2437 if (ret)
1369 sorted[refi].bytenr = bytenr;
1370 sorted[refi].slot = i;
1371 refi++;
1372 }
1373 }
1374 /*
1375 * if refi == 0, we didn't actually put anything into the sorted
1376 * array and we're done
1377 */
1378 if (refi == 0)
1379 goto out;
1380
1381 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1382
1383 for (i = 0; i < refi; i++) {
1384 cond_resched();
1385 slot = sorted[i].slot;
1386 bytenr = sorted[i].bytenr;
1387
1388 if (level == 0) {
1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1396
1397 ret = process_func(trans, root, bytenr,
1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1399 orig_buf->start, buf->start,
1400 orig_root, ref_root,
1401 orig_generation, ref_generation,
1402 key.objectid);
1403
1404 if (ret) {
1405 faili = slot;
1406 WARN_ON(1);
1407 goto fail; 2438 goto fail;
1408 }
1409 } else { 2439 } else {
1410 ret = process_func(trans, root, bytenr, buf->len, 2440 bytenr = btrfs_node_blockptr(buf, i);
1411 orig_buf->start, buf->start, 2441 num_bytes = btrfs_level_size(root, level - 1);
1412 orig_root, ref_root, 2442 ret = process_func(trans, root, bytenr, num_bytes,
1413 orig_generation, ref_generation, 2443 parent, ref_root, level - 1, 0);
1414 level - 1); 2444 if (ret)
1415 if (ret) {
1416 faili = slot;
1417 WARN_ON(1);
1418 goto fail; 2445 goto fail;
1419 }
1420 } 2446 }
1421 } 2447 }
1422out:
1423 kfree(sorted);
1424 if (nr_extents) {
1425 if (level == 0)
1426 *nr_extents = nr_file_extents;
1427 else
1428 *nr_extents = nritems;
1429 }
1430 return 0; 2448 return 0;
1431fail: 2449fail:
1432 kfree(sorted); 2450 BUG();
1433 WARN_ON(1);
1434 return ret; 2451 return ret;
1435} 2452}
1436 2453
1437int btrfs_update_ref(struct btrfs_trans_handle *trans, 2454int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1438 struct btrfs_root *root, struct extent_buffer *orig_buf, 2455 struct extent_buffer *buf, int full_backref)
1439 struct extent_buffer *buf, int start_slot, int nr)
1440
1441{ 2456{
1442 u64 bytenr; 2457 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
1443 u64 ref_root; 2458}
1444 u64 orig_root;
1445 u64 ref_generation;
1446 u64 orig_generation;
1447 struct btrfs_key key;
1448 struct btrfs_file_extent_item *fi;
1449 int i;
1450 int ret;
1451 int slot;
1452 int level;
1453
1454 BUG_ON(start_slot < 0);
1455 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1456
1457 ref_root = btrfs_header_owner(buf);
1458 ref_generation = btrfs_header_generation(buf);
1459 orig_root = btrfs_header_owner(orig_buf);
1460 orig_generation = btrfs_header_generation(orig_buf);
1461 level = btrfs_header_level(buf);
1462
1463 if (!root->ref_cows) {
1464 if (level == 0 &&
1465 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1466 return 0;
1467 if (level != 0 &&
1468 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1469 return 0;
1470 }
1471 2459
1472 for (i = 0, slot = start_slot; i < nr; i++, slot++) { 2460int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1473 cond_resched(); 2461 struct extent_buffer *buf, int full_backref)
1474 if (level == 0) { 2462{
1475 btrfs_item_key_to_cpu(buf, &key, slot); 2463 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
1476 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1477 continue;
1478 fi = btrfs_item_ptr(buf, slot,
1479 struct btrfs_file_extent_item);
1480 if (btrfs_file_extent_type(buf, fi) ==
1481 BTRFS_FILE_EXTENT_INLINE)
1482 continue;
1483 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1484 if (bytenr == 0)
1485 continue;
1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1488 orig_buf->start, buf->start,
1489 orig_root, ref_root, orig_generation,
1490 ref_generation, key.objectid);
1491 if (ret)
1492 goto fail;
1493 } else {
1494 bytenr = btrfs_node_blockptr(buf, slot);
1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1496 buf->len, orig_buf->start,
1497 buf->start, orig_root, ref_root,
1498 orig_generation, ref_generation,
1499 level - 1);
1500 if (ret)
1501 goto fail;
1502 }
1503 }
1504 return 0;
1505fail:
1506 WARN_ON(1);
1507 return -1;
1508} 2464}
1509 2465
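__btrfs_mod_ref() collapses the old inc/update walkers into one: it binds btrfs_inc_extent_ref or btrfs_free_extent to a function pointer once, up front, and applies it to every non-hole extent the buffer references. The shape in miniature, with toy handlers in place of the real ones:

    #include <stdio.h>

    static int inc_ref(unsigned long bytenr)  { printf("inc  %lu\n", bytenr); return 0; }
    static int drop_ref(unsigned long bytenr) { printf("drop %lu\n", bytenr); return 0; }

    static int mod_ref(const unsigned long *ptrs, int n, int inc)
    {
            int (*process)(unsigned long) = inc ? inc_ref : drop_ref;
            for (int i = 0; i < n; i++) {
                    if (ptrs[i] == 0)
                            continue;   /* holes and inline items are skipped */
                    if (process(ptrs[i]))
                            return -1;
            }
            return 0;
    }

    int main(void)
    {
            unsigned long ptrs[] = { 4096, 0, 8192 };
            mod_ref(ptrs, 3, 1);        /* btrfs_inc_ref() path */
            mod_ref(ptrs, 3, 0);        /* btrfs_dec_ref() path */
            return 0;
    }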
1510static int write_one_cache_group(struct btrfs_trans_handle *trans, 2466static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -1534,13 +2490,29 @@ fail:
1534 2490
1535} 2491}
1536 2492
2493static struct btrfs_block_group_cache *
2494next_block_group(struct btrfs_root *root,
2495 struct btrfs_block_group_cache *cache)
2496{
2497 struct rb_node *node;
2498 spin_lock(&root->fs_info->block_group_cache_lock);
2499 node = rb_next(&cache->cache_node);
2500 btrfs_put_block_group(cache);
2501 if (node) {
2502 cache = rb_entry(node, struct btrfs_block_group_cache,
2503 cache_node);
2504 atomic_inc(&cache->count);
2505 } else
2506 cache = NULL;
2507 spin_unlock(&root->fs_info->block_group_cache_lock);
2508 return cache;
2509}
2510
1537int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2511int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1538 struct btrfs_root *root) 2512 struct btrfs_root *root)
1539{ 2513{
1540 struct btrfs_block_group_cache *cache, *entry; 2514 struct btrfs_block_group_cache *cache;
1541 struct rb_node *n;
1542 int err = 0; 2515 int err = 0;
1543 int werr = 0;
1544 struct btrfs_path *path; 2516 struct btrfs_path *path;
1545 u64 last = 0; 2517 u64 last = 0;
1546 2518
@@ -1549,39 +2521,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1549 return -ENOMEM; 2521 return -ENOMEM;
1550 2522
1551 while (1) { 2523 while (1) {
1552 cache = NULL; 2524 if (last == 0) {
1553 spin_lock(&root->fs_info->block_group_cache_lock); 2525 err = btrfs_run_delayed_refs(trans, root,
1554 for (n = rb_first(&root->fs_info->block_group_cache_tree); 2526 (unsigned long)-1);
1555 n; n = rb_next(n)) { 2527 BUG_ON(err);
1556 entry = rb_entry(n, struct btrfs_block_group_cache,
1557 cache_node);
1558 if (entry->dirty) {
1559 cache = entry;
1560 break;
1561 }
1562 } 2528 }
1563 spin_unlock(&root->fs_info->block_group_cache_lock);
1564 2529
1565 if (!cache) 2530 cache = btrfs_lookup_first_block_group(root->fs_info, last);
1566 break; 2531 while (cache) {
2532 if (cache->dirty)
2533 break;
2534 cache = next_block_group(root, cache);
2535 }
2536 if (!cache) {
2537 if (last == 0)
2538 break;
2539 last = 0;
2540 continue;
2541 }
1567 2542
1568 cache->dirty = 0; 2543 cache->dirty = 0;
1569 last += cache->key.offset; 2544 last = cache->key.objectid + cache->key.offset;
1570 2545
1571 err = write_one_cache_group(trans, root, 2546 err = write_one_cache_group(trans, root, path, cache);
1572 path, cache); 2547 BUG_ON(err);
1573 /* 2548 btrfs_put_block_group(cache);
1574 * if we fail to write the cache group, we want
1575 * to keep it marked dirty in hopes that a later
1576 * write will work
1577 */
1578 if (err) {
1579 werr = err;
1580 continue;
1581 }
1582 } 2549 }
2550
1583 btrfs_free_path(path); 2551 btrfs_free_path(path);
1584 return werr; 2552 return 0;
1585} 2553}
1586 2554
1587int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2555int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -1631,6 +2599,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1631 found->force_alloc = 0; 2599 found->force_alloc = 0;
1632 *space_info = found; 2600 *space_info = found;
1633 list_add_rcu(&found->list, &info->space_info); 2601 list_add_rcu(&found->list, &info->space_info);
2602 atomic_set(&found->caching_threads, 0);
1634 return 0; 2603 return 0;
1635} 2604}
1636 2605
@@ -1843,7 +2812,7 @@ again:
1843 2812
1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2813 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1845 ", %llu bytes_used, %llu bytes_reserved, " 2814 ", %llu bytes_used, %llu bytes_reserved, "
1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use" 2815 "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
1847 "%llu total\n", (unsigned long long)bytes, 2816 "%llu total\n", (unsigned long long)bytes,
1848 (unsigned long long)data_sinfo->bytes_delalloc, 2817 (unsigned long long)data_sinfo->bytes_delalloc,
1849 (unsigned long long)data_sinfo->bytes_used, 2818 (unsigned long long)data_sinfo->bytes_used,
@@ -2007,6 +2976,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2007 u64 old_val; 2976 u64 old_val;
2008 u64 byte_in_group; 2977 u64 byte_in_group;
2009 2978
2979 /* block accounting for super block */
2980 spin_lock(&info->delalloc_lock);
2981 old_val = btrfs_super_bytes_used(&info->super_copy);
2982 if (alloc)
2983 old_val += num_bytes;
2984 else
2985 old_val -= num_bytes;
2986 btrfs_set_super_bytes_used(&info->super_copy, old_val);
2987
2988 /* block accounting for root item */
2989 old_val = btrfs_root_used(&root->root_item);
2990 if (alloc)
2991 old_val += num_bytes;
2992 else
2993 old_val -= num_bytes;
2994 btrfs_set_root_used(&root->root_item, old_val);
2995 spin_unlock(&info->delalloc_lock);
2996
2010 while (total) { 2997 while (total) {
2011 cache = btrfs_lookup_block_group(info, bytenr); 2998 cache = btrfs_lookup_block_group(info, bytenr);
2012 if (!cache) 2999 if (!cache)
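This hunk moves the superblock and root-item byte accounting into update_block_group(), so allocation and free share one locked add-or-subtract instead of each path open-coding it. The arithmetic in isolation (a pthread mutex models the kernel's delalloc_lock spinlock):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t super_bytes_used, root_bytes_used;

    static void account(uint64_t num_bytes, int alloc)
    {
            pthread_mutex_lock(&lock);
            if (alloc) {
                    super_bytes_used += num_bytes;
                    root_bytes_used  += num_bytes;
            } else {
                    super_bytes_used -= num_bytes;
                    root_bytes_used  -= num_bytes;
            }
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            account(4096, 1);   /* alloc */
            account(4096, 0);   /* free */
            printf("%llu %llu\n", (unsigned long long)super_bytes_used,
                   (unsigned long long)root_bytes_used);   /* 0 0 */
            return 0;
    }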
@@ -2076,13 +3063,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2076 struct btrfs_block_group_cache *cache; 3063 struct btrfs_block_group_cache *cache;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 3064 struct btrfs_fs_info *fs_info = root->fs_info;
2078 3065
2079 if (pin) { 3066 if (pin)
2080 set_extent_dirty(&fs_info->pinned_extents, 3067 set_extent_dirty(&fs_info->pinned_extents,
2081 bytenr, bytenr + num - 1, GFP_NOFS); 3068 bytenr, bytenr + num - 1, GFP_NOFS);
2082 } else {
2083 clear_extent_dirty(&fs_info->pinned_extents,
2084 bytenr, bytenr + num - 1, GFP_NOFS);
2085 }
2086 3069
2087 while (num > 0) { 3070 while (num > 0) {
2088 cache = btrfs_lookup_block_group(fs_info, bytenr); 3071 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2098,14 +3081,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2098 spin_unlock(&cache->space_info->lock); 3081 spin_unlock(&cache->space_info->lock);
2099 fs_info->total_pinned += len; 3082 fs_info->total_pinned += len;
2100 } else { 3083 } else {
3084 int unpin = 0;
3085
3086 /*
3087 * in order to not race with the block group caching, we
3088 * only want to unpin the extent if we are cached. If
3089 * we aren't cached, we want to start async caching this
3090 * block group so we can free the extent the next time
3091 * around.
3092 */
2101 spin_lock(&cache->space_info->lock); 3093 spin_lock(&cache->space_info->lock);
2102 spin_lock(&cache->lock); 3094 spin_lock(&cache->lock);
2103 cache->pinned -= len; 3095 unpin = (cache->cached == BTRFS_CACHE_FINISHED);
2104 cache->space_info->bytes_pinned -= len; 3096 if (likely(unpin)) {
3097 cache->pinned -= len;
3098 cache->space_info->bytes_pinned -= len;
3099 fs_info->total_pinned -= len;
3100 }
2105 spin_unlock(&cache->lock); 3101 spin_unlock(&cache->lock);
2106 spin_unlock(&cache->space_info->lock); 3102 spin_unlock(&cache->space_info->lock);
2107 fs_info->total_pinned -= len; 3103
2108 if (cache->cached) 3104 if (likely(unpin))
3105 clear_extent_dirty(&fs_info->pinned_extents,
3106 bytenr, bytenr + len -1,
3107 GFP_NOFS);
3108 else
3109 cache_block_group(cache);
3110
3111 if (unpin)
2109 btrfs_add_free_space(cache, bytenr, len); 3112 btrfs_add_free_space(cache, bytenr, len);
2110 } 3113 }
2111 btrfs_put_block_group(cache); 3114 btrfs_put_block_group(cache);
@@ -2159,6 +3162,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2159 &start, &end, EXTENT_DIRTY); 3162 &start, &end, EXTENT_DIRTY);
2160 if (ret) 3163 if (ret)
2161 break; 3164 break;
3165
2162 set_extent_dirty(copy, start, end, GFP_NOFS); 3166 set_extent_dirty(copy, start, end, GFP_NOFS);
2163 last = end + 1; 3167 last = end + 1;
2164 } 3168 }
@@ -2187,6 +3191,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2187 3191
2188 cond_resched(); 3192 cond_resched();
2189 } 3193 }
3194
2190 return ret; 3195 return ret;
2191} 3196}
2192 3197
@@ -2216,8 +3221,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2216 u64 header_owner = btrfs_header_owner(buf); 3221 u64 header_owner = btrfs_header_owner(buf);
2217 u64 header_transid = btrfs_header_generation(buf); 3222 u64 header_transid = btrfs_header_generation(buf);
2218 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 3223 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2219 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2220 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2221 header_transid == trans->transid && 3224 header_transid == trans->transid &&
2222 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3225 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2223 *must_clean = buf; 3226 *must_clean = buf;
@@ -2235,63 +3238,77 @@ pinit:
2235 return 0; 3238 return 0;
2236} 3239}
2237 3240
2238/* 3241
2239 * remove an extent from the root, returns 0 on success 3242static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2240 */ 3243 struct btrfs_root *root,
2241static int __free_extent(struct btrfs_trans_handle *trans, 3244 u64 bytenr, u64 num_bytes, u64 parent,
2242 struct btrfs_root *root, 3245 u64 root_objectid, u64 owner_objectid,
2243 u64 bytenr, u64 num_bytes, u64 parent, 3246 u64 owner_offset, int refs_to_drop,
2244 u64 root_objectid, u64 ref_generation, 3247 struct btrfs_delayed_extent_op *extent_op)
2245 u64 owner_objectid, int pin, int mark_free,
2246 int refs_to_drop)
2247{ 3248{
2248 struct btrfs_path *path;
2249 struct btrfs_key key; 3249 struct btrfs_key key;
3250 struct btrfs_path *path;
2250 struct btrfs_fs_info *info = root->fs_info; 3251 struct btrfs_fs_info *info = root->fs_info;
2251 struct btrfs_root *extent_root = info->extent_root; 3252 struct btrfs_root *extent_root = info->extent_root;
2252 struct extent_buffer *leaf; 3253 struct extent_buffer *leaf;
3254 struct btrfs_extent_item *ei;
3255 struct btrfs_extent_inline_ref *iref;
2253 int ret; 3256 int ret;
3257 int is_data;
2254 int extent_slot = 0; 3258 int extent_slot = 0;
2255 int found_extent = 0; 3259 int found_extent = 0;
2256 int num_to_del = 1; 3260 int num_to_del = 1;
2257 struct btrfs_extent_item *ei; 3261 u32 item_size;
2258 u32 refs; 3262 u64 refs;
2259 3263
2260 key.objectid = bytenr;
2261 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2262 key.offset = num_bytes;
2263 path = btrfs_alloc_path(); 3264 path = btrfs_alloc_path();
2264 if (!path) 3265 if (!path)
2265 return -ENOMEM; 3266 return -ENOMEM;
2266 3267
2267 path->reada = 1; 3268 path->reada = 1;
2268 path->leave_spinning = 1; 3269 path->leave_spinning = 1;
2269 ret = lookup_extent_backref(trans, extent_root, path, 3270
2270 bytenr, parent, root_objectid, 3271 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2271 ref_generation, owner_objectid, 1); 3272 BUG_ON(!is_data && refs_to_drop != 1);
3273
3274 ret = lookup_extent_backref(trans, extent_root, path, &iref,
3275 bytenr, num_bytes, parent,
3276 root_objectid, owner_objectid,
3277 owner_offset);
2272 if (ret == 0) { 3278 if (ret == 0) {
2273 struct btrfs_key found_key;
2274 extent_slot = path->slots[0]; 3279 extent_slot = path->slots[0];
2275 while (extent_slot > 0) { 3280 while (extent_slot >= 0) {
2276 extent_slot--; 3281 btrfs_item_key_to_cpu(path->nodes[0], &key,
2277 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2278 extent_slot); 3282 extent_slot);
2279 if (found_key.objectid != bytenr) 3283 if (key.objectid != bytenr)
2280 break; 3284 break;
2281 if (found_key.type == BTRFS_EXTENT_ITEM_KEY && 3285 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2282 found_key.offset == num_bytes) { 3286 key.offset == num_bytes) {
2283 found_extent = 1; 3287 found_extent = 1;
2284 break; 3288 break;
2285 } 3289 }
2286 if (path->slots[0] - extent_slot > 5) 3290 if (path->slots[0] - extent_slot > 5)
2287 break; 3291 break;
3292 extent_slot--;
2288 } 3293 }
3294#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3295 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3296 if (found_extent && item_size < sizeof(*ei))
3297 found_extent = 0;
3298#endif
2289 if (!found_extent) { 3299 if (!found_extent) {
3300 BUG_ON(iref);
2290 ret = remove_extent_backref(trans, extent_root, path, 3301 ret = remove_extent_backref(trans, extent_root, path,
2291 refs_to_drop); 3302 NULL, refs_to_drop,
3303 is_data);
2292 BUG_ON(ret); 3304 BUG_ON(ret);
2293 btrfs_release_path(extent_root, path); 3305 btrfs_release_path(extent_root, path);
2294 path->leave_spinning = 1; 3306 path->leave_spinning = 1;
3307
3308 key.objectid = bytenr;
3309 key.type = BTRFS_EXTENT_ITEM_KEY;
3310 key.offset = num_bytes;
3311
2295 ret = btrfs_search_slot(trans, extent_root, 3312 ret = btrfs_search_slot(trans, extent_root,
2296 &key, path, -1, 1); 3313 &key, path, -1, 1);
2297 if (ret) { 3314 if (ret) {
@@ -2307,82 +3324,98 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2307 btrfs_print_leaf(extent_root, path->nodes[0]); 3324 btrfs_print_leaf(extent_root, path->nodes[0]);
2308 WARN_ON(1); 3325 WARN_ON(1);
2309 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 3326 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2310 "parent %llu root %llu gen %llu owner %llu\n", 3327 "parent %llu root %llu owner %llu offset %llu\n",
2311 (unsigned long long)bytenr, 3328 (unsigned long long)bytenr,
2312 (unsigned long long)parent, 3329 (unsigned long long)parent,
2313 (unsigned long long)root_objectid, 3330 (unsigned long long)root_objectid,
2314 (unsigned long long)ref_generation, 3331 (unsigned long long)owner_objectid,
2315 (unsigned long long)owner_objectid); 3332 (unsigned long long)owner_offset);
2316 } 3333 }
2317 3334
2318 leaf = path->nodes[0]; 3335 leaf = path->nodes[0];
3336 item_size = btrfs_item_size_nr(leaf, extent_slot);
3337#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3338 if (item_size < sizeof(*ei)) {
3339 BUG_ON(found_extent || extent_slot != path->slots[0]);
3340 ret = convert_extent_item_v0(trans, extent_root, path,
3341 owner_objectid, 0);
3342 BUG_ON(ret < 0);
3343
3344 btrfs_release_path(extent_root, path);
3345 path->leave_spinning = 1;
3346
3347 key.objectid = bytenr;
3348 key.type = BTRFS_EXTENT_ITEM_KEY;
3349 key.offset = num_bytes;
3350
3351 ret = btrfs_search_slot(trans, extent_root, &key, path,
3352 -1, 1);
3353 if (ret) {
3354 printk(KERN_ERR "umm, got %d back from search"
3355 ", was looking for %llu\n", ret,
3356 (unsigned long long)bytenr);
3357 btrfs_print_leaf(extent_root, path->nodes[0]);
3358 }
3359 BUG_ON(ret);
3360 extent_slot = path->slots[0];
3361 leaf = path->nodes[0];
3362 item_size = btrfs_item_size_nr(leaf, extent_slot);
3363 }
3364#endif
3365 BUG_ON(item_size < sizeof(*ei));
2319 ei = btrfs_item_ptr(leaf, extent_slot, 3366 ei = btrfs_item_ptr(leaf, extent_slot,
2320 struct btrfs_extent_item); 3367 struct btrfs_extent_item);
2321 refs = btrfs_extent_refs(leaf, ei); 3368 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2322 3369 struct btrfs_tree_block_info *bi;
2323 /* 3370 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
2324 * we're not allowed to delete the extent item if there 3371 bi = (struct btrfs_tree_block_info *)(ei + 1);
2325 * are other delayed ref updates pending 3372 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
2326 */ 3373 }
2327 3374
3375 refs = btrfs_extent_refs(leaf, ei);
2328 BUG_ON(refs < refs_to_drop); 3376 BUG_ON(refs < refs_to_drop);
2329 refs -= refs_to_drop; 3377 refs -= refs_to_drop;
2330 btrfs_set_extent_refs(leaf, ei, refs);
2331 btrfs_mark_buffer_dirty(leaf);
2332 3378
2333 if (refs == 0 && found_extent && 3379 if (refs > 0) {
2334 path->slots[0] == extent_slot + 1) { 3380 if (extent_op)
2335 struct btrfs_extent_ref *ref; 3381 __run_delayed_extent_op(extent_op, leaf, ei);
2336 ref = btrfs_item_ptr(leaf, path->slots[0], 3382 /*
2337 struct btrfs_extent_ref); 3383 * In the case of inline back ref, reference count will
2338 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); 3384 * be updated by remove_extent_backref
2339 /* if the back ref and the extent are next to each other
2340 * they get deleted below in one shot
2341 */ 3385 */
2342 path->slots[0] = extent_slot; 3386 if (iref) {
2343 num_to_del = 2; 3387 BUG_ON(!found_extent);
2344 } else if (found_extent) { 3388 } else {
2345 /* otherwise delete the extent back ref */ 3389 btrfs_set_extent_refs(leaf, ei, refs);
2346 ret = remove_extent_backref(trans, extent_root, path, 3390 btrfs_mark_buffer_dirty(leaf);
2347 refs_to_drop); 3391 }
2348 BUG_ON(ret); 3392 if (found_extent) {
2349 /* if refs are 0, we need to setup the path for deletion */ 3393 ret = remove_extent_backref(trans, extent_root, path,
2350 if (refs == 0) { 3394 iref, refs_to_drop,
2351 btrfs_release_path(extent_root, path); 3395 is_data);
2352 path->leave_spinning = 1;
2353 ret = btrfs_search_slot(trans, extent_root, &key, path,
2354 -1, 1);
2355 BUG_ON(ret); 3396 BUG_ON(ret);
2356 } 3397 }
2357 } 3398 } else {
2358 3399 int mark_free = 0;
2359 if (refs == 0) {
2360 u64 super_used;
2361 u64 root_used;
2362 struct extent_buffer *must_clean = NULL; 3400 struct extent_buffer *must_clean = NULL;
2363 3401
2364 if (pin) { 3402 if (found_extent) {
2365 ret = pin_down_bytes(trans, root, path, 3403 BUG_ON(is_data && refs_to_drop !=
2366 bytenr, num_bytes, 3404 extent_data_ref_count(root, path, iref));
2367 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 3405 if (iref) {
2368 &must_clean); 3406 BUG_ON(path->slots[0] != extent_slot);
2369 if (ret > 0) 3407 } else {
2370 mark_free = 1; 3408 BUG_ON(path->slots[0] != extent_slot + 1);
2371 BUG_ON(ret < 0); 3409 path->slots[0] = extent_slot;
3410 num_to_del = 2;
3411 }
2372 } 3412 }
2373 3413
2374 /* block accounting for super block */ 3414 ret = pin_down_bytes(trans, root, path, bytenr,
2375 spin_lock(&info->delalloc_lock); 3415 num_bytes, is_data, &must_clean);
2376 super_used = btrfs_super_bytes_used(&info->super_copy); 3416 if (ret > 0)
2377 btrfs_set_super_bytes_used(&info->super_copy, 3417 mark_free = 1;
2378 super_used - num_bytes); 3418 BUG_ON(ret < 0);
2379
2380 /* block accounting for root item */
2381 root_used = btrfs_root_used(&root->root_item);
2382 btrfs_set_root_used(&root->root_item,
2383 root_used - num_bytes);
2384 spin_unlock(&info->delalloc_lock);
2385
2386 /* 3419 /*
2387 * it is going to be very rare for someone to be waiting 3420 * it is going to be very rare for someone to be waiting
2388 * on the block we're freeing. del_items might need to 3421 * on the block we're freeing. del_items might need to
@@ -2403,7 +3436,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2403 free_extent_buffer(must_clean); 3436 free_extent_buffer(must_clean);
2404 } 3437 }
2405 3438
2406 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 3439 if (is_data) {
2407 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 3440 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2408 BUG_ON(ret); 3441 BUG_ON(ret);
2409 } else { 3442 } else {
@@ -2421,34 +3454,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2421} 3454}
2422 3455
2423/* 3456/*
2424 * remove an extent from the root, returns 0 on success
2425 */
2426static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2427 struct btrfs_root *root,
2428 u64 bytenr, u64 num_bytes, u64 parent,
2429 u64 root_objectid, u64 ref_generation,
2430 u64 owner_objectid, int pin,
2431 int refs_to_drop)
2432{
2433 WARN_ON(num_bytes < root->sectorsize);
2434
2435 /*
2436 * if metadata always pin
2437 * if data pin when any transaction has committed this
2438 */
2439 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2440 ref_generation != trans->transid)
2441 pin = 1;
2442
2443 if (ref_generation != trans->transid)
2444 pin = 1;
2445
2446 return __free_extent(trans, root, bytenr, num_bytes, parent,
2447 root_objectid, ref_generation,
2448 owner_objectid, pin, pin == 0, refs_to_drop);
2449}
2450
2451/*
2452 * when we free an extent, it is possible (and likely) that we free the last 3457 * when we free an extent, it is possible (and likely) that we free the last
2453 * delayed ref for that extent as well. This searches the delayed ref tree for 3458 * delayed ref for that extent as well. This searches the delayed ref tree for
2454 * a given extent, and if there are no other delayed refs to be processed, it 3459 * a given extent, and if there are no other delayed refs to be processed, it
@@ -2479,6 +3484,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2479 if (ref->bytenr == bytenr) 3484 if (ref->bytenr == bytenr)
2480 goto out; 3485 goto out;
2481 3486
3487 if (head->extent_op) {
3488 if (!head->must_insert_reserved)
3489 goto out;
3490 kfree(head->extent_op);
3491 head->extent_op = NULL;
3492 }
3493
2482 /* 3494 /*
2483 * waiting for the lock here would deadlock. If someone else has it 3495 * waiting for the lock here would deadlock. If someone else has it
2484 * locked they are already in the process of dropping it anyway 3496 * locked they are already in the process of dropping it anyway
@@ -2507,7 +3519,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2507 spin_unlock(&delayed_refs->lock); 3519 spin_unlock(&delayed_refs->lock);
2508 3520
2509 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 3521 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2510 &head->node, head->must_insert_reserved); 3522 &head->node, head->extent_op,
3523 head->must_insert_reserved);
2511 BUG_ON(ret); 3524 BUG_ON(ret);
2512 btrfs_put_delayed_ref(&head->node); 3525 btrfs_put_delayed_ref(&head->node);
2513 return 0; 3526 return 0;
@@ -2519,32 +3532,32 @@ out:
2519int btrfs_free_extent(struct btrfs_trans_handle *trans, 3532int btrfs_free_extent(struct btrfs_trans_handle *trans,
2520 struct btrfs_root *root, 3533 struct btrfs_root *root,
2521 u64 bytenr, u64 num_bytes, u64 parent, 3534 u64 bytenr, u64 num_bytes, u64 parent,
2522 u64 root_objectid, u64 ref_generation, 3535 u64 root_objectid, u64 owner, u64 offset)
2523 u64 owner_objectid, int pin)
2524{ 3536{
2525 int ret; 3537 int ret;
2526 3538
2527 /* 3539 /*
2528 * tree log blocks never actually go into the extent allocation 3540 * tree log blocks never actually go into the extent allocation
2529 * tree, just update pinning info and exit early. 3541 * tree, just update pinning info and exit early.
2530 *
2531 * data extents referenced by the tree log do need to have
2532 * their reference counts bumped.
2533 */ 3542 */
2534 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && 3543 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
2535 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3544 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
2536 /* unlocks the pinned mutex */ 3545 /* unlocks the pinned mutex */
2537 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3546 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2538 update_reserved_extents(root, bytenr, num_bytes, 0); 3547 update_reserved_extents(root, bytenr, num_bytes, 0);
2539 ret = 0; 3548 ret = 0;
2540 } else { 3549 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2541 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, 3550 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
2542 root_objectid, ref_generation, 3551 parent, root_objectid, (int)owner,
2543 owner_objectid, 3552 BTRFS_DROP_DELAYED_REF, NULL);
2544 BTRFS_DROP_DELAYED_REF, 1);
2545 BUG_ON(ret); 3553 BUG_ON(ret);
2546 ret = check_ref_cleanup(trans, root, bytenr); 3554 ret = check_ref_cleanup(trans, root, bytenr);
2547 BUG_ON(ret); 3555 BUG_ON(ret);
3556 } else {
3557 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
3558 parent, root_objectid, owner,
3559 offset, BTRFS_DROP_DELAYED_REF, NULL);
3560 BUG_ON(ret);
2548 } 3561 }
2549 return ret; 3562 return ret;
2550} 3563}
@@ -2557,6 +3570,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
2557} 3570}
2558 3571
2559/* 3572/*
3573 * when we wait for progress in the block group caching, it's because
3574 * our allocation attempt failed at least once. So, we must sleep
3575 * and let some progress happen before we try again.
3576 *
3577 * This function will sleep at least once waiting for new free space to
3578 * show up, and then it will check the block group free space numbers
3579 * for our min num_bytes. Another option is to have it go ahead
3580 * and look in the rbtree for a free extent of a given size, but this
3581 * is a good start.
3582 */
3583static noinline int
3584wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3585 u64 num_bytes)
3586{
3587 DEFINE_WAIT(wait);
3588
3589 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3590
3591 if (block_group_cache_done(cache)) {
3592 finish_wait(&cache->caching_q, &wait);
3593 return 0;
3594 }
3595 schedule();
3596 finish_wait(&cache->caching_q, &wait);
3597
3598 wait_event(cache->caching_q, block_group_cache_done(cache) ||
3599 (cache->free_space >= num_bytes));
3600 return 0;
3601}
3602
3603enum btrfs_loop_type {
3604 LOOP_CACHED_ONLY = 0,
3605 LOOP_CACHING_NOWAIT = 1,
3606 LOOP_CACHING_WAIT = 2,
3607 LOOP_ALLOC_CHUNK = 3,
3608 LOOP_NO_EMPTY_SIZE = 4,
3609};
3610
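The btrfs_loop_type enum above replaces bare retry numbers with named escalation stages: first only fully cached groups, then uncached ones without waiting, then waiting on caching progress, then allocating a new chunk, and finally retrying without the empty-size padding. Schematically, with a hypothetical try_alloc() that only succeeds once waiting is permitted:

    #include <stdio.h>

    enum loop_type {                  /* mirrors btrfs_loop_type */
            LOOP_CACHED_ONLY,
            LOOP_CACHING_NOWAIT,
            LOOP_CACHING_WAIT,
            LOOP_ALLOC_CHUNK,
            LOOP_NO_EMPTY_SIZE,
    };

    static int try_alloc(int loop)
    {
            return loop >= LOOP_CACHING_WAIT;   /* hypothetical outcome */
    }

    int main(void)
    {
            for (int loop = LOOP_CACHED_ONLY; loop <= LOOP_NO_EMPTY_SIZE; loop++) {
                    if (try_alloc(loop)) {
                            printf("allocated at stage %d\n", loop);
                            return 0;
                    }
                    /* each failed pass relaxes one more constraint */
            }
            puts("ENOSPC");
            return 1;
    }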
3611/*
2560 * walks the btree of allocated extents and find a hole of a given size. 3612 * walks the btree of allocated extents and find a hole of a given size.
2561 * The key ins is changed to record the hole: 3613 * The key ins is changed to record the hole:
2562 * ins->objectid == block start 3614 * ins->objectid == block start
@@ -2581,6 +3633,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2581 struct btrfs_space_info *space_info; 3633 struct btrfs_space_info *space_info;
2582 int last_ptr_loop = 0; 3634 int last_ptr_loop = 0;
2583 int loop = 0; 3635 int loop = 0;
3636 bool found_uncached_bg = false;
2584 3637
2585 WARN_ON(num_bytes < root->sectorsize); 3638 WARN_ON(num_bytes < root->sectorsize);
2586 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 3639 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -2612,15 +3665,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2612 search_start = max(search_start, first_logical_byte(root, 0)); 3665 search_start = max(search_start, first_logical_byte(root, 0));
2613 search_start = max(search_start, hint_byte); 3666 search_start = max(search_start, hint_byte);
2614 3667
2615 if (!last_ptr) { 3668 if (!last_ptr)
2616 empty_cluster = 0; 3669 empty_cluster = 0;
2617 loop = 1;
2618 }
2619 3670
2620 if (search_start == hint_byte) { 3671 if (search_start == hint_byte) {
2621 block_group = btrfs_lookup_block_group(root->fs_info, 3672 block_group = btrfs_lookup_block_group(root->fs_info,
2622 search_start); 3673 search_start);
2623 if (block_group && block_group_bits(block_group, data)) { 3674 /*
3675 * we don't want to use the block group if it doesn't match our
3676 * allocation bits, or if its not cached.
3677 */
3678 if (block_group && block_group_bits(block_group, data) &&
3679 block_group_cache_done(block_group)) {
2624 down_read(&space_info->groups_sem); 3680 down_read(&space_info->groups_sem);
2625 if (list_empty(&block_group->list) || 3681 if (list_empty(&block_group->list) ||
2626 block_group->ro) { 3682 block_group->ro) {
@@ -2643,21 +3699,35 @@ search:
2643 down_read(&space_info->groups_sem); 3699 down_read(&space_info->groups_sem);
2644 list_for_each_entry(block_group, &space_info->block_groups, list) { 3700 list_for_each_entry(block_group, &space_info->block_groups, list) {
2645 u64 offset; 3701 u64 offset;
3702 int cached;
2646 3703
2647 atomic_inc(&block_group->count); 3704 atomic_inc(&block_group->count);
2648 search_start = block_group->key.objectid; 3705 search_start = block_group->key.objectid;
2649 3706
2650have_block_group: 3707have_block_group:
2651 if (unlikely(!block_group->cached)) { 3708 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
2652 mutex_lock(&block_group->cache_mutex); 3709 /*
2653 ret = cache_block_group(root, block_group); 3710 * we want to start caching kthreads, but not too many
2654 mutex_unlock(&block_group->cache_mutex); 3711 * right off the bat so we don't overwhelm the system,
2655 if (ret) { 3712 * so only start them if there are less than 2 and we're
2656 			btrfs_put_block_group(block_group); 3713 * so only start them if there are fewer than 2 and we're
2657 break; 3714 */
3715 if (loop > LOOP_CACHING_NOWAIT ||
3716 atomic_read(&space_info->caching_threads) < 2) {
3717 ret = cache_block_group(block_group);
3718 BUG_ON(ret);
2658 } 3719 }
2659 } 3720 }
2660 3721
3722 cached = block_group_cache_done(block_group);
3723 if (unlikely(!cached)) {
3724 found_uncached_bg = true;
3725
3726 /* if we only want cached bgs, loop */
3727 if (loop == LOOP_CACHED_ONLY)
3728 goto loop;
3729 }
3730
2661 if (unlikely(block_group->ro)) 3731 if (unlikely(block_group->ro))
2662 goto loop; 3732 goto loop;
2663 3733
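The hunk above also caps how many caching kthreads may run: during the early, non-waiting passes a block group only gets a cacher if fewer than two are already active, so cold mounts don't fan out one thread per block group. That admission check is a small atomic-counter pattern; in standalone C11 it might look like this (start_cacher() is a hypothetical thread starter that decrements the counter when its thread exits):

#include <stdatomic.h>
#include <stdbool.h>

#define MAX_CACHERS 2

static atomic_int caching_threads;

void start_cacher(void);	/* hypothetical background cacher */

/* returns true if a cacher was started; the fetch_add makes the
 * check-and-increment race free, whereas the kernel hunk's plain
 * atomic_read() check simply tolerates a small overshoot */
bool maybe_start_cacher(void)
{
	if (atomic_fetch_add(&caching_threads, 1) >= MAX_CACHERS) {
		atomic_fetch_sub(&caching_threads, 1);
		return false;
	}
	start_cacher();
	return true;
}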
@@ -2719,7 +3789,7 @@ refill_cluster:
2719 last_ptr_loop = 0; 3789 last_ptr_loop = 0;
2720 3790
2721 /* allocate a cluster in this block group */ 3791 /* allocate a cluster in this block group */
2722 ret = btrfs_find_space_cluster(trans, 3792 ret = btrfs_find_space_cluster(trans, root,
2723 block_group, last_ptr, 3793 block_group, last_ptr,
2724 offset, num_bytes, 3794 offset, num_bytes,
2725 empty_cluster + empty_size); 3795 empty_cluster + empty_size);
@@ -2736,14 +3806,21 @@ refill_cluster:
2736 spin_unlock(&last_ptr->refill_lock); 3806 spin_unlock(&last_ptr->refill_lock);
2737 goto checks; 3807 goto checks;
2738 } 3808 }
3809 } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3810 spin_unlock(&last_ptr->refill_lock);
3811
3812 wait_block_group_cache_progress(block_group,
3813 num_bytes + empty_cluster + empty_size);
3814 goto have_block_group;
2739 } 3815 }
3816
2740 /* 3817 /*
2741 * at this point we either didn't find a cluster 3818 * at this point we either didn't find a cluster
2742 * or we weren't able to allocate a block from our 3819 * or we weren't able to allocate a block from our
2743 * cluster. Free the cluster we've been trying 3820 * cluster. Free the cluster we've been trying
2744 * to use, and go to the next block group 3821 * to use, and go to the next block group
2745 */ 3822 */
2746 if (loop < 2) { 3823 if (loop < LOOP_NO_EMPTY_SIZE) {
2747 btrfs_return_cluster_to_free_space(NULL, 3824 btrfs_return_cluster_to_free_space(NULL,
2748 last_ptr); 3825 last_ptr);
2749 spin_unlock(&last_ptr->refill_lock); 3826 spin_unlock(&last_ptr->refill_lock);
@@ -2754,11 +3831,17 @@ refill_cluster:
2754 3831
2755 offset = btrfs_find_space_for_alloc(block_group, search_start, 3832 offset = btrfs_find_space_for_alloc(block_group, search_start,
2756 num_bytes, empty_size); 3833 num_bytes, empty_size);
2757 if (!offset) 3834 if (!offset && (cached || (!cached &&
3835 loop == LOOP_CACHING_NOWAIT))) {
2758 goto loop; 3836 goto loop;
3837 } else if (!offset && (!cached &&
3838 loop > LOOP_CACHING_NOWAIT)) {
3839 wait_block_group_cache_progress(block_group,
3840 num_bytes + empty_size);
3841 goto have_block_group;
3842 }
2759checks: 3843checks:
2760 search_start = stripe_align(root, offset); 3844 search_start = stripe_align(root, offset);
2761
2762 /* move on to the next group */ 3845 /* move on to the next group */
2763 if (search_start + num_bytes >= search_end) { 3846 if (search_start + num_bytes >= search_end) {
2764 btrfs_add_free_space(block_group, offset, num_bytes); 3847 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -2804,13 +3887,26 @@ loop:
2804 } 3887 }
2805 up_read(&space_info->groups_sem); 3888 up_read(&space_info->groups_sem);
2806 3889
2807 /* loop == 0, try to find a clustered alloc in every block group 3890 /* LOOP_CACHED_ONLY, only search fully cached block groups
2808 * loop == 1, try again after forcing a chunk allocation 3891 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
2809 * loop == 2, set empty_size and empty_cluster to 0 and try again 3892 * don't wait for them to finish caching
3893 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3894 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3895 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3896 * again
2810 */ 3897 */
2811 if (!ins->objectid && loop < 3 && 3898 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
2812 (empty_size || empty_cluster || allowed_chunk_alloc)) { 3899 (found_uncached_bg || empty_size || empty_cluster ||
2813 if (loop >= 2) { 3900 allowed_chunk_alloc)) {
3901 if (found_uncached_bg) {
3902 found_uncached_bg = false;
3903 if (loop < LOOP_CACHING_WAIT) {
3904 loop++;
3905 goto search;
3906 }
3907 }
3908
3909 if (loop == LOOP_ALLOC_CHUNK) {
2814 empty_size = 0; 3910 empty_size = 0;
2815 empty_cluster = 0; 3911 empty_cluster = 0;
2816 } 3912 }
@@ -2823,7 +3919,7 @@ loop:
2823 space_info->force_alloc = 1; 3919 space_info->force_alloc = 1;
2824 } 3920 }
2825 3921
2826 if (loop < 3) { 3922 if (loop < LOOP_NO_EMPTY_SIZE) {
2827 loop++; 3923 loop++;
2828 goto search; 3924 goto search;
2829 } 3925 }
@@ -2919,7 +4015,7 @@ again:
2919 num_bytes, data, 1); 4015 num_bytes, data, 1);
2920 goto again; 4016 goto again;
2921 } 4017 }
2922 if (ret) { 4018 if (ret == -ENOSPC) {
2923 struct btrfs_space_info *sinfo; 4019 struct btrfs_space_info *sinfo;
2924 4020
2925 sinfo = __find_space_info(root->fs_info, data); 4021 sinfo = __find_space_info(root->fs_info, data);
@@ -2927,7 +4023,6 @@ again:
2927 "wanted %llu\n", (unsigned long long)data, 4023 "wanted %llu\n", (unsigned long long)data,
2928 (unsigned long long)num_bytes); 4024 (unsigned long long)num_bytes);
2929 dump_space_info(sinfo, num_bytes); 4025 dump_space_info(sinfo, num_bytes);
2930 BUG();
2931 } 4026 }
2932 4027
2933 return ret; 4028 return ret;
@@ -2965,103 +4060,153 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2965 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, 4060 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2966 empty_size, hint_byte, search_end, ins, 4061 empty_size, hint_byte, search_end, ins,
2967 data); 4062 data);
2968 update_reserved_extents(root, ins->objectid, ins->offset, 1); 4063 if (!ret)
4064 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4065
2969 return ret; 4066 return ret;
2970} 4067}
2971 4068
2972static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 4069static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
2973 struct btrfs_root *root, u64 parent, 4070 struct btrfs_root *root,
2974 u64 root_objectid, u64 ref_generation, 4071 u64 parent, u64 root_objectid,
2975 u64 owner, struct btrfs_key *ins, 4072 u64 flags, u64 owner, u64 offset,
2976 int ref_mod) 4073 struct btrfs_key *ins, int ref_mod)
2977{ 4074{
2978 int ret; 4075 int ret;
2979 u64 super_used; 4076 struct btrfs_fs_info *fs_info = root->fs_info;
2980 u64 root_used;
2981 u64 num_bytes = ins->offset;
2982 u32 sizes[2];
2983 struct btrfs_fs_info *info = root->fs_info;
2984 struct btrfs_root *extent_root = info->extent_root;
2985 struct btrfs_extent_item *extent_item; 4077 struct btrfs_extent_item *extent_item;
2986 struct btrfs_extent_ref *ref; 4078 struct btrfs_extent_inline_ref *iref;
2987 struct btrfs_path *path; 4079 struct btrfs_path *path;
2988 struct btrfs_key keys[2]; 4080 struct extent_buffer *leaf;
2989 4081 int type;
2990 if (parent == 0) 4082 u32 size;
2991 parent = ins->objectid;
2992
2993 /* block accounting for super block */
2994 spin_lock(&info->delalloc_lock);
2995 super_used = btrfs_super_bytes_used(&info->super_copy);
2996 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2997 4083
2998 /* block accounting for root item */ 4084 if (parent > 0)
2999 root_used = btrfs_root_used(&root->root_item); 4085 type = BTRFS_SHARED_DATA_REF_KEY;
3000 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 4086 else
3001 spin_unlock(&info->delalloc_lock); 4087 type = BTRFS_EXTENT_DATA_REF_KEY;
3002 4088
3003 memcpy(&keys[0], ins, sizeof(*ins)); 4089 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
3004 keys[1].objectid = ins->objectid;
3005 keys[1].type = BTRFS_EXTENT_REF_KEY;
3006 keys[1].offset = parent;
3007 sizes[0] = sizeof(*extent_item);
3008 sizes[1] = sizeof(*ref);
3009 4090
3010 path = btrfs_alloc_path(); 4091 path = btrfs_alloc_path();
3011 BUG_ON(!path); 4092 BUG_ON(!path);
3012 4093
3013 path->leave_spinning = 1; 4094 path->leave_spinning = 1;
3014 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 4095 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3015 sizes, 2); 4096 ins, size);
3016 BUG_ON(ret); 4097 BUG_ON(ret);
3017 4098
3018 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4099 leaf = path->nodes[0];
4100 extent_item = btrfs_item_ptr(leaf, path->slots[0],
3019 struct btrfs_extent_item); 4101 struct btrfs_extent_item);
3020 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); 4102 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
3021 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4103 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
3022 struct btrfs_extent_ref); 4104 btrfs_set_extent_flags(leaf, extent_item,
3023 4105 flags | BTRFS_EXTENT_FLAG_DATA);
3024 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 4106
3025 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 4107 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
3026 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 4108 btrfs_set_extent_inline_ref_type(leaf, iref, type);
3027 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); 4109 if (parent > 0) {
4110 struct btrfs_shared_data_ref *ref;
4111 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4112 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4113 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4114 } else {
4115 struct btrfs_extent_data_ref *ref;
4116 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4117 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4118 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4119 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4120 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4121 }
3028 4122
3029 btrfs_mark_buffer_dirty(path->nodes[0]); 4123 btrfs_mark_buffer_dirty(path->nodes[0]);
3030
3031 trans->alloc_exclude_start = 0;
3032 trans->alloc_exclude_nr = 0;
3033 btrfs_free_path(path); 4124 btrfs_free_path(path);
3034 4125
3035 if (ret) 4126 ret = update_block_group(trans, root, ins->objectid, ins->offset,
3036 goto out; 4127 1, 0);
3037
3038 ret = update_block_group(trans, root, ins->objectid,
3039 ins->offset, 1, 0);
3040 if (ret) { 4128 if (ret) {
3041 printk(KERN_ERR "btrfs update block group failed for %llu " 4129 printk(KERN_ERR "btrfs update block group failed for %llu "
3042 "%llu\n", (unsigned long long)ins->objectid, 4130 "%llu\n", (unsigned long long)ins->objectid,
3043 (unsigned long long)ins->offset); 4131 (unsigned long long)ins->offset);
3044 BUG(); 4132 BUG();
3045 } 4133 }
3046out:
3047 return ret; 4134 return ret;
3048} 4135}
3049 4136
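alloc_reserved_file_extent() above writes the new unified extent-item layout: a btrfs_extent_item followed immediately by one inline reference, whose shape depends on whether the ref is keyed by parent block (shared) or by root/owner/offset. The item-size arithmetic can be checked with simplified packed structs (these mirror the on-disk layout only approximately and are not the real definitions):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct extent_item { uint64_t refs, generation, flags; } __attribute__((packed));
struct inline_ref { uint8_t type; uint64_t offset; } __attribute__((packed));
struct shared_data_ref { uint32_t count; } __attribute__((packed));
struct extent_data_ref {
	uint64_t root, objectid, offset;
	uint32_t count;
} __attribute__((packed));

enum { SHARED_DATA_REF, EXTENT_DATA_REF };

/* models btrfs_extent_inline_ref_size(): a shared ref stores the
 * parent bytenr in the inline ref's own offset field and appends a
 * count, while a data ref overlays its payload starting at that
 * offset field (hence the offsetof), like the (&iref->offset) cast
 * in the hunk above */
static size_t inline_ref_size(int type)
{
	if (type == SHARED_DATA_REF)
		return sizeof(struct inline_ref) +
		       sizeof(struct shared_data_ref);
	return offsetof(struct inline_ref, offset) +
	       sizeof(struct extent_data_ref);
}

int main(void)
{
	printf("shared-ref item: %zu bytes\n",	/* 24 + 13 = 37 */
	       sizeof(struct extent_item) + inline_ref_size(SHARED_DATA_REF));
	printf("data-ref item:   %zu bytes\n",	/* 24 + 29 = 53 */
	       sizeof(struct extent_item) + inline_ref_size(EXTENT_DATA_REF));
	return 0;
}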
3050int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 4137static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
3051 struct btrfs_root *root, u64 parent, 4138 struct btrfs_root *root,
3052 u64 root_objectid, u64 ref_generation, 4139 u64 parent, u64 root_objectid,
3053 u64 owner, struct btrfs_key *ins) 4140 u64 flags, struct btrfs_disk_key *key,
4141 int level, struct btrfs_key *ins)
3054{ 4142{
3055 int ret; 4143 int ret;
4144 struct btrfs_fs_info *fs_info = root->fs_info;
4145 struct btrfs_extent_item *extent_item;
4146 struct btrfs_tree_block_info *block_info;
4147 struct btrfs_extent_inline_ref *iref;
4148 struct btrfs_path *path;
4149 struct extent_buffer *leaf;
4150 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
3056 4151
3057 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 4152 path = btrfs_alloc_path();
3058 return 0; 4153 BUG_ON(!path);
3059 4154
3060 ret = btrfs_add_delayed_ref(trans, ins->objectid, 4155 path->leave_spinning = 1;
3061 ins->offset, parent, root_objectid, 4156 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3062 ref_generation, owner, 4157 ins, size);
3063 BTRFS_ADD_DELAYED_EXTENT, 0);
3064 BUG_ON(ret); 4158 BUG_ON(ret);
4159
4160 leaf = path->nodes[0];
4161 extent_item = btrfs_item_ptr(leaf, path->slots[0],
4162 struct btrfs_extent_item);
4163 btrfs_set_extent_refs(leaf, extent_item, 1);
4164 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4165 btrfs_set_extent_flags(leaf, extent_item,
4166 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4167 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4168
4169 btrfs_set_tree_block_key(leaf, block_info, key);
4170 btrfs_set_tree_block_level(leaf, block_info, level);
4171
4172 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4173 if (parent > 0) {
4174 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4175 btrfs_set_extent_inline_ref_type(leaf, iref,
4176 BTRFS_SHARED_BLOCK_REF_KEY);
4177 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4178 } else {
4179 btrfs_set_extent_inline_ref_type(leaf, iref,
4180 BTRFS_TREE_BLOCK_REF_KEY);
4181 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
4182 }
4183
4184 btrfs_mark_buffer_dirty(leaf);
4185 btrfs_free_path(path);
4186
4187 ret = update_block_group(trans, root, ins->objectid, ins->offset,
4188 1, 0);
4189 if (ret) {
4190 printk(KERN_ERR "btrfs update block group failed for %llu "
4191 "%llu\n", (unsigned long long)ins->objectid,
4192 (unsigned long long)ins->offset);
4193 BUG();
4194 }
4195 return ret;
4196}
4197
4198int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4199 struct btrfs_root *root,
4200 u64 root_objectid, u64 owner,
4201 u64 offset, struct btrfs_key *ins)
4202{
4203 int ret;
4204
4205 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
4206
4207 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
4208 0, root_objectid, owner, offset,
4209 BTRFS_ADD_DELAYED_EXTENT, NULL);
3065 return ret; 4210 return ret;
3066} 4211}
3067 4212
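Note how little btrfs_alloc_reserved_file_extent() now does: instead of inserting extent items inline, it queues a BTRFS_ADD_DELAYED_EXTENT record and lets delayed-ref processing batch the b-tree updates later. Stripped of btrfs specifics, that deferral is just a per-transaction list of ref deltas (a hedged sketch; every name below is invented):

#include <stdlib.h>

enum ref_action { ADD_DELAYED_EXTENT, ADD_DELAYED_REF, DROP_DELAYED_REF };

struct delayed_ref {
	unsigned long long bytenr;
	unsigned long long num_bytes;
	enum ref_action action;
	struct delayed_ref *next;
};

struct transaction { struct delayed_ref *head; };

/* record a ref change now; the expensive extent-tree update happens
 * once, later, when the transaction runs its delayed refs */
int add_delayed_ref(struct transaction *t, unsigned long long bytenr,
		    unsigned long long num_bytes, enum ref_action action)
{
	struct delayed_ref *r = malloc(sizeof(*r));

	if (!r)
		return -1;
	*r = (struct delayed_ref){ bytenr, num_bytes, action, t->head };
	t->head = r;
	return 0;
}

A run-delayed-refs step would then merge adds and drops per bytenr and apply only the net change, which is the point of the delayed-ref machinery this patch builds on.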
@@ -3070,25 +4215,25 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3070 * an extent has been allocated and makes sure to clear the free 4215 * an extent has been allocated and makes sure to clear the free
3071 * space cache bits as well 4216 * space cache bits as well
3072 */ 4217 */
3073int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, 4218int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
3074 struct btrfs_root *root, u64 parent, 4219 struct btrfs_root *root,
3075 u64 root_objectid, u64 ref_generation, 4220 u64 root_objectid, u64 owner, u64 offset,
3076 u64 owner, struct btrfs_key *ins) 4221 struct btrfs_key *ins)
3077{ 4222{
3078 int ret; 4223 int ret;
3079 struct btrfs_block_group_cache *block_group; 4224 struct btrfs_block_group_cache *block_group;
3080 4225
3081 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4226 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3082 mutex_lock(&block_group->cache_mutex); 4227 cache_block_group(block_group);
3083 cache_block_group(root, block_group); 4228 wait_event(block_group->caching_q,
3084 mutex_unlock(&block_group->cache_mutex); 4229 block_group_cache_done(block_group));
3085 4230
3086 ret = btrfs_remove_free_space(block_group, ins->objectid, 4231 ret = btrfs_remove_free_space(block_group, ins->objectid,
3087 ins->offset); 4232 ins->offset);
3088 BUG_ON(ret); 4233 BUG_ON(ret);
3089 btrfs_put_block_group(block_group); 4234 btrfs_put_block_group(block_group);
3090 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 4235 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
3091 ref_generation, owner, ins, 1); 4236 0, owner, offset, ins, 1);
3092 return ret; 4237 return ret;
3093} 4238}
3094 4239
@@ -3099,26 +4244,49 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3099 * 4244 *
3100 * returns 0 if everything worked, non-zero otherwise. 4245 * returns 0 if everything worked, non-zero otherwise.
3101 */ 4246 */
3102int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 4247static int alloc_tree_block(struct btrfs_trans_handle *trans,
3103 struct btrfs_root *root, 4248 struct btrfs_root *root,
3104 u64 num_bytes, u64 parent, u64 min_alloc_size, 4249 u64 num_bytes, u64 parent, u64 root_objectid,
3105 u64 root_objectid, u64 ref_generation, 4250 struct btrfs_disk_key *key, int level,
3106 u64 owner_objectid, u64 empty_size, u64 hint_byte, 4251 u64 empty_size, u64 hint_byte, u64 search_end,
3107 u64 search_end, struct btrfs_key *ins, u64 data) 4252 struct btrfs_key *ins)
3108{ 4253{
3109 int ret; 4254 int ret;
3110 ret = __btrfs_reserve_extent(trans, root, num_bytes, 4255 u64 flags = 0;
3111 min_alloc_size, empty_size, hint_byte, 4256
3112 search_end, ins, data); 4257 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
3113 BUG_ON(ret); 4258 empty_size, hint_byte, search_end,
4259 ins, 0);
4260 if (ret)
4261 return ret;
4262
4263 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4264 if (parent == 0)
4265 parent = ins->objectid;
4266 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4267 } else
4268 BUG_ON(parent > 0);
4269
4270 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3114 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4271 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3115 ret = btrfs_add_delayed_ref(trans, ins->objectid, 4272 struct btrfs_delayed_extent_op *extent_op;
3116 ins->offset, parent, root_objectid, 4273 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3117 ref_generation, owner_objectid, 4274 BUG_ON(!extent_op);
3118 BTRFS_ADD_DELAYED_EXTENT, 0); 4275 if (key)
4276 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4277 else
4278 memset(&extent_op->key, 0, sizeof(extent_op->key));
4279 extent_op->flags_to_set = flags;
4280 extent_op->update_key = 1;
4281 extent_op->update_flags = 1;
4282 extent_op->is_data = 0;
4283
4284 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4285 ins->offset, parent, root_objectid,
4286 level, BTRFS_ADD_DELAYED_EXTENT,
4287 extent_op);
3119 BUG_ON(ret); 4288 BUG_ON(ret);
3120 } 4289 }
3121 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3122 return ret; 4290 return ret;
3123} 4291}
3124 4292
@@ -3157,21 +4325,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3157 * returns the tree buffer or NULL. 4325 * returns the tree buffer or NULL.
3158 */ 4326 */
3159struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 4327struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3160 struct btrfs_root *root, 4328 struct btrfs_root *root, u32 blocksize,
3161 u32 blocksize, u64 parent, 4329 u64 parent, u64 root_objectid,
3162 u64 root_objectid, 4330 struct btrfs_disk_key *key, int level,
3163 u64 ref_generation, 4331 u64 hint, u64 empty_size)
3164 int level,
3165 u64 hint,
3166 u64 empty_size)
3167{ 4332{
3168 struct btrfs_key ins; 4333 struct btrfs_key ins;
3169 int ret; 4334 int ret;
3170 struct extent_buffer *buf; 4335 struct extent_buffer *buf;
3171 4336
3172 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, 4337 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
3173 root_objectid, ref_generation, level, 4338 key, level, empty_size, hint, (u64)-1, &ins);
3174 empty_size, hint, (u64)-1, &ins, 0);
3175 if (ret) { 4339 if (ret) {
3176 BUG_ON(ret > 0); 4340 BUG_ON(ret > 0);
3177 return ERR_PTR(ret); 4341 return ERR_PTR(ret);
@@ -3182,35 +4346,23 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3182 return buf; 4346 return buf;
3183} 4347}
3184 4348
4349#if 0
3185int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4350int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3186 struct btrfs_root *root, struct extent_buffer *leaf) 4351 struct btrfs_root *root, struct extent_buffer *leaf)
3187{ 4352{
3188 u64 leaf_owner; 4353 u64 disk_bytenr;
3189 u64 leaf_generation; 4354 u64 num_bytes;
3190 struct refsort *sorted;
3191 struct btrfs_key key; 4355 struct btrfs_key key;
3192 struct btrfs_file_extent_item *fi; 4356 struct btrfs_file_extent_item *fi;
4357 u32 nritems;
3193 int i; 4358 int i;
3194 int nritems;
3195 int ret; 4359 int ret;
3196 int refi = 0;
3197 int slot;
3198 4360
3199 BUG_ON(!btrfs_is_leaf(leaf)); 4361 BUG_ON(!btrfs_is_leaf(leaf));
3200 nritems = btrfs_header_nritems(leaf); 4362 nritems = btrfs_header_nritems(leaf);
3201 leaf_owner = btrfs_header_owner(leaf);
3202 leaf_generation = btrfs_header_generation(leaf);
3203 4363
3204 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3205 /* we do this loop twice. The first time we build a list
3206 * of the extents we have a reference on, then we sort the list
3207 * by bytenr. The second time around we actually do the
3208 * extent freeing.
3209 */
3210 for (i = 0; i < nritems; i++) { 4364 for (i = 0; i < nritems; i++) {
3211 u64 disk_bytenr;
3212 cond_resched(); 4365 cond_resched();
3213
3214 btrfs_item_key_to_cpu(leaf, &key, i); 4366 btrfs_item_key_to_cpu(leaf, &key, i);
3215 4367
3216 /* only extents have references, skip everything else */ 4368 /* only extents have references, skip everything else */
@@ -3230,42 +4382,11 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3230 if (disk_bytenr == 0) 4382 if (disk_bytenr == 0)
3231 continue; 4383 continue;
3232 4384
3233 sorted[refi].bytenr = disk_bytenr; 4385 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
3234 sorted[refi].slot = i; 4386 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
3235 refi++; 4387 leaf->start, 0, key.objectid, 0);
3236 }
3237
3238 if (refi == 0)
3239 goto out;
3240
3241 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3242
3243 for (i = 0; i < refi; i++) {
3244 u64 disk_bytenr;
3245
3246 disk_bytenr = sorted[i].bytenr;
3247 slot = sorted[i].slot;
3248
3249 cond_resched();
3250
3251 btrfs_item_key_to_cpu(leaf, &key, slot);
3252 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3253 continue;
3254
3255 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3256
3257 ret = btrfs_free_extent(trans, root, disk_bytenr,
3258 btrfs_file_extent_disk_num_bytes(leaf, fi),
3259 leaf->start, leaf_owner, leaf_generation,
3260 key.objectid, 0);
3261 BUG_ON(ret); 4388 BUG_ON(ret);
3262
3263 atomic_inc(&root->fs_info->throttle_gen);
3264 wake_up(&root->fs_info->transaction_throttle);
3265 cond_resched();
3266 } 4389 }
3267out:
3268 kfree(sorted);
3269 return 0; 4390 return 0;
3270} 4391}
3271 4392
@@ -3311,13 +4432,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3311 return 0; 4432 return 0;
3312} 4433}
3313 4434
4435
3314static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, 4436static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root, u64 start, 4437 struct btrfs_root *root, u64 start,
3316 u64 len, u32 *refs) 4438 u64 len, u32 *refs)
3317{ 4439{
3318 int ret; 4440 int ret;
3319 4441
3320 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); 4442 ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
3321 BUG_ON(ret); 4443 BUG_ON(ret);
3322 4444
3323#if 0 /* some debugging code in case we see problems here */ 4445#if 0 /* some debugging code in case we see problems here */
@@ -3352,6 +4474,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3352 return ret; 4474 return ret;
3353} 4475}
3354 4476
4477
3355/* 4478/*
3356 * this is used while deleting old snapshots, and it drops the refs 4479 * this is used while deleting old snapshots, and it drops the refs
3357 * on a whole subtree starting from a level 1 node. 4480 * on a whole subtree starting from a level 1 node.
@@ -3645,279 +4768,473 @@ out:
3645 cond_resched(); 4768 cond_resched();
3646 return 0; 4769 return 0;
3647} 4770}
4771#endif
4772
4773struct walk_control {
4774 u64 refs[BTRFS_MAX_LEVEL];
4775 u64 flags[BTRFS_MAX_LEVEL];
4776 struct btrfs_key update_progress;
4777 int stage;
4778 int level;
4779 int shared_level;
4780 int update_ref;
4781 int keep_locks;
4782};
4783
4784#define DROP_REFERENCE 1
4785#define UPDATE_BACKREF 2
3648 4786
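walk_control bundles everything the rewritten walker threads through the traversal: per-level ref counts and flags, a resume key, and a stage flag that flips between DROP_REFERENCE and UPDATE_BACKREF. The two stage transitions can be modeled on their own (an illustrative reduction, not the kernel logic verbatim):

#include <stdbool.h>

#define MAX_LEVEL	8
#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

struct walk_ctl {
	unsigned long long refs[MAX_LEVEL];	/* refcount per level */
	int stage;
	int level;
	int shared_level;	/* level where UPDATE_BACKREF began */
	bool update_ref;
};

/* going down: a shared block (refs > 1) ends the drop; if backrefs
 * below it must be rewritten first, switch stage instead of stopping */
bool down_should_stop(struct walk_ctl *wc)
{
	if (wc->stage == DROP_REFERENCE && wc->refs[wc->level] > 1) {
		if (wc->update_ref) {
			wc->stage = UPDATE_BACKREF;
			wc->shared_level = wc->level;
			return false;	/* keep walking, new purpose */
		}
		return true;		/* subtree stays alive: prune it */
	}
	return false;
}

/* coming back up past shared_level: backref fixing for that subtree
 * is complete, so resume dropping references */
void up_maybe_resume_drop(struct walk_ctl *wc)
{
	if (wc->stage == UPDATE_BACKREF && wc->level >= wc->shared_level) {
		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
	}
}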
3649/* 4787/*
3650 * helper function for drop_subtree, this function is similar to 4788 * helper to process tree block while walking down the tree.
3651 * walk_down_tree. The main difference is that it checks reference 4789 *
3652 * counts while tree blocks are locked. 4790 * when wc->stage == DROP_REFERENCE, this function checks
4791 * reference count of the block. if the block is shared and
 4792 * we need to update back refs for the subtree rooted at the
4793 * block, this function changes wc->stage to UPDATE_BACKREF
4794 *
4795 * when wc->stage == UPDATE_BACKREF, this function updates
4796 * back refs for pointers in the block.
4797 *
4798 * NOTE: return value 1 means we should stop walking down.
3653 */ 4799 */
3654static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, 4800static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
3655 struct btrfs_root *root, 4801 struct btrfs_root *root,
3656 struct btrfs_path *path, int *level) 4802 struct btrfs_path *path,
4803 struct walk_control *wc)
4804{
4805 int level = wc->level;
4806 struct extent_buffer *eb = path->nodes[level];
4807 struct btrfs_key key;
4808 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4809 int ret;
4810
4811 if (wc->stage == UPDATE_BACKREF &&
4812 btrfs_header_owner(eb) != root->root_key.objectid)
4813 return 1;
4814
4815 /*
4816 * when reference count of tree block is 1, it won't increase
4817 * again. once full backref flag is set, we never clear it.
4818 */
4819 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4820 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4821 BUG_ON(!path->locks[level]);
4822 ret = btrfs_lookup_extent_info(trans, root,
4823 eb->start, eb->len,
4824 &wc->refs[level],
4825 &wc->flags[level]);
4826 BUG_ON(ret);
4827 BUG_ON(wc->refs[level] == 0);
4828 }
4829
4830 if (wc->stage == DROP_REFERENCE &&
4831 wc->update_ref && wc->refs[level] > 1) {
4832 BUG_ON(eb == root->node);
4833 BUG_ON(path->slots[level] > 0);
4834 if (level == 0)
4835 btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4836 else
4837 btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4838 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4839 btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4840 wc->stage = UPDATE_BACKREF;
4841 wc->shared_level = level;
4842 }
4843 }
4844
4845 if (wc->stage == DROP_REFERENCE) {
4846 if (wc->refs[level] > 1)
4847 return 1;
4848
4849 if (path->locks[level] && !wc->keep_locks) {
4850 btrfs_tree_unlock(eb);
4851 path->locks[level] = 0;
4852 }
4853 return 0;
4854 }
4855
4856 /* wc->stage == UPDATE_BACKREF */
4857 if (!(wc->flags[level] & flag)) {
4858 BUG_ON(!path->locks[level]);
4859 ret = btrfs_inc_ref(trans, root, eb, 1);
4860 BUG_ON(ret);
4861 ret = btrfs_dec_ref(trans, root, eb, 0);
4862 BUG_ON(ret);
4863 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
4864 eb->len, flag, 0);
4865 BUG_ON(ret);
4866 wc->flags[level] |= flag;
4867 }
4868
4869 /*
4870 * the block is shared by multiple trees, so it's not good to
4871 * keep the tree lock
4872 */
4873 if (path->locks[level] && level > 0) {
4874 btrfs_tree_unlock(eb);
4875 path->locks[level] = 0;
4876 }
4877 return 0;
4878}
4879
4880/*
 4881 * helper to process tree block while walking up the tree.
4882 *
4883 * when wc->stage == DROP_REFERENCE, this function drops
4884 * reference count on the block.
4885 *
4886 * when wc->stage == UPDATE_BACKREF, this function changes
4887 * wc->stage back to DROP_REFERENCE if we changed wc->stage
4888 * to UPDATE_BACKREF previously while processing the block.
4889 *
4890 * NOTE: return value 1 means we should stop walking up.
4891 */
4892static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4893 struct btrfs_root *root,
4894 struct btrfs_path *path,
4895 struct walk_control *wc)
4896{
4897 int ret = 0;
4898 int level = wc->level;
4899 struct extent_buffer *eb = path->nodes[level];
4900 u64 parent = 0;
4901
4902 if (wc->stage == UPDATE_BACKREF) {
4903 BUG_ON(wc->shared_level < level);
4904 if (level < wc->shared_level)
4905 goto out;
4906
4907 BUG_ON(wc->refs[level] <= 1);
4908 ret = find_next_key(path, level + 1, &wc->update_progress);
4909 if (ret > 0)
4910 wc->update_ref = 0;
4911
4912 wc->stage = DROP_REFERENCE;
4913 wc->shared_level = -1;
4914 path->slots[level] = 0;
4915
4916 /*
4917 * check reference count again if the block isn't locked.
4918 * we should start walking down the tree again if reference
4919 * count is one.
4920 */
4921 if (!path->locks[level]) {
4922 BUG_ON(level == 0);
4923 btrfs_tree_lock(eb);
4924 btrfs_set_lock_blocking(eb);
4925 path->locks[level] = 1;
4926
4927 ret = btrfs_lookup_extent_info(trans, root,
4928 eb->start, eb->len,
4929 &wc->refs[level],
4930 &wc->flags[level]);
4931 BUG_ON(ret);
4932 BUG_ON(wc->refs[level] == 0);
4933 if (wc->refs[level] == 1) {
4934 btrfs_tree_unlock(eb);
4935 path->locks[level] = 0;
4936 return 1;
4937 }
4938 } else {
4939 BUG_ON(level != 0);
4940 }
4941 }
4942
4943 /* wc->stage == DROP_REFERENCE */
4944 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
4945
4946 if (wc->refs[level] == 1) {
4947 if (level == 0) {
4948 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4949 ret = btrfs_dec_ref(trans, root, eb, 1);
4950 else
4951 ret = btrfs_dec_ref(trans, root, eb, 0);
4952 BUG_ON(ret);
4953 }
4954 /* make block locked assertion in clean_tree_block happy */
4955 if (!path->locks[level] &&
4956 btrfs_header_generation(eb) == trans->transid) {
4957 btrfs_tree_lock(eb);
4958 btrfs_set_lock_blocking(eb);
4959 path->locks[level] = 1;
4960 }
4961 clean_tree_block(trans, root, eb);
4962 }
4963
4964 if (eb == root->node) {
4965 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4966 parent = eb->start;
4967 else
4968 BUG_ON(root->root_key.objectid !=
4969 btrfs_header_owner(eb));
4970 } else {
4971 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4972 parent = path->nodes[level + 1]->start;
4973 else
4974 BUG_ON(root->root_key.objectid !=
4975 btrfs_header_owner(path->nodes[level + 1]));
4976 }
4977
4978 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
4979 root->root_key.objectid, level, 0);
4980 BUG_ON(ret);
4981out:
4982 wc->refs[level] = 0;
4983 wc->flags[level] = 0;
4984 return ret;
4985}
4986
4987static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4988 struct btrfs_root *root,
4989 struct btrfs_path *path,
4990 struct walk_control *wc)
3657{ 4991{
3658 struct extent_buffer *next; 4992 struct extent_buffer *next;
3659 struct extent_buffer *cur; 4993 struct extent_buffer *cur;
3660 struct extent_buffer *parent;
3661 u64 bytenr; 4994 u64 bytenr;
3662 u64 ptr_gen; 4995 u64 ptr_gen;
3663 u32 blocksize; 4996 u32 blocksize;
3664 u32 refs; 4997 int level = wc->level;
3665 int ret; 4998 int ret;
3666 4999
3667 cur = path->nodes[*level]; 5000 while (level >= 0) {
3668 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, 5001 cur = path->nodes[level];
3669 &refs); 5002 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
3670 BUG_ON(ret);
3671 if (refs > 1)
3672 goto out;
3673 5003
3674 while (*level >= 0) { 5004 ret = walk_down_proc(trans, root, path, wc);
3675 cur = path->nodes[*level]; 5005 if (ret > 0)
3676 if (*level == 0) {
3677 ret = btrfs_drop_leaf_ref(trans, root, cur);
3678 BUG_ON(ret);
3679 clean_tree_block(trans, root, cur);
3680 break; 5006 break;
3681 } 5007
3682 if (path->slots[*level] >= btrfs_header_nritems(cur)) { 5008 if (level == 0)
3683 clean_tree_block(trans, root, cur);
3684 break; 5009 break;
3685 }
3686 5010
3687 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 5011 bytenr = btrfs_node_blockptr(cur, path->slots[level]);
3688 blocksize = btrfs_level_size(root, *level - 1); 5012 blocksize = btrfs_level_size(root, level - 1);
3689 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 5013 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
3690 5014
3691 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 5015 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3692 btrfs_tree_lock(next); 5016 btrfs_tree_lock(next);
3693 btrfs_set_lock_blocking(next); 5017 btrfs_set_lock_blocking(next);
3694 5018
3695 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 5019 level--;
3696 &refs); 5020 BUG_ON(level != btrfs_header_level(next));
3697 BUG_ON(ret); 5021 path->nodes[level] = next;
3698 if (refs > 1) { 5022 path->slots[level] = 0;
3699 parent = path->nodes[*level]; 5023 path->locks[level] = 1;
3700 ret = btrfs_free_extent(trans, root, bytenr, 5024 wc->level = level;
3701 blocksize, parent->start,
3702 btrfs_header_owner(parent),
3703 btrfs_header_generation(parent),
3704 *level - 1, 1);
3705 BUG_ON(ret);
3706 path->slots[*level]++;
3707 btrfs_tree_unlock(next);
3708 free_extent_buffer(next);
3709 continue;
3710 }
3711
3712 *level = btrfs_header_level(next);
3713 path->nodes[*level] = next;
3714 path->slots[*level] = 0;
3715 path->locks[*level] = 1;
3716 cond_resched();
3717 }
3718out:
3719 parent = path->nodes[*level + 1];
3720 bytenr = path->nodes[*level]->start;
3721 blocksize = path->nodes[*level]->len;
3722
3723 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3724 parent->start, btrfs_header_owner(parent),
3725 btrfs_header_generation(parent), *level, 1);
3726 BUG_ON(ret);
3727
3728 if (path->locks[*level]) {
3729 btrfs_tree_unlock(path->nodes[*level]);
3730 path->locks[*level] = 0;
3731 } 5025 }
3732 free_extent_buffer(path->nodes[*level]);
3733 path->nodes[*level] = NULL;
3734 *level += 1;
3735 cond_resched();
3736 return 0; 5026 return 0;
3737} 5027}
3738 5028
3739/*
3740 * helper for dropping snapshots. This walks back up the tree in the path
3741 * to find the first node higher up where we haven't yet gone through
3742 * all the slots
3743 */
3744static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 5029static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root, 5030 struct btrfs_root *root,
3746 struct btrfs_path *path, 5031 struct btrfs_path *path,
3747 int *level, int max_level) 5032 struct walk_control *wc, int max_level)
3748{ 5033{
3749 u64 root_owner; 5034 int level = wc->level;
3750 u64 root_gen;
3751 struct btrfs_root_item *root_item = &root->root_item;
3752 int i;
3753 int slot;
3754 int ret; 5035 int ret;
3755 5036
3756 for (i = *level; i < max_level && path->nodes[i]; i++) { 5037 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
3757 slot = path->slots[i]; 5038 while (level < max_level && path->nodes[level]) {
3758 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 5039 wc->level = level;
3759 struct extent_buffer *node; 5040 if (path->slots[level] + 1 <
3760 struct btrfs_disk_key disk_key; 5041 btrfs_header_nritems(path->nodes[level])) {
3761 5042 path->slots[level]++;
3762 /*
3763 * there is more work to do in this level.
3764 * Update the drop_progress marker to reflect
3765 * the work we've done so far, and then bump
3766 * the slot number
3767 */
3768 node = path->nodes[i];
3769 path->slots[i]++;
3770 *level = i;
3771 WARN_ON(*level == 0);
3772 btrfs_node_key(node, &disk_key, path->slots[i]);
3773 memcpy(&root_item->drop_progress,
3774 &disk_key, sizeof(disk_key));
3775 root_item->drop_level = i;
3776 return 0; 5043 return 0;
3777 } else { 5044 } else {
3778 struct extent_buffer *parent; 5045 ret = walk_up_proc(trans, root, path, wc);
3779 5046 if (ret > 0)
3780 /* 5047 return 0;
3781 * this whole node is done, free our reference
3782 * on it and go up one level
3783 */
3784 if (path->nodes[*level] == root->node)
3785 parent = path->nodes[*level];
3786 else
3787 parent = path->nodes[*level + 1];
3788
3789 root_owner = btrfs_header_owner(parent);
3790 root_gen = btrfs_header_generation(parent);
3791 5048
3792 clean_tree_block(trans, root, path->nodes[*level]); 5049 if (path->locks[level]) {
3793 ret = btrfs_free_extent(trans, root, 5050 btrfs_tree_unlock(path->nodes[level]);
3794 path->nodes[*level]->start, 5051 path->locks[level] = 0;
3795 path->nodes[*level]->len,
3796 parent->start, root_owner,
3797 root_gen, *level, 1);
3798 BUG_ON(ret);
3799 if (path->locks[*level]) {
3800 btrfs_tree_unlock(path->nodes[*level]);
3801 path->locks[*level] = 0;
3802 } 5052 }
3803 free_extent_buffer(path->nodes[*level]); 5053 free_extent_buffer(path->nodes[level]);
3804 path->nodes[*level] = NULL; 5054 path->nodes[level] = NULL;
3805 *level = i + 1; 5055 level++;
3806 } 5056 }
3807 } 5057 }
3808 return 1; 5058 return 1;
3809} 5059}
3810 5060
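Taken together, the new walk_down_tree() and walk_up_tree() are an iterative post-order traversal with path->nodes[] and path->slots[] as the explicit stack: children are visited first, and a block is freed only on the way back up, once everything below it is gone. The control flow in miniature, over a toy heap-shaped tree (runnable; prints nodes in the order they would be dropped):

#include <stdio.h>

#define N 7	/* node i has children 2i+1 and 2i+2 while they are < N */

int main(void)
{
	int node[8] = {0};	/* current node per depth (path->nodes) */
	int slot[8] = {0};	/* next child per depth (path->slots) */
	int depth = 0;

	for (;;) {
		int cur = node[depth];
		int child = 2 * cur + 1 + slot[depth];

		if (slot[depth] < 2 && child < N) {
			/* walk down: descend into the next child */
			slot[depth]++;
			depth++;
			node[depth] = child;
			slot[depth] = 0;
		} else {
			/* walk up: all children done; drop this block */
			printf("drop node %d\n", cur);
			if (depth == 0)
				break;
			depth--;
		}
	}
	return 0;
}

The output is 3 4 1 5 6 2 0 — strictly bottom-up, which is what lets the real walker free each extent safely as it climbs.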
3811/* 5061/*
3812 * drop the reference count on the tree rooted at 'snap'. This traverses 5062 * drop a subvolume tree.
3813 * the tree freeing any blocks that have a ref count of zero after being 5063 *
3814 * decremented. 5064 * this function traverses the tree freeing any blocks that only
5065 * referenced by the tree.
5066 *
5067 * when a shared tree block is found. this function decreases its
5068 * reference count by one. if update_ref is true, this function
5069 * also make sure backrefs for the shared block and all lower level
5070 * blocks are properly updated.
3815 */ 5071 */
3816int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 5072int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
3817 *root)
3818{ 5073{
3819 int ret = 0;
3820 int wret;
3821 int level;
3822 struct btrfs_path *path; 5074 struct btrfs_path *path;
3823 int i; 5075 struct btrfs_trans_handle *trans;
3824 int orig_level; 5076 struct btrfs_root *tree_root = root->fs_info->tree_root;
3825 int update_count;
3826 struct btrfs_root_item *root_item = &root->root_item; 5077 struct btrfs_root_item *root_item = &root->root_item;
5078 struct walk_control *wc;
5079 struct btrfs_key key;
5080 int err = 0;
5081 int ret;
5082 int level;
3827 5083
3828 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3829 path = btrfs_alloc_path(); 5084 path = btrfs_alloc_path();
3830 BUG_ON(!path); 5085 BUG_ON(!path);
3831 5086
3832 level = btrfs_header_level(root->node); 5087 wc = kzalloc(sizeof(*wc), GFP_NOFS);
3833 orig_level = level; 5088 BUG_ON(!wc);
5089
5090 trans = btrfs_start_transaction(tree_root, 1);
5091
3834 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5092 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3835 path->nodes[level] = root->node; 5093 level = btrfs_header_level(root->node);
3836 extent_buffer_get(root->node); 5094 path->nodes[level] = btrfs_lock_root_node(root);
5095 btrfs_set_lock_blocking(path->nodes[level]);
3837 path->slots[level] = 0; 5096 path->slots[level] = 0;
5097 path->locks[level] = 1;
5098 memset(&wc->update_progress, 0,
5099 sizeof(wc->update_progress));
3838 } else { 5100 } else {
3839 struct btrfs_key key;
3840 struct btrfs_disk_key found_key;
3841 struct extent_buffer *node;
3842
3843 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5101 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5102 memcpy(&wc->update_progress, &key,
5103 sizeof(wc->update_progress));
5104
3844 level = root_item->drop_level; 5105 level = root_item->drop_level;
5106 BUG_ON(level == 0);
3845 path->lowest_level = level; 5107 path->lowest_level = level;
3846 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5108 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3847 if (wret < 0) { 5109 path->lowest_level = 0;
3848 ret = wret; 5110 if (ret < 0) {
5111 err = ret;
3849 goto out; 5112 goto out;
3850 } 5113 }
3851 node = path->nodes[level]; 5114 btrfs_node_key_to_cpu(path->nodes[level], &key,
3852 btrfs_node_key(node, &found_key, path->slots[level]); 5115 path->slots[level]);
3853 WARN_ON(memcmp(&found_key, &root_item->drop_progress, 5116 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
3854 sizeof(found_key))); 5117
3855 /* 5118 /*
3856 * unlock our path, this is safe because only this 5119 * unlock our path, this is safe because only this
3857 * function is allowed to delete this snapshot 5120 * function is allowed to delete this snapshot
3858 */ 5121 */
3859 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 5122 btrfs_unlock_up_safe(path, 0);
3860 if (path->nodes[i] && path->locks[i]) { 5123
3861 path->locks[i] = 0; 5124 level = btrfs_header_level(root->node);
3862 btrfs_tree_unlock(path->nodes[i]); 5125 while (1) {
3863 } 5126 btrfs_tree_lock(path->nodes[level]);
5127 btrfs_set_lock_blocking(path->nodes[level]);
5128
5129 ret = btrfs_lookup_extent_info(trans, root,
5130 path->nodes[level]->start,
5131 path->nodes[level]->len,
5132 &wc->refs[level],
5133 &wc->flags[level]);
5134 BUG_ON(ret);
5135 BUG_ON(wc->refs[level] == 0);
5136
5137 if (level == root_item->drop_level)
5138 break;
5139
5140 btrfs_tree_unlock(path->nodes[level]);
5141 WARN_ON(wc->refs[level] != 1);
5142 level--;
3864 } 5143 }
3865 } 5144 }
5145
5146 wc->level = level;
5147 wc->shared_level = -1;
5148 wc->stage = DROP_REFERENCE;
5149 wc->update_ref = update_ref;
5150 wc->keep_locks = 0;
5151
3866 while (1) { 5152 while (1) {
3867 unsigned long update; 5153 ret = walk_down_tree(trans, root, path, wc);
3868 wret = walk_down_tree(trans, root, path, &level); 5154 if (ret < 0) {
3869 if (wret > 0) 5155 err = ret;
3870 break; 5156 break;
3871 if (wret < 0) 5157 }
3872 ret = wret;
3873 5158
3874 wret = walk_up_tree(trans, root, path, &level, 5159 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
3875 BTRFS_MAX_LEVEL); 5160 if (ret < 0) {
3876 if (wret > 0) 5161 err = ret;
3877 break; 5162 break;
3878 if (wret < 0) 5163 }
3879 ret = wret; 5164
3880 if (trans->transaction->in_commit || 5165 if (ret > 0) {
3881 trans->transaction->delayed_refs.flushing) { 5166 BUG_ON(wc->stage != DROP_REFERENCE);
3882 ret = -EAGAIN;
3883 break; 5167 break;
3884 } 5168 }
3885 atomic_inc(&root->fs_info->throttle_gen); 5169
3886 wake_up(&root->fs_info->transaction_throttle); 5170 if (wc->stage == DROP_REFERENCE) {
3887 for (update_count = 0; update_count < 16; update_count++) { 5171 level = wc->level;
5172 btrfs_node_key(path->nodes[level],
5173 &root_item->drop_progress,
5174 path->slots[level]);
5175 root_item->drop_level = level;
5176 }
5177
5178 BUG_ON(wc->level == 0);
5179 if (trans->transaction->in_commit ||
5180 trans->transaction->delayed_refs.flushing) {
5181 ret = btrfs_update_root(trans, tree_root,
5182 &root->root_key,
5183 root_item);
5184 BUG_ON(ret);
5185
5186 btrfs_end_transaction(trans, tree_root);
5187 trans = btrfs_start_transaction(tree_root, 1);
5188 } else {
5189 unsigned long update;
3888 update = trans->delayed_ref_updates; 5190 update = trans->delayed_ref_updates;
3889 trans->delayed_ref_updates = 0; 5191 trans->delayed_ref_updates = 0;
3890 if (update) 5192 if (update)
3891 btrfs_run_delayed_refs(trans, root, update); 5193 btrfs_run_delayed_refs(trans, tree_root,
3892 else 5194 update);
3893 break;
3894 }
3895 }
3896 for (i = 0; i <= orig_level; i++) {
3897 if (path->nodes[i]) {
3898 free_extent_buffer(path->nodes[i]);
3899 path->nodes[i] = NULL;
3900 } 5195 }
3901 } 5196 }
5197 btrfs_release_path(root, path);
5198 BUG_ON(err);
5199
5200 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5201 BUG_ON(ret);
5202
5203 free_extent_buffer(root->node);
5204 free_extent_buffer(root->commit_root);
5205 kfree(root);
3902out: 5206out:
5207 btrfs_end_transaction(trans, tree_root);
5208 kfree(wc);
3903 btrfs_free_path(path); 5209 btrfs_free_path(path);
3904 return ret; 5210 return err;
3905} 5211}
3906 5212
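The rewritten btrfs_drop_snapshot() also changes who owns the transaction: instead of bailing out with -EAGAIN when a commit is pending, it records drop_progress/drop_level in the root item, ends the transaction, and starts a new one, so deleting a huge snapshot can span many transactions and resume after a crash. The checkpoint-and-resume shape, reduced to standalone C (all three helpers are stand-ins: persist() for btrfs_update_root(), the other two hypothetical):

struct progress {
	unsigned long long next_key;	/* models drop_progress */
	int level;			/* models drop_level */
};

void persist(const struct progress *p);	/* durable resume point */
int delete_some(struct progress *p);	/* bounded chunk of work;
					   returns nonzero when done */
int commit_pending(void);		/* transaction wants to commit? */

void drop_snapshot(struct progress *p)
{
	while (!delete_some(p)) {
		if (commit_pending()) {
			persist(p);
			/* the real code ends the transaction here and
			 * starts a fresh one before continuing */
		}
	}
}

After a crash, the caller reloads the persisted progress and calls drop_snapshot() again, which is exactly how the drop_progress key in the root item is used above.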
5213/*
5214 * drop subtree rooted at tree block 'node'.
5215 *
5216 * NOTE: this function will unlock and release tree block 'node'
5217 */
3907int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 5218int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3908 struct btrfs_root *root, 5219 struct btrfs_root *root,
3909 struct extent_buffer *node, 5220 struct extent_buffer *node,
3910 struct extent_buffer *parent) 5221 struct extent_buffer *parent)
3911{ 5222{
3912 struct btrfs_path *path; 5223 struct btrfs_path *path;
5224 struct walk_control *wc;
3913 int level; 5225 int level;
3914 int parent_level; 5226 int parent_level;
3915 int ret = 0; 5227 int ret = 0;
3916 int wret; 5228 int wret;
3917 5229
5230 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5231
3918 path = btrfs_alloc_path(); 5232 path = btrfs_alloc_path();
3919 BUG_ON(!path); 5233 BUG_ON(!path);
3920 5234
5235 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5236 BUG_ON(!wc);
5237
3921 btrfs_assert_tree_locked(parent); 5238 btrfs_assert_tree_locked(parent);
3922 parent_level = btrfs_header_level(parent); 5239 parent_level = btrfs_header_level(parent);
3923 extent_buffer_get(parent); 5240 extent_buffer_get(parent);
@@ -3926,28 +5243,38 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3926 5243
3927 btrfs_assert_tree_locked(node); 5244 btrfs_assert_tree_locked(node);
3928 level = btrfs_header_level(node); 5245 level = btrfs_header_level(node);
3929 extent_buffer_get(node);
3930 path->nodes[level] = node; 5246 path->nodes[level] = node;
3931 path->slots[level] = 0; 5247 path->slots[level] = 0;
5248 path->locks[level] = 1;
5249
5250 wc->refs[parent_level] = 1;
5251 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5252 wc->level = level;
5253 wc->shared_level = -1;
5254 wc->stage = DROP_REFERENCE;
5255 wc->update_ref = 0;
5256 wc->keep_locks = 1;
3932 5257
3933 while (1) { 5258 while (1) {
3934 wret = walk_down_subtree(trans, root, path, &level); 5259 wret = walk_down_tree(trans, root, path, wc);
3935 if (wret < 0) 5260 if (wret < 0) {
3936 ret = wret; 5261 ret = wret;
3937 if (wret != 0)
3938 break; 5262 break;
5263 }
3939 5264
3940 wret = walk_up_tree(trans, root, path, &level, parent_level); 5265 wret = walk_up_tree(trans, root, path, wc, parent_level);
3941 if (wret < 0) 5266 if (wret < 0)
3942 ret = wret; 5267 ret = wret;
3943 if (wret != 0) 5268 if (wret != 0)
3944 break; 5269 break;
3945 } 5270 }
3946 5271
5272 kfree(wc);
3947 btrfs_free_path(path); 5273 btrfs_free_path(path);
3948 return ret; 5274 return ret;
3949} 5275}
3950 5276
5277#if 0
3951static unsigned long calc_ra(unsigned long start, unsigned long last, 5278static unsigned long calc_ra(unsigned long start, unsigned long last,
3952 unsigned long nr) 5279 unsigned long nr)
3953{ 5280{
@@ -5429,6 +6756,7 @@ out:
5429 kfree(ref_path); 6756 kfree(ref_path);
5430 return ret; 6757 return ret;
5431} 6758}
6759#endif
5432 6760
5433static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 6761static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5434{ 6762{
@@ -5477,7 +6805,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5477 u64 calc; 6805 u64 calc;
5478 6806
5479 spin_lock(&shrink_block_group->lock); 6807 spin_lock(&shrink_block_group->lock);
5480 if (btrfs_block_group_used(&shrink_block_group->item) > 0) { 6808 if (btrfs_block_group_used(&shrink_block_group->item) +
6809 shrink_block_group->reserved > 0) {
5481 spin_unlock(&shrink_block_group->lock); 6810 spin_unlock(&shrink_block_group->lock);
5482 6811
5483 trans = btrfs_start_transaction(root, 1); 6812 trans = btrfs_start_transaction(root, 1);
@@ -5502,6 +6831,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5502 return 0; 6831 return 0;
5503} 6832}
5504 6833
6834
6835int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6836 struct btrfs_block_group_cache *group)
6837
6838{
6839 __alloc_chunk_for_shrink(root, group, 1);
6840 set_block_group_readonly(group);
6841 return 0;
6842}
6843
6844#if 0
5505static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 6845static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5506 struct btrfs_root *root, 6846 struct btrfs_root *root,
5507 u64 objectid, u64 size) 6847 u64 objectid, u64 size)
@@ -5781,6 +7121,7 @@ out:
5781 btrfs_free_path(path); 7121 btrfs_free_path(path);
5782 return ret; 7122 return ret;
5783} 7123}
7124#endif
5784 7125
5785static int find_first_block_group(struct btrfs_root *root, 7126static int find_first_block_group(struct btrfs_root *root,
5786 struct btrfs_path *path, struct btrfs_key *key) 7127 struct btrfs_path *path, struct btrfs_key *key)
@@ -5833,11 +7174,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
5833 &info->block_group_cache_tree); 7174 &info->block_group_cache_tree);
5834 spin_unlock(&info->block_group_cache_lock); 7175 spin_unlock(&info->block_group_cache_lock);
5835 7176
5836 btrfs_remove_free_space_cache(block_group);
5837 down_write(&block_group->space_info->groups_sem); 7177 down_write(&block_group->space_info->groups_sem);
5838 list_del(&block_group->list); 7178 list_del(&block_group->list);
5839 up_write(&block_group->space_info->groups_sem); 7179 up_write(&block_group->space_info->groups_sem);
5840 7180
7181 if (block_group->cached == BTRFS_CACHE_STARTED)
7182 wait_event(block_group->caching_q,
7183 block_group_cache_done(block_group));
7184
7185 btrfs_remove_free_space_cache(block_group);
7186
5841 WARN_ON(atomic_read(&block_group->count) != 1); 7187 WARN_ON(atomic_read(&block_group->count) != 1);
5842 kfree(block_group); 7188 kfree(block_group);
5843 7189
@@ -5903,9 +7249,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5903 atomic_set(&cache->count, 1); 7249 atomic_set(&cache->count, 1);
5904 spin_lock_init(&cache->lock); 7250 spin_lock_init(&cache->lock);
5905 spin_lock_init(&cache->tree_lock); 7251 spin_lock_init(&cache->tree_lock);
5906 mutex_init(&cache->cache_mutex); 7252 cache->fs_info = info;
7253 init_waitqueue_head(&cache->caching_q);
5907 INIT_LIST_HEAD(&cache->list); 7254 INIT_LIST_HEAD(&cache->list);
5908 INIT_LIST_HEAD(&cache->cluster_list); 7255 INIT_LIST_HEAD(&cache->cluster_list);
7256
7257 /*
7258 * we only want to have 32k of ram per block group for keeping
7259 * track of free space, and if we pass 1/2 of that we want to
7260 * start converting things over to using bitmaps
7261 */
7262 cache->extents_thresh = ((1024 * 32) / 2) /
7263 sizeof(struct btrfs_free_space);
7264
5909 read_extent_buffer(leaf, &cache->item, 7265 read_extent_buffer(leaf, &cache->item,
5910 btrfs_item_ptr_offset(leaf, path->slots[0]), 7266 btrfs_item_ptr_offset(leaf, path->slots[0]),
5911 sizeof(cache->item)); 7267 sizeof(cache->item));
@@ -5914,6 +7270,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5914 key.objectid = found_key.objectid + found_key.offset; 7270 key.objectid = found_key.objectid + found_key.offset;
5915 btrfs_release_path(root, path); 7271 btrfs_release_path(root, path);
5916 cache->flags = btrfs_block_group_flags(&cache->item); 7272 cache->flags = btrfs_block_group_flags(&cache->item);
7273 cache->sectorsize = root->sectorsize;
7274
7275 remove_sb_from_cache(root, cache);
7276
7277 /*
 7278 * check for two cases: either we are full, and therefore
 7279 * don't need to bother with the caching work since we won't
 7280 * find any space, or we are empty, and we can just add all
 7281 * the space in and be done with it. This saves us a lot of
7282 * time, particularly in the full case.
7283 */
7284 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7285 cache->cached = BTRFS_CACHE_FINISHED;
7286 } else if (btrfs_block_group_used(&cache->item) == 0) {
7287 cache->cached = BTRFS_CACHE_FINISHED;
7288 add_new_free_space(cache, root->fs_info,
7289 found_key.objectid,
7290 found_key.objectid +
7291 found_key.offset);
7292 }
5917 7293
5918 ret = update_space_info(info, cache->flags, found_key.offset, 7294 ret = update_space_info(info, cache->flags, found_key.offset,
5919 btrfs_block_group_used(&cache->item), 7295 btrfs_block_group_used(&cache->item),
@@ -5957,10 +7333,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5957 cache->key.objectid = chunk_offset; 7333 cache->key.objectid = chunk_offset;
5958 cache->key.offset = size; 7334 cache->key.offset = size;
5959 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7335 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7336 cache->sectorsize = root->sectorsize;
7337
7338 /*
7339 * we only want to have 32k of ram per block group for keeping track
7340 * of free space, and if we pass 1/2 of that we want to start
7341 * converting things over to using bitmaps
7342 */
7343 cache->extents_thresh = ((1024 * 32) / 2) /
7344 sizeof(struct btrfs_free_space);
5960 atomic_set(&cache->count, 1); 7345 atomic_set(&cache->count, 1);
5961 spin_lock_init(&cache->lock); 7346 spin_lock_init(&cache->lock);
5962 spin_lock_init(&cache->tree_lock); 7347 spin_lock_init(&cache->tree_lock);
5963 mutex_init(&cache->cache_mutex); 7348 init_waitqueue_head(&cache->caching_q);
5964 INIT_LIST_HEAD(&cache->list); 7349 INIT_LIST_HEAD(&cache->list);
5965 INIT_LIST_HEAD(&cache->cluster_list); 7350 INIT_LIST_HEAD(&cache->cluster_list);
5966 7351
@@ -5969,6 +7354,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5969 cache->flags = type; 7354 cache->flags = type;
5970 btrfs_set_block_group_flags(&cache->item, type); 7355 btrfs_set_block_group_flags(&cache->item, type);
5971 7356
7357 cache->cached = BTRFS_CACHE_FINISHED;
7358 remove_sb_from_cache(root, cache);
7359
7360 add_new_free_space(cache, root->fs_info, chunk_offset,
7361 chunk_offset + size);
7362
5972 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7363 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5973 &cache->space_info); 7364 &cache->space_info);
5974 BUG_ON(ret); 7365 BUG_ON(ret);
@@ -6027,7 +7418,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6027 rb_erase(&block_group->cache_node, 7418 rb_erase(&block_group->cache_node,
6028 &root->fs_info->block_group_cache_tree); 7419 &root->fs_info->block_group_cache_tree);
6029 spin_unlock(&root->fs_info->block_group_cache_lock); 7420 spin_unlock(&root->fs_info->block_group_cache_lock);
6030 btrfs_remove_free_space_cache(block_group); 7421
6031 down_write(&block_group->space_info->groups_sem); 7422 down_write(&block_group->space_info->groups_sem);
6032 /* 7423 /*
6033 * we must use list_del_init so people can check to see if they 7424 * we must use list_del_init so people can check to see if they
@@ -6036,11 +7427,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6036 list_del_init(&block_group->list); 7427 list_del_init(&block_group->list);
6037 up_write(&block_group->space_info->groups_sem); 7428 up_write(&block_group->space_info->groups_sem);
6038 7429
7430 if (block_group->cached == BTRFS_CACHE_STARTED)
7431 wait_event(block_group->caching_q,
7432 block_group_cache_done(block_group));
7433
7434 btrfs_remove_free_space_cache(block_group);
7435
6039 spin_lock(&block_group->space_info->lock); 7436 spin_lock(&block_group->space_info->lock);
6040 block_group->space_info->total_bytes -= block_group->key.offset; 7437 block_group->space_info->total_bytes -= block_group->key.offset;
6041 block_group->space_info->bytes_readonly -= block_group->key.offset; 7438 block_group->space_info->bytes_readonly -= block_group->key.offset;
6042 spin_unlock(&block_group->space_info->lock); 7439 spin_unlock(&block_group->space_info->lock);
6043 block_group->space_info->full = 0; 7440
7441 btrfs_clear_space_info_full(root->fs_info);
6044 7442
6045 btrfs_put_block_group(block_group); 7443 btrfs_put_block_group(block_group);
6046 btrfs_put_block_group(block_group); 7444 btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe9eb990e443..68260180f587 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
476 struct extent_state *state; 476 struct extent_state *state;
477 struct extent_state *prealloc = NULL; 477 struct extent_state *prealloc = NULL;
478 struct rb_node *node; 478 struct rb_node *node;
479 u64 last_end;
479 int err; 480 int err;
480 int set = 0; 481 int set = 0;
481 482
@@ -498,6 +499,7 @@ again:
498 if (state->start > end) 499 if (state->start > end)
499 goto out; 500 goto out;
500 WARN_ON(state->end < start); 501 WARN_ON(state->end < start);
502 last_end = state->end;
501 503
502 /* 504 /*
503 * | ---- desired range ---- | 505 * | ---- desired range ---- |
@@ -524,9 +526,11 @@ again:
524 if (err) 526 if (err)
525 goto out; 527 goto out;
526 if (state->end <= end) { 528 if (state->end <= end) {
527 start = state->end + 1;
528 set |= clear_state_bit(tree, state, bits, 529 set |= clear_state_bit(tree, state, bits,
529 wake, delete); 530 wake, delete);
531 if (last_end == (u64)-1)
532 goto out;
533 start = last_end + 1;
530 } else { 534 } else {
531 start = state->start; 535 start = state->start;
532 } 536 }
@@ -552,8 +556,10 @@ again:
552 goto out; 556 goto out;
553 } 557 }
554 558
555 start = state->end + 1;
556 set |= clear_state_bit(tree, state, bits, wake, delete); 559 set |= clear_state_bit(tree, state, bits, wake, delete);
560 if (last_end == (u64)-1)
561 goto out;
562 start = last_end + 1;
557 goto search_again; 563 goto search_again;
558 564
559out: 565out:
@@ -707,8 +713,10 @@ again:
707 goto out; 713 goto out;
708 } 714 }
709 set_state_bits(tree, state, bits); 715 set_state_bits(tree, state, bits);
710 start = state->end + 1;
711 merge_state(tree, state); 716 merge_state(tree, state);
717 if (last_end == (u64)-1)
718 goto out;
719 start = last_end + 1;
712 goto search_again; 720 goto search_again;
713 } 721 }
714 722
@@ -742,8 +750,10 @@ again:
742 goto out; 750 goto out;
743 if (state->end <= end) { 751 if (state->end <= end) {
744 set_state_bits(tree, state, bits); 752 set_state_bits(tree, state, bits);
745 start = state->end + 1;
746 merge_state(tree, state); 753 merge_state(tree, state);
754 if (last_end == (u64)-1)
755 goto out;
756 start = last_end + 1;
747 } else { 757 } else {
748 start = state->start; 758 start = state->start;
749 } 759 }
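
The recurring change in these extent_io.c hunks is the same defensive move each time: clear_state_bit() (and merge_state() on the set side) can merge or free the extent_state, so state->end must not be read afterwards; the value is snapshotted into last_end first, and the (u64)-1 check stops last_end + 1 from wrapping to zero when a range runs to the end of the address space. A condensed sketch of the pattern, with hypothetical names:

#include <stdlib.h>

struct state { unsigned long long start, end; };

/* stands in for clear_state_bit(): may free the object it is given */
static void clear_and_maybe_free(struct state *s)
{
        free(s);
}

static unsigned long long next_search_start(struct state *s)
{
        unsigned long long last_end = s->end;   /* snapshot before the call */

        clear_and_maybe_free(s);                /* s is dead from here on */
        if (last_end == (unsigned long long)-1)
                return 0;                       /* would wrap; caller goes to out */
        return last_end + 1;                    /* safe: uses the snapshot */
}

int main(void)
{
        struct state *s = malloc(sizeof(*s));

        s->start = 0;
        s->end = 4095;
        return next_search_start(s) == 4096 ? 0 : 1;
}
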
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc38bb49..4b833972273a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,7 +22,6 @@
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
27#include <linux/mpage.h> 26#include <linux/mpage.h>
28#include <linux/swap.h> 27#include <linux/swap.h>
@@ -151,7 +150,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
151 } 150 }
152 if (end_pos > isize) { 151 if (end_pos > isize) {
153 i_size_write(inode, end_pos); 152 i_size_write(inode, end_pos);
154 btrfs_update_inode(trans, root, inode); 153 /* we've only changed i_size in ram, and we haven't updated
154 * the disk i_size. There is no need to log the inode
155 * at this time.
156 */
155 } 157 }
156 err = btrfs_end_transaction(trans, root); 158 err = btrfs_end_transaction(trans, root);
157out_unlock: 159out_unlock:
@@ -291,16 +293,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
291{ 293{
292 u64 extent_end = 0; 294 u64 extent_end = 0;
293 u64 search_start = start; 295 u64 search_start = start;
294 u64 leaf_start;
295 u64 ram_bytes = 0; 296 u64 ram_bytes = 0;
296 u64 orig_parent = 0;
297 u64 disk_bytenr = 0; 297 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end; 298 u64 orig_locked_end = locked_end;
299 u8 compression; 299 u8 compression;
300 u8 encryption; 300 u8 encryption;
301 u16 other_encoding = 0; 301 u16 other_encoding = 0;
302 u64 root_gen;
303 u64 root_owner;
304 struct extent_buffer *leaf; 302 struct extent_buffer *leaf;
305 struct btrfs_file_extent_item *extent; 303 struct btrfs_file_extent_item *extent;
306 struct btrfs_path *path; 304 struct btrfs_path *path;
@@ -340,9 +338,6 @@ next_slot:
340 bookend = 0; 338 bookend = 0;
341 found_extent = 0; 339 found_extent = 0;
342 found_inline = 0; 340 found_inline = 0;
343 leaf_start = 0;
344 root_gen = 0;
345 root_owner = 0;
346 compression = 0; 341 compression = 0;
347 encryption = 0; 342 encryption = 0;
348 extent = NULL; 343 extent = NULL;
@@ -417,9 +412,6 @@ next_slot:
417 if (found_extent) { 412 if (found_extent) {
418 read_extent_buffer(leaf, &old, (unsigned long)extent, 413 read_extent_buffer(leaf, &old, (unsigned long)extent,
419 sizeof(old)); 414 sizeof(old));
420 root_gen = btrfs_header_generation(leaf);
421 root_owner = btrfs_header_owner(leaf);
422 leaf_start = leaf->start;
423 } 415 }
424 416
425 if (end < extent_end && end >= key.offset) { 417 if (end < extent_end && end >= key.offset) {
@@ -443,14 +435,14 @@ next_slot:
443 } 435 }
444 locked_end = extent_end; 436 locked_end = extent_end;
445 } 437 }
446 orig_parent = path->nodes[0]->start;
447 disk_bytenr = le64_to_cpu(old.disk_bytenr); 438 disk_bytenr = le64_to_cpu(old.disk_bytenr);
448 if (disk_bytenr != 0) { 439 if (disk_bytenr != 0) {
449 ret = btrfs_inc_extent_ref(trans, root, 440 ret = btrfs_inc_extent_ref(trans, root,
450 disk_bytenr, 441 disk_bytenr,
451 le64_to_cpu(old.disk_num_bytes), 442 le64_to_cpu(old.disk_num_bytes), 0,
452 orig_parent, root->root_key.objectid, 443 root->root_key.objectid,
453 trans->transid, inode->i_ino); 444 key.objectid, key.offset -
445 le64_to_cpu(old.offset));
454 BUG_ON(ret); 446 BUG_ON(ret);
455 } 447 }
456 } 448 }
@@ -568,17 +560,6 @@ next_slot:
568 btrfs_mark_buffer_dirty(path->nodes[0]); 560 btrfs_mark_buffer_dirty(path->nodes[0]);
569 btrfs_set_lock_blocking(path->nodes[0]); 561 btrfs_set_lock_blocking(path->nodes[0]);
570 562
571 if (disk_bytenr != 0) {
572 ret = btrfs_update_extent_ref(trans, root,
573 disk_bytenr,
574 le64_to_cpu(old.disk_num_bytes),
575 orig_parent,
576 leaf->start,
577 root->root_key.objectid,
578 trans->transid, ins.objectid);
579
580 BUG_ON(ret);
581 }
582 path->leave_spinning = 0; 563 path->leave_spinning = 0;
583 btrfs_release_path(root, path); 564 btrfs_release_path(root, path);
584 if (disk_bytenr != 0) 565 if (disk_bytenr != 0)
@@ -594,8 +575,9 @@ next_slot:
594 ret = btrfs_free_extent(trans, root, 575 ret = btrfs_free_extent(trans, root,
595 old_disk_bytenr, 576 old_disk_bytenr,
596 le64_to_cpu(old.disk_num_bytes), 577 le64_to_cpu(old.disk_num_bytes),
597 leaf_start, root_owner, 578 0, root->root_key.objectid,
598 root_gen, key.objectid, 0); 579 key.objectid, key.offset -
580 le64_to_cpu(old.offset));
599 BUG_ON(ret); 581 BUG_ON(ret);
600 *hint_byte = old_disk_bytenr; 582 *hint_byte = old_disk_bytenr;
601 } 583 }
@@ -664,12 +646,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
664 u64 bytenr; 646 u64 bytenr;
665 u64 num_bytes; 647 u64 num_bytes;
666 u64 extent_end; 648 u64 extent_end;
667 u64 extent_offset; 649 u64 orig_offset;
668 u64 other_start; 650 u64 other_start;
669 u64 other_end; 651 u64 other_end;
670 u64 split = start; 652 u64 split = start;
671 u64 locked_end = end; 653 u64 locked_end = end;
672 u64 orig_parent;
673 int extent_type; 654 int extent_type;
674 int split_end = 1; 655 int split_end = 1;
675 int ret; 656 int ret;
@@ -703,7 +684,7 @@ again:
703 684
704 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 685 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
705 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 686 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
706 extent_offset = btrfs_file_extent_offset(leaf, fi); 687 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
707 688
708 if (key.offset == start) 689 if (key.offset == start)
709 split = end; 690 split = end;
@@ -711,8 +692,6 @@ again:
711 if (key.offset == start && extent_end == end) { 692 if (key.offset == start && extent_end == end) {
712 int del_nr = 0; 693 int del_nr = 0;
713 int del_slot = 0; 694 int del_slot = 0;
714 u64 leaf_owner = btrfs_header_owner(leaf);
715 u64 leaf_gen = btrfs_header_generation(leaf);
716 other_start = end; 695 other_start = end;
717 other_end = 0; 696 other_end = 0;
718 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, 697 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
@@ -721,8 +700,8 @@ again:
721 del_slot = path->slots[0] + 1; 700 del_slot = path->slots[0] + 1;
722 del_nr++; 701 del_nr++;
723 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 702 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
724 leaf->start, leaf_owner, 703 0, root->root_key.objectid,
725 leaf_gen, inode->i_ino, 0); 704 inode->i_ino, orig_offset);
726 BUG_ON(ret); 705 BUG_ON(ret);
727 } 706 }
728 other_start = 0; 707 other_start = 0;
@@ -733,8 +712,8 @@ again:
733 del_slot = path->slots[0]; 712 del_slot = path->slots[0];
734 del_nr++; 713 del_nr++;
735 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 714 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
736 leaf->start, leaf_owner, 715 0, root->root_key.objectid,
737 leaf_gen, inode->i_ino, 0); 716 inode->i_ino, orig_offset);
738 BUG_ON(ret); 717 BUG_ON(ret);
739 } 718 }
740 split_end = 0; 719 split_end = 0;
@@ -768,13 +747,12 @@ again:
768 locked_end = extent_end; 747 locked_end = extent_end;
769 } 748 }
770 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); 749 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
771 extent_offset += split - key.offset;
772 } else { 750 } else {
773 BUG_ON(key.offset != start); 751 BUG_ON(key.offset != start);
774 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
775 split - key.offset);
776 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
777 key.offset = split; 752 key.offset = split;
753 btrfs_set_file_extent_offset(leaf, fi, key.offset -
754 orig_offset);
755 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
778 btrfs_set_item_key_safe(trans, root, path, &key); 756 btrfs_set_item_key_safe(trans, root, path, &key);
779 extent_end = split; 757 extent_end = split;
780 } 758 }
@@ -793,7 +771,8 @@ again:
793 struct btrfs_file_extent_item); 771 struct btrfs_file_extent_item);
794 key.offset = split; 772 key.offset = split;
795 btrfs_set_item_key_safe(trans, root, path, &key); 773 btrfs_set_item_key_safe(trans, root, path, &key);
796 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 774 btrfs_set_file_extent_offset(leaf, fi, key.offset -
775 orig_offset);
797 btrfs_set_file_extent_num_bytes(leaf, fi, 776 btrfs_set_file_extent_num_bytes(leaf, fi,
798 other_end - split); 777 other_end - split);
799 goto done; 778 goto done;
@@ -815,10 +794,9 @@ again:
815 794
816 btrfs_mark_buffer_dirty(leaf); 795 btrfs_mark_buffer_dirty(leaf);
817 796
818 orig_parent = leaf->start; 797 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
819 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 798 root->root_key.objectid,
820 orig_parent, root->root_key.objectid, 799 inode->i_ino, orig_offset);
821 trans->transid, inode->i_ino);
822 BUG_ON(ret); 800 BUG_ON(ret);
823 btrfs_release_path(root, path); 801 btrfs_release_path(root, path);
824 802
@@ -833,20 +811,12 @@ again:
833 btrfs_set_file_extent_type(leaf, fi, extent_type); 811 btrfs_set_file_extent_type(leaf, fi, extent_type);
834 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); 812 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
835 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); 813 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
836 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 814 btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
837 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); 815 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
838 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 816 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
839 btrfs_set_file_extent_compression(leaf, fi, 0); 817 btrfs_set_file_extent_compression(leaf, fi, 0);
840 btrfs_set_file_extent_encryption(leaf, fi, 0); 818 btrfs_set_file_extent_encryption(leaf, fi, 0);
841 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 819 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
842
843 if (orig_parent != leaf->start) {
844 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
845 orig_parent, leaf->start,
846 root->root_key.objectid,
847 trans->transid, inode->i_ino);
848 BUG_ON(ret);
849 }
850done: 820done:
851 btrfs_mark_buffer_dirty(leaf); 821 btrfs_mark_buffer_dirty(leaf);
852 822
@@ -1189,6 +1159,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1189 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1159 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1190 root->log_batch++; 1160 root->log_batch++;
1191 1161
1162 if (datasync && !(inode->i_state & I_DIRTY_PAGES))
1163 goto out;
1192 /* 1164 /*
1193 * ok we haven't committed the transaction yet, let's do a commit 1165 * ok we haven't committed the transaction yet, let's do a commit
1194 */ 1166 */
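
The two-line datasync check added above is what lets fdatasync(2) return early on btrfs when no data pages are dirty: metadata-only changes such as timestamps do not need a log sync for data integrity. A trivial user-space illustration of the call it speeds up (the file name is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* no writes yet: with the hunk above, this fdatasync can return
         * without forcing a transaction commit, because no data pages
         * are dirty */
        if (fdatasync(fd) != 0)
                perror("fdatasync");
        close(fd);
        return 0;
}
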
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0bc93657b460..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
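
The three helpers introduced above pin down the bitmap geometry: with 4 KiB pages a bitmap holds PAGE_CACHE_SIZE * 8 = 32768 bits, and at a 4 KiB sectorsize each bitmap therefore covers 128 MiB of the block group. A small stand-alone check of that arithmetic (4 KiB pages and sectors are assumptions here; the real values come from the running system and the superblock):

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SIZE 4096ULL
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)  /* 32768 bits per bitmap */

int main(void)
{
        uint64_t sectorsize = 4096;
        uint64_t bg_start = 1024ULL * 1024 * 1024;      /* hypothetical group */
        uint64_t offset = bg_start + 200ULL * 1024 * 1024;
        uint64_t bytes_per_bitmap = BITS_PER_BITMAP * sectorsize; /* 128 MiB */
        uint64_t bitmap_start = bg_start +
                ((offset - bg_start) / bytes_per_bitmap) * bytes_per_bitmap;

        printf("one bitmap covers %llu MiB\n",
               (unsigned long long)(bytes_per_bitmap >> 20));
        printf("offset %llu maps to bitmap at %llu, bit %llu\n",
               (unsigned long long)offset,
               (unsigned long long)bitmap_start,
               (unsigned long long)((offset - bitmap_start) / sectorsize));
        return 0;
}
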
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
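
The tie-break in the comment above can be captured as a two-level comparison: offsets order the tree, and at equal offsets an extent entry (bitmap == 0) sorts before a bitmap entry (bitmap == 1), so linear walks reach the cheaper extent allocation first. A reduced, self-contained model of that ordering:

#include <stdio.h>

struct entry { unsigned long long offset; int bitmap; };

/* standalone model of the tie-break: at equal offsets an extent entry
 * sorts before a bitmap entry */
static int entry_cmp(const struct entry *a, const struct entry *b)
{
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return a->bitmap - b->bitmap;   /* extent (0) before bitmap (1) */
}

int main(void)
{
        struct entry ext = { 4096, 0 }, bmp = { 4096, 1 };

        printf("extent vs bitmap at same offset: %d\n", entry_cmp(&ext, &bmp));
        return 0;
}
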
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that is at least 'bytes' in size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
136 return NULL;
137 if (entry->bitmap)
138 return entry;
131 139
132/* 140 /*
133 * return a chunk at least bytes size, as close to offset that we can get. 141 * bitmap entry and extent entry may share same offset,
134 */ 142 * in that case, bitmap entry comes after extent entry.
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, 143 */
136 u64 offset, u64 bytes) 144 n = rb_next(n);
137{ 145 if (!n)
138 struct rb_node *n = root->rb_node; 146 return NULL;
139 struct btrfs_free_space *entry, *ret = NULL; 147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
140 148 if (entry->offset != offset)
141 while (n) { 149 return NULL;
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
241 block_group->free_space -= info->bytes;
186} 242}
187 243
188static int link_free_space(struct btrfs_block_group_cache *block_group, 244static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 246{
191 int ret = 0; 247 int ret = 0;
192 248
193 249 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 250 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 251 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 252 if (ret)
198 return ret; 253 return ret;
199 254
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 255 block_group->free_space += info->bytes;
201 &info->bytes_index); 256 block_group->free_extents++;
202 if (ret) 257 return ret;
203 return ret; 258}
259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{
262 u64 max_bytes, possible_bytes;
263
264 /*
265 * The goal is to keep the total amount of memory used per 1gb of space
266 * at or below 32k, so we need to adjust how much memory we allow to be
267 * used by extent based free space tracking
268 */
269 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273 (sizeof(struct btrfs_free_space) *
274 block_group->extents_thresh);
275
276 if (possible_bytes > max_bytes) {
277 int extent_bytes = max_bytes -
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280 if (extent_bytes <= 0) {
281 block_group->extents_thresh = 0;
282 return;
283 }
284
285 block_group->extents_thresh = extent_bytes /
286 (sizeof(struct btrfs_free_space));
287 }
288}
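
recalculate_thresholds() above keeps the combined footprint under MAX_CACHE_BYTES_PER_GIG (32 KiB) per GiB of block group; note that as written, div64_u64(key.offset, 1 GiB) truncates, so a group smaller than 1 GiB gets a zero budget here (tiny groups are also excluded from bitmaps later on). Worked through for a 1 GiB group that has allocated one 4 KiB bitmap page, again assuming ~48-byte extent entries on 64-bit:

#include <stdio.h>

int main(void)
{
        unsigned long long gigs = 1;                    /* 1 GiB block group */
        unsigned long long max_bytes = 32768ULL * gigs; /* memory budget */
        int total_bitmaps = 1;                          /* one bitmap page */
        long long extent_bytes = (long long)max_bytes - total_bitmaps * 4096LL;

        printf("budget %llu, left for extents %lld -> extents_thresh %lld\n",
               max_bytes, extent_bytes, extent_bytes / 48);
        return 0;
}

That prints a threshold of 597 entries: every bitmap page allocated eats directly into the budget for plain extent entries.
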
289
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291 struct btrfs_free_space *info, u64 offset,
292 u64 bytes)
293{
294 unsigned long start, end;
295 unsigned long i;
296
297 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298 end = start + bytes_to_bits(bytes, block_group->sectorsize);
299 BUG_ON(end > BITS_PER_BITMAP);
300
301 for (i = start; i < end; i++)
302 clear_bit(i, info->bitmap);
303
304 info->bytes -= bytes;
305 block_group->free_space -= bytes;
306}
307
308static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309 struct btrfs_free_space *info, u64 offset,
310 u64 bytes)
311{
312 unsigned long start, end;
313 unsigned long i;
314
315 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316 end = start + bytes_to_bits(bytes, block_group->sectorsize);
317 BUG_ON(end > BITS_PER_BITMAP);
318
319 for (i = start; i < end; i++)
320 set_bit(i, info->bitmap);
321
322 info->bytes += bytes;
323 block_group->free_space += bytes;
324}
325
326static int search_bitmap(struct btrfs_block_group_cache *block_group,
327 struct btrfs_free_space *bitmap_info, u64 *offset,
328 u64 *bytes)
329{
330 unsigned long found_bits = 0;
331 unsigned long bits, i;
332 unsigned long next_zero;
333
334 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335 max_t(u64, *offset, bitmap_info->offset));
336 bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339 i < BITS_PER_BITMAP;
340 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342 BITS_PER_BITMAP, i);
343 if ((next_zero - i) >= bits) {
344 found_bits = next_zero - i;
345 break;
346 }
347 i = next_zero;
348 }
349
350 if (found_bits) {
351 *offset = (u64)(i * block_group->sectorsize) +
352 bitmap_info->offset;
353 *bytes = (u64)(found_bits) * block_group->sectorsize;
354 return 0;
355 }
356
357 return -1;
358}
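
search_bitmap() above alternates find_next_bit()/find_next_zero_bit() to find the first run of set bits long enough for the request. The same scan over a plain byte array, condensed (a sketch with hand-rolled bit tests, not the kernel bitops):

#include <stdio.h>

static int find_run(const unsigned char *bits, int nbits, int want,
                    int *run_start)
{
        int i = 0;

        while (i < nbits) {
                if (!((bits[i / 8] >> (i % 8)) & 1)) {  /* find_next_bit */
                        i++;
                        continue;
                }
                int z = i;
                while (z < nbits && ((bits[z / 8] >> (z % 8)) & 1))
                        z++;                            /* find_next_zero_bit */
                if (z - i >= want) {
                        *run_start = i;
                        return z - i;
                }
                i = z;
        }
        return 0;
}

int main(void)
{
        unsigned char bm[2] = { 0xF0, 0x3F };   /* bits 4..13 set */
        int start = 0;
        int len = find_run(bm, 16, 8, &start);

        printf("run of %d bits at bit %d\n", len, start);       /* 10 at 4 */
        return 0;
}
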
359
360static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361 *block_group, u64 *offset,
362 u64 *bytes, int debug)
363{
364 struct btrfs_free_space *entry;
365 struct rb_node *node;
366 int ret;
367
368 if (!block_group->free_space_offset.rb_node)
369 return NULL;
370
371 entry = tree_search_offset(block_group,
372 offset_to_bitmap(block_group, *offset),
373 0, 1);
374 if (!entry)
375 return NULL;
376
377 for (node = &entry->offset_index; node; node = rb_next(node)) {
378 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379 if (entry->bytes < *bytes)
380 continue;
381
382 if (entry->bitmap) {
383 ret = search_bitmap(block_group, entry, offset, bytes);
384 if (!ret)
385 return entry;
386 continue;
387 }
388
389 *offset = entry->offset;
390 *bytes = entry->bytes;
391 return entry;
392 }
393
394 return NULL;
395}
396
397static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398 struct btrfs_free_space *info, u64 offset)
399{
400 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401 int max_bitmaps = (int)div64_u64(block_group->key.offset +
402 bytes_per_bg - 1, bytes_per_bg);
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405 info->offset = offset_to_bitmap(block_group, offset);
406 link_free_space(block_group, info);
407 block_group->total_bitmaps++;
408
409 recalculate_thresholds(block_group);
410}
411
412static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413 struct btrfs_free_space *bitmap_info,
414 u64 *offset, u64 *bytes)
415{
416 u64 end;
417 u64 search_start, search_bytes;
418 int ret;
419
420again:
421 end = bitmap_info->offset +
422 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
423
424 /*
425 * XXX - this can go away after a few releases.
426 *
427 * since the only user of btrfs_remove_free_space is the tree logging
428 * stuff, and the only way to test that is under crash conditions, we
429 * want to have this debug stuff here just in case something's not
430 * working. Search the bitmap for the space we are trying to use to
431 * make sure it's actually there. If it's not there then we need to stop
432 * because something has gone wrong.
433 */
434 search_start = *offset;
435 search_bytes = *bytes;
436 ret = search_bitmap(block_group, bitmap_info, &search_start,
437 &search_bytes);
438 BUG_ON(ret < 0 || search_start != *offset);
439
440 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
441 bitmap_clear_bits(block_group, bitmap_info, *offset,
442 end - *offset + 1);
443 *bytes -= end - *offset + 1;
444 *offset = end + 1;
445 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
446 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
447 *bytes = 0;
448 }
449
450 if (*bytes) {
451 struct rb_node *next = rb_next(&bitmap_info->offset_index);
452 if (!bitmap_info->bytes) {
453 unlink_free_space(block_group, bitmap_info);
454 kfree(bitmap_info->bitmap);
455 kfree(bitmap_info);
456 block_group->total_bitmaps--;
457 recalculate_thresholds(block_group);
458 }
459
460 /*
461 * no entry after this bitmap, but we still have bytes to
462 * remove, so something has gone wrong.
463 */
464 if (!next)
465 return -EINVAL;
466
467 bitmap_info = rb_entry(next, struct btrfs_free_space,
468 offset_index);
469
470 /*
471 * if the next entry isn't a bitmap we need to return to let the
472 * extent stuff do its work.
473 */
474 if (!bitmap_info->bitmap)
475 return -EAGAIN;
476
477 /*
478 * Ok the next item is a bitmap, but it may not actually hold
479 * the information for the rest of this free space stuff, so
480 * look for it, and if we don't find it return so we can try
481 * everything over again.
482 */
483 search_start = *offset;
484 search_bytes = *bytes;
485 ret = search_bitmap(block_group, bitmap_info, &search_start,
486 &search_bytes);
487 if (ret < 0 || search_start != *offset)
488 return -EAGAIN;
489
490 goto again;
491 } else if (!bitmap_info->bytes) {
492 unlink_free_space(block_group, bitmap_info);
493 kfree(bitmap_info->bitmap);
494 kfree(bitmap_info);
495 block_group->total_bitmaps--;
496 recalculate_thresholds(block_group);
497 }
498
499 return 0;
500}
501
502static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
503 struct btrfs_free_space *info)
504{
505 struct btrfs_free_space *bitmap_info;
506 int added = 0;
507 u64 bytes, offset, end;
508 int ret;
509
510 /*
511 * If we are below the extents threshold then we can add this as an
512 * extent, and don't have to deal with the bitmap
513 */
514 if (block_group->free_extents < block_group->extents_thresh &&
515 info->bytes > block_group->sectorsize * 4)
516 return 0;
517
518 /*
519 * some block groups are so tiny they can't be enveloped by a bitmap, so
520 * don't even bother to create a bitmap for this
521 */
522 if (BITS_PER_BITMAP * block_group->sectorsize >
523 block_group->key.offset)
524 return 0;
525
526 bytes = info->bytes;
527 offset = info->offset;
528
529again:
530 bitmap_info = tree_search_offset(block_group,
531 offset_to_bitmap(block_group, offset),
532 1, 0);
533 if (!bitmap_info) {
534 BUG_ON(added);
535 goto new_bitmap;
536 }
537
538 end = bitmap_info->offset +
539 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
540
541 if (offset >= bitmap_info->offset && offset + bytes > end) {
542 bitmap_set_bits(block_group, bitmap_info, offset,
543 end - offset);
544 bytes -= end - offset;
545 offset = end;
546 added = 0;
547 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
548 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
549 bytes = 0;
550 } else {
551 BUG();
552 }
553
554 if (!bytes) {
555 ret = 1;
556 goto out;
557 } else
558 goto again;
559
560new_bitmap:
561 if (info && info->bitmap) {
562 add_new_bitmap(block_group, info, offset);
563 added = 1;
564 info = NULL;
565 goto again;
566 } else {
567 spin_unlock(&block_group->tree_lock);
568
569 /* no pre-allocated info, allocate a new one */
570 if (!info) {
571 info = kzalloc(sizeof(struct btrfs_free_space),
572 GFP_NOFS);
573 if (!info) {
574 spin_lock(&block_group->tree_lock);
575 ret = -ENOMEM;
576 goto out;
577 }
578 }
579
580 /* allocate the bitmap */
581 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
582 spin_lock(&block_group->tree_lock);
583 if (!info->bitmap) {
584 ret = -ENOMEM;
585 goto out;
586 }
587 goto again;
588 }
589
590out:
591 if (info) {
592 if (info->bitmap)
593 kfree(info->bitmap);
594 kfree(info);
595 }
204 596
205 return ret; 597 return ret;
206} 598}
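
One subtlety in insert_into_bitmap() above: kzalloc(GFP_NOFS) can sleep, and tree_lock is a spinlock, so the allocation happens with the lock dropped, and the search restarts ("goto again") once it is retaken, since the tree may have changed in between. The shape of that dance in user-space terms (illustrative names only):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* drop the lock to allocate, retake it, and let the caller revalidate;
 * mirrors spin_unlock / kzalloc / spin_lock / goto again */
static void *alloc_outside_lock(size_t sz)
{
        void *p;

        pthread_mutex_unlock(&tree_lock);       /* spin_unlock(&tree_lock) */
        p = calloc(1, sz);                      /* kzalloc(..., GFP_NOFS) */
        pthread_mutex_lock(&tree_lock);         /* spin_lock(&tree_lock) */
        return p;                               /* caller must re-check state */
}

int main(void)
{
        void *bitmap;
        int ok;

        pthread_mutex_lock(&tree_lock);
        bitmap = alloc_outside_lock(4096);
        ok = bitmap != NULL;
        pthread_mutex_unlock(&tree_lock);
        free(bitmap);
        return ok ? 0 : 1;
}
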
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 600int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 601 u64 offset, u64 bytes)
210{ 602{
211 struct btrfs_free_space *right_info; 603 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 604 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 605 struct btrfs_free_space *info = NULL;
214 int ret = 0; 606 int ret = 0;
215 607
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 619 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 620 * cover the entire range
229 */ 621 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 622 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 623 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 624 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 625 struct btrfs_free_space, offset_index);
626 else
627 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
628
629 /*
630 * If there was no extent directly to the left or right of this new
631 * extent then we know we're going to have to allocate a new extent, so
632 * before we do that see if we need to drop this into a bitmap
633 */
634 if ((!left_info || left_info->bitmap) &&
635 (!right_info || right_info->bitmap)) {
636 ret = insert_into_bitmap(block_group, info);
637
638 if (ret < 0) {
639 goto out;
640 } else if (ret) {
641 ret = 0;
642 goto out;
643 }
644 }
234 645
235 if (right_info) { 646 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 647 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 648 info->bytes += right_info->bytes;
238 kfree(right_info); 649 kfree(right_info);
239 } 650 }
240 651
241 if (left_info && left_info->offset + left_info->bytes == offset) { 652 if (left_info && !left_info->bitmap &&
653 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 654 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 655 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 656 info->bytes += left_info->bytes;
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 660 ret = link_free_space(block_group, info);
249 if (ret) 661 if (ret)
250 kfree(info); 662 kfree(info);
251 663out:
252 spin_unlock(&block_group->tree_lock); 664 spin_unlock(&block_group->tree_lock);
253 665
254 if (ret) { 666 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 667 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 668 BUG_ON(ret == -EEXIST);
257 } 669 }
258 670
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 675 u64 offset, u64 bytes)
264{ 676{
265 struct btrfs_free_space *info; 677 struct btrfs_free_space *info;
678 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 679 int ret = 0;
267 680
268 spin_lock(&block_group->tree_lock); 681 spin_lock(&block_group->tree_lock);
269 682
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 683again:
271 1); 684 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 685 if (!info) {
273 if (info->bytes < bytes) { 686 /*
274 printk(KERN_ERR "Found free space at %llu, size %llu," 687 * oops didn't find an extent that matched the space we wanted
275 "trying to use %llu\n", 688 * to remove, look for a bitmap instead
276 (unsigned long long)info->offset, 689 */
277 (unsigned long long)info->bytes, 690 info = tree_search_offset(block_group,
278 (unsigned long long)bytes); 691 offset_to_bitmap(block_group, offset),
692 1, 0);
693 if (!info) {
694 WARN_ON(1);
695 goto out_lock;
696 }
697 }
698
699 if (info->bytes < bytes && rb_next(&info->offset_index)) {
700 u64 end;
701 next_info = rb_entry(rb_next(&info->offset_index),
702 struct btrfs_free_space,
703 offset_index);
704
705 if (next_info->bitmap)
706 end = next_info->offset + BITS_PER_BITMAP *
707 block_group->sectorsize - 1;
708 else
709 end = next_info->offset + next_info->bytes;
710
711 if (next_info->bytes < bytes ||
712 next_info->offset > offset || offset > end) {
713 printk(KERN_CRIT "Found free space at %llu, size %llu,"
714 " trying to use %llu\n",
715 (unsigned long long)info->offset,
716 (unsigned long long)info->bytes,
717 (unsigned long long)bytes);
279 WARN_ON(1); 718 WARN_ON(1);
280 ret = -EINVAL; 719 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 720 goto out_lock;
282 goto out;
283 } 721 }
284 unlink_free_space(block_group, info);
285 722
286 if (info->bytes == bytes) { 723 info = next_info;
287 kfree(info); 724 }
288 spin_unlock(&block_group->tree_lock); 725
289 goto out; 726 if (info->bytes == bytes) {
727 unlink_free_space(block_group, info);
728 if (info->bitmap) {
729 kfree(info->bitmap);
730 block_group->total_bitmaps--;
290 } 731 }
732 kfree(info);
733 goto out_lock;
734 }
291 735
736 if (!info->bitmap && info->offset == offset) {
737 unlink_free_space(block_group, info);
292 info->offset += bytes; 738 info->offset += bytes;
293 info->bytes -= bytes; 739 info->bytes -= bytes;
740 link_free_space(block_group, info);
741 goto out_lock;
742 }
294 743
295 ret = link_free_space(block_group, info); 744 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 745 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 746 u64 old_start = info->offset;
301 /* 747 /*
302 * we're freeing space in the middle of the info, 748 * we're freeing space in the middle of the info,
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 758 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 759 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 760 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 761 WARN_ON(ret);
762 if (ret)
763 goto out_lock;
316 } else { 764 } else {
317 /* the hole we're creating ends at the end 765 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 766 * of the info struct, just free the info
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 768 kfree(info);
321 } 769 }
322 spin_unlock(&block_group->tree_lock); 770 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 771
324 * before the hole 772 /* step two, insert a new info struct to cover
773 * anything before the hole
325 */ 774 */
326 ret = btrfs_add_free_space(block_group, old_start, 775 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 776 offset - old_start);
328 BUG_ON(ret); 777 WARN_ON(ret);
329 } else { 778 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 779 }
780
781 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
782 if (ret == -EAGAIN)
783 goto again;
784 BUG_ON(ret);
785out_lock:
786 spin_unlock(&block_group->tree_lock);
349out: 787out:
350 return ret; 788 return ret;
351} 789}
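
The "step one / step two" path above handles freeing a hole out of the middle of an extent entry: the entry is trimmed down to the tail after the hole, and the head before the hole is re-added through btrfs_add_free_space(). Worked with concrete numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long info_off = 0, info_bytes = 16384;    /* free extent */
        unsigned long long off = 4096, bytes = 4096;            /* hole to punch */
        unsigned long long old_start = info_off;
        unsigned long long old_end = info_off + info_bytes;

        info_off = off + bytes;                 /* step one: keep the tail */
        info_bytes = old_end - info_off;
        printf("tail: %llu+%llu\n", info_off, info_bytes);       /* 8192+8192 */
        printf("head: %llu+%llu\n", old_start, off - old_start); /* 0+4096 */
        return 0;
}
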
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 799 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 800 if (info->bytes >= bytes)
363 count++; 801 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 802 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 803 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 804 (unsigned long long)info->bytes,
805 (info->bitmap) ? "yes" : "no");
367 } 806 }
807 printk(KERN_INFO "block group has cluster?: %s\n",
808 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 809 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 810 "\n", count);
370} 811}
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space(
397{ 838{
398 struct btrfs_free_space *entry; 839 struct btrfs_free_space *entry;
399 struct rb_node *node; 840 struct rb_node *node;
841 bool bitmap;
400 842
401 spin_lock(&cluster->lock); 843 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 844 if (cluster->block_group != block_group)
403 goto out; 845 goto out;
404 846
847 bitmap = cluster->points_to_bitmap;
848 cluster->block_group = NULL;
405 cluster->window_start = 0; 849 cluster->window_start = 0;
850 list_del_init(&cluster->block_group_list);
851 cluster->points_to_bitmap = false;
852
853 if (bitmap)
854 goto out;
855
406 node = rb_first(&cluster->root); 856 node = rb_first(&cluster->root);
407 while(node) { 857 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 858 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 859 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 860 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 861 BUG_ON(entry->bitmap);
862 tree_insert_offset(&block_group->free_space_offset,
863 entry->offset, &entry->offset_index, 0);
412 } 864 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 865 cluster->root.rb_node = NULL;
866
418out: 867out:
419 spin_unlock(&cluster->lock); 868 spin_unlock(&cluster->lock);
869 btrfs_put_block_group(block_group);
420 return 0; 870 return 0;
421} 871}
422 872
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 875 struct btrfs_free_space *info;
426 struct rb_node *node; 876 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 877 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 878 struct list_head *head;
429 879
430 spin_lock(&block_group->tree_lock); 880 spin_lock(&block_group->tree_lock);
431 881 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 882 &block_group->cluster_list) {
433 block_group_list) { 883 cluster = list_entry(head, struct btrfs_free_cluster,
884 block_group_list);
434 885
435 WARN_ON(cluster->block_group != block_group); 886 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 887 __btrfs_return_cluster_to_free_space(block_group, cluster);
888 if (need_resched()) {
889 spin_unlock(&block_group->tree_lock);
890 cond_resched();
891 spin_lock(&block_group->tree_lock);
892 }
437 } 893 }
438 894
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 895 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 896 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 897 unlink_free_space(block_group, info);
898 if (info->bitmap)
899 kfree(info->bitmap);
442 kfree(info); 900 kfree(info);
443 if (need_resched()) { 901 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 902 spin_unlock(&block_group->tree_lock);
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 904 spin_lock(&block_group->tree_lock);
447 } 905 }
448 } 906 }
907
449 spin_unlock(&block_group->tree_lock); 908 spin_unlock(&block_group->tree_lock);
450} 909}
451 910
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 912 u64 offset, u64 bytes, u64 empty_size)
454{ 913{
455 struct btrfs_free_space *entry = NULL; 914 struct btrfs_free_space *entry = NULL;
915 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 916 u64 ret = 0;
457 917
458 spin_lock(&block_group->tree_lock); 918 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 919 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 920 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 921 goto out;
463 offset, bytes + empty_size); 922
464 if (entry) { 923 ret = offset;
924 if (entry->bitmap) {
925 bitmap_clear_bits(block_group, entry, offset, bytes);
926 if (!entry->bytes) {
927 unlink_free_space(block_group, entry);
928 kfree(entry->bitmap);
929 kfree(entry);
930 block_group->total_bitmaps--;
931 recalculate_thresholds(block_group);
932 }
933 } else {
465 unlink_free_space(block_group, entry); 934 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 935 entry->offset += bytes;
468 entry->bytes -= bytes; 936 entry->bytes -= bytes;
469
470 if (!entry->bytes) 937 if (!entry->bytes)
471 kfree(entry); 938 kfree(entry);
472 else 939 else
473 link_free_space(block_group, entry); 940 link_free_space(block_group, entry);
474 } 941 }
942
943out:
475 spin_unlock(&block_group->tree_lock); 944 spin_unlock(&block_group->tree_lock);
476 945
477 return ret; 946 return ret;
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 986 return ret;
518} 987}
519 988
989static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
990 struct btrfs_free_cluster *cluster,
991 u64 bytes, u64 min_start)
992{
993 struct btrfs_free_space *entry;
994 int err;
995 u64 search_start = cluster->window_start;
996 u64 search_bytes = bytes;
997 u64 ret = 0;
998
999 spin_lock(&block_group->tree_lock);
1000 spin_lock(&cluster->lock);
1001
1002 if (!cluster->points_to_bitmap)
1003 goto out;
1004
1005 if (cluster->block_group != block_group)
1006 goto out;
1007
1008 /*
1009 * search_start is the beginning of the bitmap, but at some point it may
1010 * be a good idea to point to the actual start of the free area in the
1011 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1012 * to 1 to make sure we get the bitmap entry
1013 */
1014 entry = tree_search_offset(block_group,
1015 offset_to_bitmap(block_group, search_start),
1016 1, 0);
1017 if (!entry || !entry->bitmap)
1018 goto out;
1019
1020 search_start = min_start;
1021 search_bytes = bytes;
1022
1023 err = search_bitmap(block_group, entry, &search_start,
1024 &search_bytes);
1025 if (err)
1026 goto out;
1027
1028 ret = search_start;
1029 bitmap_clear_bits(block_group, entry, ret, bytes);
1030out:
1031 spin_unlock(&cluster->lock);
1032 spin_unlock(&block_group->tree_lock);
1033
1034 return ret;
1035}
1036
520/* 1037/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 1038 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 1039 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 1047 struct rb_node *node;
531 u64 ret = 0; 1048 u64 ret = 0;
532 1049
1050 if (cluster->points_to_bitmap)
1051 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1052 min_start);
1053
533 spin_lock(&cluster->lock); 1054 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 1055 if (bytes > cluster->max_size)
535 goto out; 1056 goto out;
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1088 }
568out: 1089out:
569 spin_unlock(&cluster->lock); 1090 spin_unlock(&cluster->lock);
1091
570 return ret; 1092 return ret;
571} 1093}
572 1094
1095static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1096 struct btrfs_free_space *entry,
1097 struct btrfs_free_cluster *cluster,
1098 u64 offset, u64 bytes, u64 min_bytes)
1099{
1100 unsigned long next_zero;
1101 unsigned long i;
1102 unsigned long search_bits;
1103 unsigned long total_bits;
1104 unsigned long found_bits;
1105 unsigned long start = 0;
1106 unsigned long total_found = 0;
1107 bool found = false;
1108
1109 i = offset_to_bit(entry->offset, block_group->sectorsize,
1110 max_t(u64, offset, entry->offset));
1111 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1112 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1113
1114again:
1115 found_bits = 0;
1116 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1117 i < BITS_PER_BITMAP;
1118 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1119 next_zero = find_next_zero_bit(entry->bitmap,
1120 BITS_PER_BITMAP, i);
1121 if (next_zero - i >= search_bits) {
1122 found_bits = next_zero - i;
1123 break;
1124 }
1125 i = next_zero;
1126 }
1127
1128 if (!found_bits)
1129 return -1;
1130
1131 if (!found) {
1132 start = i;
1133 found = true;
1134 }
1135
1136 total_found += found_bits;
1137
1138 if (cluster->max_size < found_bits * block_group->sectorsize)
1139 cluster->max_size = found_bits * block_group->sectorsize;
1140
1141 if (total_found < total_bits) {
1142 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1143 if (i - start > total_bits * 2) {
1144 total_found = 0;
1145 cluster->max_size = 0;
1146 found = false;
1147 }
1148 goto again;
1149 }
1150
1151 cluster->window_start = start * block_group->sectorsize +
1152 entry->offset;
1153 cluster->points_to_bitmap = true;
1154
1155 return 0;
1156}
1157
573/* 1158/*
574 * here we try to find a cluster of blocks in a block group. The goal 1159 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1160 * is to find at least bytes free and up to empty_size + bytes free.
@@ -579,6 +1164,7 @@ out:
579 * it returns -enospc 1164 * it returns -enospc
580 */ 1165 */
581int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 1166int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
1167 struct btrfs_root *root,
582 struct btrfs_block_group_cache *block_group, 1168 struct btrfs_block_group_cache *block_group,
583 struct btrfs_free_cluster *cluster, 1169 struct btrfs_free_cluster *cluster,
584 u64 offset, u64 bytes, u64 empty_size) 1170 u64 offset, u64 bytes, u64 empty_size)
@@ -586,16 +1172,18 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
586 struct btrfs_free_space *entry = NULL; 1172 struct btrfs_free_space *entry = NULL;
587 struct rb_node *node; 1173 struct rb_node *node;
588 struct btrfs_free_space *next; 1174 struct btrfs_free_space *next;
589 struct btrfs_free_space *last; 1175 struct btrfs_free_space *last = NULL;
590 u64 min_bytes; 1176 u64 min_bytes;
591 u64 window_start; 1177 u64 window_start;
592 u64 window_free; 1178 u64 window_free;
593 u64 max_extent = 0; 1179 u64 max_extent = 0;
594 int total_retries = 0; 1180 bool found_bitmap = false;
595 int ret; 1181 int ret;
596 1182
597 /* for metadata, allow allocates with more holes */ 1183 /* for metadata, allow allocates with more holes */
598 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 1184 if (btrfs_test_opt(root, SSD_SPREAD)) {
1185 min_bytes = bytes + empty_size;
1186 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
599 /* 1187 /*
600 * we want to do larger allocations when we are 1188 * we want to do larger allocations when we are
601 * flushing out the delayed refs, it helps prevent 1189 * flushing out the delayed refs, it helps prevent
@@ -617,53 +1205,90 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
617 goto out; 1205 goto out;
618 } 1206 }
619again: 1207again:
620 min_bytes = min(min_bytes, bytes + empty_size); 1208 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
621 entry = tree_search_bytes(&block_group->free_space_bytes,
622 offset, min_bytes);
623 if (!entry) { 1209 if (!entry) {
624 ret = -ENOSPC; 1210 ret = -ENOSPC;
625 goto out; 1211 goto out;
626 } 1212 }
1213
1214 /*
1215 * If found_bitmap is true, we exhausted our search for extent entries,
1216 * and we just want to search all of the bitmaps that we can find, and
1217 * ignore any extent entries we find.
1218 */
1219 while (entry->bitmap || found_bitmap ||
1220 (!entry->bitmap && entry->bytes < min_bytes)) {
1221 struct rb_node *node = rb_next(&entry->offset_index);
1222
1223 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1224 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1225 offset, bytes + empty_size,
1226 min_bytes);
1227 if (!ret)
1228 goto got_it;
1229 }
1230
1231 if (!node) {
1232 ret = -ENOSPC;
1233 goto out;
1234 }
1235 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1236 }
1237
1238 /*
1239 * We already searched all the extent entries from the passed in offset
1240 * to the end and didn't find enough space for the cluster, and we also
1241 * didn't find any bitmaps that met our criteria, so just go ahead and exit
1242 */
1243 if (found_bitmap) {
1244 ret = -ENOSPC;
1245 goto out;
1246 }
1247
1248 cluster->points_to_bitmap = false;
627 window_start = entry->offset; 1249 window_start = entry->offset;
628 window_free = entry->bytes; 1250 window_free = entry->bytes;
629 last = entry; 1251 last = entry;
630 max_extent = entry->bytes; 1252 max_extent = entry->bytes;
631 1253
632 while(1) { 1254 while (1) {
633 /* our window is just right, let's fill it */ 1255 /* our window is just right, let's fill it */
634 if (window_free >= bytes + empty_size) 1256 if (window_free >= bytes + empty_size)
635 break; 1257 break;
636 1258
637 node = rb_next(&last->offset_index); 1259 node = rb_next(&last->offset_index);
638 if (!node) { 1260 if (!node) {
1261 if (found_bitmap)
1262 goto again;
639 ret = -ENOSPC; 1263 ret = -ENOSPC;
640 goto out; 1264 goto out;
641 } 1265 }
642 next = rb_entry(node, struct btrfs_free_space, offset_index); 1266 next = rb_entry(node, struct btrfs_free_space, offset_index);
643 1267
644 /* 1268 /*
1269 * we found a bitmap, so if this search doesn't result in a
 1270 * cluster, we know to go back and search the bitmaps and
1271 * start looking for space there
1272 */
1273 if (next->bitmap) {
1274 if (!found_bitmap)
1275 offset = next->offset;
1276 found_bitmap = true;
1277 last = next;
1278 continue;
1279 }
1280
1281 /*
645 * we haven't filled the empty size and the window is 1282 * we haven't filled the empty size and the window is
646 * very large. reset and try again 1283 * very large. reset and try again
647 */ 1284 */
648 if (next->offset - window_start > (bytes + empty_size) * 2) { 1285 if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
1286 next->offset - window_start > (bytes + empty_size) * 2) {
649 entry = next; 1287 entry = next;
650 window_start = entry->offset; 1288 window_start = entry->offset;
651 window_free = entry->bytes; 1289 window_free = entry->bytes;
652 last = entry; 1290 last = entry;
653 max_extent = 0; 1291 max_extent = 0;
654 total_retries++;
655 if (total_retries % 256 == 0) {
656 if (min_bytes >= (bytes + empty_size)) {
657 ret = -ENOSPC;
658 goto out;
659 }
660 /*
661 * grow our allocation a bit, we're not having
662 * much luck
663 */
664 min_bytes *= 2;
665 goto again;
666 }
667 } else { 1292 } else {
668 last = next; 1293 last = next;
669 window_free += next->bytes; 1294 window_free += next->bytes;
@@ -681,11 +1306,19 @@ again:
681 * The cluster includes an rbtree, but only uses the offset index 1306 * The cluster includes an rbtree, but only uses the offset index
682 * of each free space cache entry. 1307 * of each free space cache entry.
683 */ 1308 */
684 while(1) { 1309 while (1) {
685 node = rb_next(&entry->offset_index); 1310 node = rb_next(&entry->offset_index);
686 unlink_free_space(block_group, entry); 1311 if (entry->bitmap && node) {
1312 entry = rb_entry(node, struct btrfs_free_space,
1313 offset_index);
1314 continue;
1315 } else if (entry->bitmap && !node) {
1316 break;
1317 }
1318
1319 rb_erase(&entry->offset_index, &block_group->free_space_offset);
687 ret = tree_insert_offset(&cluster->root, entry->offset, 1320 ret = tree_insert_offset(&cluster->root, entry->offset,
688 &entry->offset_index); 1321 &entry->offset_index, 0);
689 BUG_ON(ret); 1322 BUG_ON(ret);
690 1323
691 if (!node || entry == last) 1324 if (!node || entry == last)
@@ -693,8 +1326,10 @@ again:
693 1326
694 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1327 entry = rb_entry(node, struct btrfs_free_space, offset_index);
695 } 1328 }
696 ret = 0; 1329
697 cluster->max_size = max_extent; 1330 cluster->max_size = max_extent;
1331got_it:
1332 ret = 0;
698 atomic_inc(&block_group->count); 1333 atomic_inc(&block_group->count);
699 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1334 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
700 cluster->block_group = block_group; 1335 cluster->block_group = block_group;
@@ -714,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
714 spin_lock_init(&cluster->refill_lock); 1349 spin_lock_init(&cluster->refill_lock);
715 cluster->root.rb_node = NULL; 1350 cluster->root.rb_node = NULL;
716 cluster->max_size = 0; 1351 cluster->max_size = 0;
1352 cluster->points_to_bitmap = false;
717 INIT_LIST_HEAD(&cluster->block_group_list); 1353 INIT_LIST_HEAD(&cluster->block_group_list);
718 cluster->block_group = NULL; 1354 cluster->block_group = NULL;
719} 1355}
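A note on the updated interface: btrfs_find_space_cluster() now takes the btrfs_root so the allocator can consult mount options; with ssd_spread, min_bytes is forced to bytes + empty_size, so only a fully contiguous window (or a large enough bitmap) can satisfy the cluster. A minimal caller sketch, assuming the usual allocator locals (search_start, num_bytes and empty_cluster are illustrative names, not quoted from this patch):

	ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
				       search_start, num_bytes, empty_cluster);
	if (ret == -ENOSPC) {
		/* no window or bitmap was big enough; fall back to a
		 * plain, non-clustered allocation */
	}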
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ab0bdc0a63ce..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
@@ -31,6 +39,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes); 39 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); 40u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 41int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root,
34 struct btrfs_block_group_cache *block_group, 43 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster, 44 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size); 45 u64 offset, u64 bytes, u64 empty_size);
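struct btrfs_free_space moves into the header because entries now come in two forms: a plain extent (bitmap == NULL, with offset/bytes describing one contiguous run) and a bitmap entry, where each set bit marks one free sectorsize block starting at offset. A hedged illustration of the bit arithmetic; mark_free_bits is a hypothetical helper, not the patch's own:

	static void mark_free_bits(struct btrfs_free_space *info,
				   u64 start, u64 bytes, u32 sectorsize)
	{
		unsigned long i = (start - info->offset) / sectorsize;
		unsigned long nr = bytes / sectorsize;

		/* non-atomic __set_bit: the caller holds the tree lock */
		for (; nr; nr--, i++)
			__set_bit(i, info->bitmap);
		info->bytes += bytes;
	}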
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 2a020b276768..db2ff9773b99 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,9 +19,9 @@
19#ifndef __HASH__ 19#ifndef __HASH__
20#define __HASH__ 20#define __HASH__
21 21
22#include "crc32c.h" 22#include <linux/crc32c.h>
23static inline u64 btrfs_name_hash(const char *name, int len) 23static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return btrfs_crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27#endif 27#endif
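With the private wrapper gone, the name hash is plain CRC32C seeded with (u32)~1 and no final inversion, matching the kernel's crc32c(seed, data, len) semantics. A userspace sketch that reproduces the same values bit-by-bit with the Castagnoli polynomial (crc32c_sw and name_hash are illustrative names):

	#include <stdint.h>
	#include <stddef.h>

	static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
	{
		const unsigned char *p = buf;

		while (len--) {
			crc ^= *p++;
			for (int k = 0; k < 8; k++)
				/* reflected Castagnoli polynomial */
				crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78 : 0);
		}
		return crc;
	}

	static uint64_t name_hash(const char *name, int len)
	{
		return crc32c_sw((uint32_t)~1, name, len);
	}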
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8b0190d031..59cba180fe83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mpage.h> 30#include <linux/mpage.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
@@ -48,7 +47,6 @@
48#include "ordered-data.h" 47#include "ordered-data.h"
49#include "xattr.h" 48#include "xattr.h"
50#include "tree-log.h" 49#include "tree-log.h"
51#include "ref-cache.h"
52#include "compression.h" 50#include "compression.h"
53#include "locking.h" 51#include "locking.h"
54 52
@@ -369,7 +367,7 @@ again:
369 * inode has not been flagged as nocompress. This flag can 367 * inode has not been flagged as nocompress. This flag can
370 * change at any time if we discover bad compression ratios. 368 * change at any time if we discover bad compression ratios.
371 */ 369 */
372 if (!btrfs_test_flag(inode, NOCOMPRESS) && 370 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
373 btrfs_test_opt(root, COMPRESS)) { 371 btrfs_test_opt(root, COMPRESS)) {
374 WARN_ON(pages); 372 WARN_ON(pages);
375 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 373 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
@@ -470,7 +468,7 @@ again:
470 nr_pages_ret = 0; 468 nr_pages_ret = 0;
471 469
472 /* flag the file so we don't compress in the future */ 470 /* flag the file so we don't compress in the future */
473 btrfs_set_flag(inode, NOCOMPRESS); 471 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
474 } 472 }
475 if (will_compress) { 473 if (will_compress) {
476 *num_added += 1; 474 *num_added += 1;
@@ -863,7 +861,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
863 async_cow->locked_page = locked_page; 861 async_cow->locked_page = locked_page;
864 async_cow->start = start; 862 async_cow->start = start;
865 863
866 if (btrfs_test_flag(inode, NOCOMPRESS)) 864 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
867 cur_end = end; 865 cur_end = end;
868 else 866 else
869 cur_end = min(end, start + 512 * 1024 - 1); 867 cur_end = min(end, start + 512 * 1024 - 1);
@@ -944,6 +942,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
944 u64 cow_start; 942 u64 cow_start;
945 u64 cur_offset; 943 u64 cur_offset;
946 u64 extent_end; 944 u64 extent_end;
945 u64 extent_offset;
947 u64 disk_bytenr; 946 u64 disk_bytenr;
948 u64 num_bytes; 947 u64 num_bytes;
949 int extent_type; 948 int extent_type;
@@ -1005,6 +1004,7 @@ next_slot:
1005 if (extent_type == BTRFS_FILE_EXTENT_REG || 1004 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1006 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1005 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1007 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1006 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1007 extent_offset = btrfs_file_extent_offset(leaf, fi);
1008 extent_end = found_key.offset + 1008 extent_end = found_key.offset +
1009 btrfs_file_extent_num_bytes(leaf, fi); 1009 btrfs_file_extent_num_bytes(leaf, fi);
1010 if (extent_end <= start) { 1010 if (extent_end <= start) {
@@ -1022,9 +1022,10 @@ next_slot:
1022 if (btrfs_extent_readonly(root, disk_bytenr)) 1022 if (btrfs_extent_readonly(root, disk_bytenr))
1023 goto out_check; 1023 goto out_check;
1024 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 1024 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1025 disk_bytenr)) 1025 found_key.offset -
1026 extent_offset, disk_bytenr))
1026 goto out_check; 1027 goto out_check;
1027 disk_bytenr += btrfs_file_extent_offset(leaf, fi); 1028 disk_bytenr += extent_offset;
1028 disk_bytenr += cur_offset - found_key.offset; 1029 disk_bytenr += cur_offset - found_key.offset;
1029 num_bytes = min(end + 1, extent_end) - cur_offset; 1030 num_bytes = min(end + 1, extent_end) - cur_offset;
1030 /* 1031 /*
@@ -1131,10 +1132,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1131 int ret; 1132 int ret;
1132 struct btrfs_root *root = BTRFS_I(inode)->root; 1133 struct btrfs_root *root = BTRFS_I(inode)->root;
1133 1134
1134 if (btrfs_test_flag(inode, NODATACOW)) 1135 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
1135 ret = run_delalloc_nocow(inode, locked_page, start, end, 1136 ret = run_delalloc_nocow(inode, locked_page, start, end,
1136 page_started, 1, nr_written); 1137 page_started, 1, nr_written);
1137 else if (btrfs_test_flag(inode, PREALLOC)) 1138 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1139 ret = run_delalloc_nocow(inode, locked_page, start, end,
1139 page_started, 0, nr_written); 1140 page_started, 0, nr_written);
1140 else if (!btrfs_test_opt(root, COMPRESS)) 1141 else if (!btrfs_test_opt(root, COMPRESS))
@@ -1288,7 +1289,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1288 int ret = 0; 1289 int ret = 0;
1289 int skip_sum; 1290 int skip_sum;
1290 1291
1291 skip_sum = btrfs_test_flag(inode, NODATASUM); 1292 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1292 1293
1293 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1294 BUG_ON(ret); 1295 BUG_ON(ret);
@@ -1489,9 +1490,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1489 ins.objectid = disk_bytenr; 1490 ins.objectid = disk_bytenr;
1490 ins.offset = disk_num_bytes; 1491 ins.offset = disk_num_bytes;
1491 ins.type = BTRFS_EXTENT_ITEM_KEY; 1492 ins.type = BTRFS_EXTENT_ITEM_KEY;
1492 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1493 ret = btrfs_alloc_reserved_file_extent(trans, root,
1493 root->root_key.objectid, 1494 root->root_key.objectid,
1494 trans->transid, inode->i_ino, &ins); 1495 inode->i_ino, file_pos, &ins);
1495 BUG_ON(ret); 1496 BUG_ON(ret);
1496 btrfs_free_path(path); 1497 btrfs_free_path(path);
1497 1498
@@ -1788,7 +1789,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1788 ClearPageChecked(page); 1789 ClearPageChecked(page);
1789 goto good; 1790 goto good;
1790 } 1791 }
1791 if (btrfs_test_flag(inode, NODATASUM)) 1792
1793 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1792 return 0; 1794 return 0;
1793 1795
1794 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1796 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
@@ -1956,23 +1958,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
1956 * crossing root thing. we store the inode number in the 1958 * crossing root thing. we store the inode number in the
1957 * offset of the orphan item. 1959 * offset of the orphan item.
1958 */ 1960 */
1959 inode = btrfs_iget_locked(root->fs_info->sb, 1961 found_key.objectid = found_key.offset;
1960 found_key.offset, root); 1962 found_key.type = BTRFS_INODE_ITEM_KEY;
1961 if (!inode) 1963 found_key.offset = 0;
1964 inode = btrfs_iget(root->fs_info->sb, &found_key, root);
1965 if (IS_ERR(inode))
1962 break; 1966 break;
1963 1967
1964 if (inode->i_state & I_NEW) {
1965 BTRFS_I(inode)->root = root;
1966
1967 /* have to set the location manually */
1968 BTRFS_I(inode)->location.objectid = inode->i_ino;
1969 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1970 BTRFS_I(inode)->location.offset = 0;
1971
1972 btrfs_read_locked_inode(inode);
1973 unlock_new_inode(inode);
1974 }
1975
1976 /* 1968 /*
1977 * add this inode to the orphan list so btrfs_orphan_del does 1969 * add this inode to the orphan list so btrfs_orphan_del does
1978 * the proper thing when we hit it 1970 * the proper thing when we hit it
@@ -2069,7 +2061,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2069/* 2061/*
2070 * read an inode from the btree into the in-memory inode 2062 * read an inode from the btree into the in-memory inode
2071 */ 2063 */
2072void btrfs_read_locked_inode(struct inode *inode) 2064static void btrfs_read_locked_inode(struct inode *inode)
2073{ 2065{
2074 struct btrfs_path *path; 2066 struct btrfs_path *path;
2075 struct extent_buffer *leaf; 2067 struct extent_buffer *leaf;
@@ -2129,10 +2121,8 @@ void btrfs_read_locked_inode(struct inode *inode)
2129 * any xattrs or acls 2121 * any xattrs or acls
2130 */ 2122 */
2131 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2123 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
2132 if (!maybe_acls) { 2124 if (!maybe_acls)
2133 BTRFS_I(inode)->i_acl = NULL; 2125 cache_no_acl(inode);
2134 BTRFS_I(inode)->i_default_acl = NULL;
2135 }
2136 2126
2137 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2127 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2138 alloc_group_block, 0); 2128 alloc_group_block, 0);
@@ -2164,6 +2154,8 @@ void btrfs_read_locked_inode(struct inode *inode)
2164 init_special_inode(inode, inode->i_mode, rdev); 2154 init_special_inode(inode, inode->i_mode, rdev);
2165 break; 2155 break;
2166 } 2156 }
2157
2158 btrfs_update_iflags(inode);
2167 return; 2159 return;
2168 2160
2169make_bad: 2161make_bad:
@@ -2327,7 +2319,6 @@ err:
2327 btrfs_update_inode(trans, root, dir); 2319 btrfs_update_inode(trans, root, dir);
2328 btrfs_drop_nlink(inode); 2320 btrfs_drop_nlink(inode);
2329 ret = btrfs_update_inode(trans, root, inode); 2321 ret = btrfs_update_inode(trans, root, inode);
2330 dir->i_sb->s_dirt = 1;
2331out: 2322out:
2332 return ret; 2323 return ret;
2333} 2324}
@@ -2599,9 +2590,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2599 struct btrfs_file_extent_item *fi; 2590 struct btrfs_file_extent_item *fi;
2600 u64 extent_start = 0; 2591 u64 extent_start = 0;
2601 u64 extent_num_bytes = 0; 2592 u64 extent_num_bytes = 0;
2593 u64 extent_offset = 0;
2602 u64 item_end = 0; 2594 u64 item_end = 0;
2603 u64 root_gen = 0;
2604 u64 root_owner = 0;
2605 int found_extent; 2595 int found_extent;
2606 int del_item; 2596 int del_item;
2607 int pending_del_nr = 0; 2597 int pending_del_nr = 0;
@@ -2613,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2613 if (root->ref_cows) 2603 if (root->ref_cows)
2614 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2615 path = btrfs_alloc_path(); 2605 path = btrfs_alloc_path();
2616 path->reada = -1;
2617 BUG_ON(!path); 2606 BUG_ON(!path);
2607 path->reada = -1;
2618 2608
2619 /* FIXME, add redo link to tree so we don't leak on crash */ 2609 /* FIXME, add redo link to tree so we don't leak on crash */
2620 key.objectid = inode->i_ino; 2610 key.objectid = inode->i_ino;
@@ -2716,6 +2706,9 @@ search_again:
2716 extent_num_bytes = 2706 extent_num_bytes =
2717 btrfs_file_extent_disk_num_bytes(leaf, 2707 btrfs_file_extent_disk_num_bytes(leaf,
2718 fi); 2708 fi);
2709 extent_offset = found_key.offset -
2710 btrfs_file_extent_offset(leaf, fi);
2711
2719 /* FIXME blocksize != 4096 */ 2712 /* FIXME blocksize != 4096 */
2720 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 2713 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2721 if (extent_start != 0) { 2714 if (extent_start != 0) {
@@ -2723,8 +2716,6 @@ search_again:
2723 if (root->ref_cows) 2716 if (root->ref_cows)
2724 inode_sub_bytes(inode, num_dec); 2717 inode_sub_bytes(inode, num_dec);
2725 } 2718 }
2726 root_gen = btrfs_header_generation(leaf);
2727 root_owner = btrfs_header_owner(leaf);
2728 } 2719 }
2729 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2720 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2730 /* 2721 /*
@@ -2768,12 +2759,12 @@ delete:
2768 } else { 2759 } else {
2769 break; 2760 break;
2770 } 2761 }
2771 if (found_extent) { 2762 if (found_extent && root->ref_cows) {
2772 btrfs_set_path_blocking(path); 2763 btrfs_set_path_blocking(path);
2773 ret = btrfs_free_extent(trans, root, extent_start, 2764 ret = btrfs_free_extent(trans, root, extent_start,
2774 extent_num_bytes, 2765 extent_num_bytes, 0,
2775 leaf->start, root_owner, 2766 btrfs_header_owner(leaf),
2776 root_gen, inode->i_ino, 0); 2767 inode->i_ino, extent_offset);
2777 BUG_ON(ret); 2768 BUG_ON(ret);
2778 } 2769 }
2779next: 2770next:
@@ -2811,7 +2802,6 @@ error:
2811 pending_del_nr); 2802 pending_del_nr);
2812 } 2803 }
2813 btrfs_free_path(path); 2804 btrfs_free_path(path);
2814 inode->i_sb->s_dirt = 1;
2815 return ret; 2805 return ret;
2816} 2806}
2817 2807
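The btrfs_free_extent() call in the hunk above follows the new backref addressing: parent is 0 for a keyed (non-shared) reference, and a data ref is identified by the owning tree, the inode number, and the file offset minus the extent item's internal offset. The same call with the arguments spelled out (a sketch; local names are illustrative):

	u64 parent = 0;				/* keyed ref, not shared */
	u64 ref_root = btrfs_header_owner(leaf);
	u64 owner = inode->i_ino;
	u64 offset = found_key.offset - btrfs_file_extent_offset(leaf, fi);

	ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes,
				parent, ref_root, owner, offset);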
@@ -3105,13 +3095,56 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3105 return 0; 3095 return 0;
3106} 3096}
3107 3097
3098static void inode_tree_add(struct inode *inode)
3099{
3100 struct btrfs_root *root = BTRFS_I(inode)->root;
3101 struct btrfs_inode *entry;
3102 struct rb_node **p;
3103 struct rb_node *parent;
3104
3105again:
3106 p = &root->inode_tree.rb_node;
3107 parent = NULL;
3108
3109 spin_lock(&root->inode_lock);
3110 while (*p) {
3111 parent = *p;
3112 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3113
3114 if (inode->i_ino < entry->vfs_inode.i_ino)
3115 p = &parent->rb_left;
3116 else if (inode->i_ino > entry->vfs_inode.i_ino)
3117 p = &parent->rb_right;
3118 else {
3119 WARN_ON(!(entry->vfs_inode.i_state &
3120 (I_WILL_FREE | I_FREEING | I_CLEAR)));
3121 rb_erase(parent, &root->inode_tree);
3122 RB_CLEAR_NODE(parent);
3123 spin_unlock(&root->inode_lock);
3124 goto again;
3125 }
3126 }
3127 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
3128 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3129 spin_unlock(&root->inode_lock);
3130}
3131
3132static void inode_tree_del(struct inode *inode)
3133{
3134 struct btrfs_root *root = BTRFS_I(inode)->root;
3135
3136 spin_lock(&root->inode_lock);
3137 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3138 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3139 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3140 }
3141 spin_unlock(&root->inode_lock);
3142}
3143
3108static noinline void init_btrfs_i(struct inode *inode) 3144static noinline void init_btrfs_i(struct inode *inode)
3109{ 3145{
3110 struct btrfs_inode *bi = BTRFS_I(inode); 3146 struct btrfs_inode *bi = BTRFS_I(inode);
3111 3147
3112 bi->i_acl = BTRFS_ACL_NOT_CACHED;
3113 bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
3114
3115 bi->generation = 0; 3148 bi->generation = 0;
3116 bi->sequence = 0; 3149 bi->sequence = 0;
3117 bi->last_trans = 0; 3150 bi->last_trans = 0;
@@ -3130,6 +3163,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3130 inode->i_mapping, GFP_NOFS); 3163 inode->i_mapping, GFP_NOFS);
3131 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3164 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3132 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3165 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3166 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3133 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3167 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3134 mutex_init(&BTRFS_I(inode)->extent_mutex); 3168 mutex_init(&BTRFS_I(inode)->extent_mutex);
3135 mutex_init(&BTRFS_I(inode)->log_mutex); 3169 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3152,26 +3186,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
3152 args->root == BTRFS_I(inode)->root; 3186 args->root == BTRFS_I(inode)->root;
3153} 3187}
3154 3188
3155struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, 3189static struct inode *btrfs_iget_locked(struct super_block *s,
3156 struct btrfs_root *root, int wait) 3190 u64 objectid,
3157{ 3191 struct btrfs_root *root)
3158 struct inode *inode;
3159 struct btrfs_iget_args args;
3160 args.ino = objectid;
3161 args.root = root;
3162
3163 if (wait) {
3164 inode = ilookup5(s, objectid, btrfs_find_actor,
3165 (void *)&args);
3166 } else {
3167 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3168 (void *)&args);
3169 }
3170 return inode;
3171}
3172
3173struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3174 struct btrfs_root *root)
3175{ 3192{
3176 struct inode *inode; 3193 struct inode *inode;
3177 struct btrfs_iget_args args; 3194 struct btrfs_iget_args args;
@@ -3188,24 +3205,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3188 * Returns in *is_new if the inode was read from disk 3205 * Returns in *is_new if the inode was read from disk
3189 */ 3206 */
3190struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3207struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3191 struct btrfs_root *root, int *is_new) 3208 struct btrfs_root *root)
3192{ 3209{
3193 struct inode *inode; 3210 struct inode *inode;
3194 3211
3195 inode = btrfs_iget_locked(s, location->objectid, root); 3212 inode = btrfs_iget_locked(s, location->objectid, root);
3196 if (!inode) 3213 if (!inode)
3197 return ERR_PTR(-EACCES); 3214 return ERR_PTR(-ENOMEM);
3198 3215
3199 if (inode->i_state & I_NEW) { 3216 if (inode->i_state & I_NEW) {
3200 BTRFS_I(inode)->root = root; 3217 BTRFS_I(inode)->root = root;
3201 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3218 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3202 btrfs_read_locked_inode(inode); 3219 btrfs_read_locked_inode(inode);
3220
3221 inode_tree_add(inode);
3203 unlock_new_inode(inode); 3222 unlock_new_inode(inode);
3204 if (is_new)
3205 *is_new = 1;
3206 } else {
3207 if (is_new)
3208 *is_new = 0;
3209 } 3223 }
3210 3224
3211 return inode; 3225 return inode;
@@ -3218,7 +3232,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3218 struct btrfs_root *root = bi->root; 3232 struct btrfs_root *root = bi->root;
3219 struct btrfs_root *sub_root = root; 3233 struct btrfs_root *sub_root = root;
3220 struct btrfs_key location; 3234 struct btrfs_key location;
3221 int ret, new; 3235 int ret;
3222 3236
3223 if (dentry->d_name.len > BTRFS_NAME_LEN) 3237 if (dentry->d_name.len > BTRFS_NAME_LEN)
3224 return ERR_PTR(-ENAMETOOLONG); 3238 return ERR_PTR(-ENAMETOOLONG);
@@ -3236,7 +3250,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3236 return ERR_PTR(ret); 3250 return ERR_PTR(ret);
3237 if (ret > 0) 3251 if (ret > 0)
3238 return ERR_PTR(-ENOENT); 3252 return ERR_PTR(-ENOENT);
3239 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); 3253 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3240 if (IS_ERR(inode)) 3254 if (IS_ERR(inode))
3241 return ERR_CAST(inode); 3255 return ERR_CAST(inode);
3242 } 3256 }
@@ -3572,12 +3586,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3572 owner = 1; 3586 owner = 1;
3573 BTRFS_I(inode)->block_group = 3587 BTRFS_I(inode)->block_group =
3574 btrfs_find_block_group(root, 0, alloc_hint, owner); 3588 btrfs_find_block_group(root, 0, alloc_hint, owner);
3575 if ((mode & S_IFREG)) {
3576 if (btrfs_test_opt(root, NODATASUM))
3577 btrfs_set_flag(inode, NODATASUM);
3578 if (btrfs_test_opt(root, NODATACOW))
3579 btrfs_set_flag(inode, NODATACOW);
3580 }
3581 3589
3582 key[0].objectid = objectid; 3590 key[0].objectid = objectid;
3583 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 3591 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3630,7 +3638,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3630 location->offset = 0; 3638 location->offset = 0;
3631 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3639 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3632 3640
3641 btrfs_inherit_iflags(inode, dir);
3642
3643 if ((mode & S_IFREG)) {
3644 if (btrfs_test_opt(root, NODATASUM))
3645 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
3646 if (btrfs_test_opt(root, NODATACOW))
3647 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
3648 }
3649
3633 insert_inode_hash(inode); 3650 insert_inode_hash(inode);
3651 inode_tree_add(inode);
3634 return inode; 3652 return inode;
3635fail: 3653fail:
3636 if (dir) 3654 if (dir)
@@ -3750,7 +3768,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3750 init_special_inode(inode, inode->i_mode, rdev); 3768 init_special_inode(inode, inode->i_mode, rdev);
3751 btrfs_update_inode(trans, root, inode); 3769 btrfs_update_inode(trans, root, inode);
3752 } 3770 }
3753 dir->i_sb->s_dirt = 1;
3754 btrfs_update_inode_block_group(trans, inode); 3771 btrfs_update_inode_block_group(trans, inode);
3755 btrfs_update_inode_block_group(trans, dir); 3772 btrfs_update_inode_block_group(trans, dir);
3756out_unlock: 3773out_unlock:
@@ -3815,7 +3832,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3815 inode->i_op = &btrfs_file_inode_operations; 3832 inode->i_op = &btrfs_file_inode_operations;
3816 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3833 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3817 } 3834 }
3818 dir->i_sb->s_dirt = 1;
3819 btrfs_update_inode_block_group(trans, inode); 3835 btrfs_update_inode_block_group(trans, inode);
3820 btrfs_update_inode_block_group(trans, dir); 3836 btrfs_update_inode_block_group(trans, dir);
3821out_unlock: 3837out_unlock:
@@ -3862,7 +3878,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3862 if (err) 3878 if (err)
3863 drop_inode = 1; 3879 drop_inode = 1;
3864 3880
3865 dir->i_sb->s_dirt = 1;
3866 btrfs_update_inode_block_group(trans, dir); 3881 btrfs_update_inode_block_group(trans, dir);
3867 err = btrfs_update_inode(trans, root, inode); 3882 err = btrfs_update_inode(trans, root, inode);
3868 3883
@@ -3944,7 +3959,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3944 3959
3945 d_instantiate(dentry, inode); 3960 d_instantiate(dentry, inode);
3946 drop_on_err = 0; 3961 drop_on_err = 0;
3947 dir->i_sb->s_dirt = 1;
3948 btrfs_update_inode_block_group(trans, inode); 3962 btrfs_update_inode_block_group(trans, inode);
3949 btrfs_update_inode_block_group(trans, dir); 3963 btrfs_update_inode_block_group(trans, dir);
3950 3964
@@ -4628,8 +4642,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4628 ei->last_trans = 0; 4642 ei->last_trans = 0;
4629 ei->logged_trans = 0; 4643 ei->logged_trans = 0;
4630 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 4644 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4631 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4632 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4633 INIT_LIST_HEAD(&ei->i_orphan); 4645 INIT_LIST_HEAD(&ei->i_orphan);
4634 INIT_LIST_HEAD(&ei->ordered_operations); 4646 INIT_LIST_HEAD(&ei->ordered_operations);
4635 return &ei->vfs_inode; 4647 return &ei->vfs_inode;
@@ -4643,13 +4655,6 @@ void btrfs_destroy_inode(struct inode *inode)
4643 WARN_ON(!list_empty(&inode->i_dentry)); 4655 WARN_ON(!list_empty(&inode->i_dentry));
4644 WARN_ON(inode->i_data.nrpages); 4656 WARN_ON(inode->i_data.nrpages);
4645 4657
4646 if (BTRFS_I(inode)->i_acl &&
4647 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4648 posix_acl_release(BTRFS_I(inode)->i_acl);
4649 if (BTRFS_I(inode)->i_default_acl &&
4650 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4651 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4652
4653 /* 4658 /*
4654 * Make sure we're properly removed from the ordered operation 4659 * Make sure we're properly removed from the ordered operation
4655 * lists. 4660 * lists.
@@ -4683,6 +4688,7 @@ void btrfs_destroy_inode(struct inode *inode)
4683 btrfs_put_ordered_extent(ordered); 4688 btrfs_put_ordered_extent(ordered);
4684 } 4689 }
4685 } 4690 }
4691 inode_tree_del(inode);
4686 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 4692 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4687 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 4693 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4688} 4694}
@@ -4786,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4786 * and the replacement file is large. Start IO on it now so 4792 * and the replacement file is large. Start IO on it now so
4787 * we don't add too much work to the end of the transaction 4793 * we don't add too much work to the end of the transaction
4788 */ 4794 */
4789 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && 4795 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
4790 new_inode->i_size &&
4791 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 4796 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4792 filemap_flush(old_inode->i_mapping); 4797 filemap_flush(old_inode->i_mapping);
4793 4798
@@ -4972,7 +4977,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4972 inode->i_op = &btrfs_file_inode_operations; 4977 inode->i_op = &btrfs_file_inode_operations;
4973 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4978 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4974 } 4979 }
4975 dir->i_sb->s_dirt = 1;
4976 btrfs_update_inode_block_group(trans, inode); 4980 btrfs_update_inode_block_group(trans, inode);
4977 btrfs_update_inode_block_group(trans, dir); 4981 btrfs_update_inode_block_group(trans, dir);
4978 if (drop_inode) 4982 if (drop_inode)
@@ -5061,7 +5065,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5061out: 5065out:
5062 if (cur_offset > start) { 5066 if (cur_offset > start) {
5063 inode->i_ctime = CURRENT_TIME; 5067 inode->i_ctime = CURRENT_TIME;
5064 btrfs_set_flag(inode, PREALLOC); 5068 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5065 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5069 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5066 cur_offset > i_size_read(inode)) 5070 cur_offset > i_size_read(inode))
5067 btrfs_i_size_write(inode, cur_offset); 5071 btrfs_i_size_write(inode, cur_offset);
@@ -5084,6 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5084 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5088 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5085 struct extent_map *em; 5089 struct extent_map *em;
5086 struct btrfs_trans_handle *trans; 5090 struct btrfs_trans_handle *trans;
5091 struct btrfs_root *root;
5087 int ret; 5092 int ret;
5088 5093
5089 alloc_start = offset & ~mask; 5094 alloc_start = offset & ~mask;
@@ -5102,6 +5107,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5102 goto out; 5107 goto out;
5103 } 5108 }
5104 5109
5110 root = BTRFS_I(inode)->root;
5111
5112 ret = btrfs_check_data_free_space(root, inode,
5113 alloc_end - alloc_start);
5114 if (ret)
5115 goto out;
5116
5105 locked_end = alloc_end - 1; 5117 locked_end = alloc_end - 1;
5106 while (1) { 5118 while (1) {
5107 struct btrfs_ordered_extent *ordered; 5119 struct btrfs_ordered_extent *ordered;
@@ -5109,7 +5121,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5109 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); 5121 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5110 if (!trans) { 5122 if (!trans) {
5111 ret = -EIO; 5123 ret = -EIO;
5112 goto out; 5124 goto out_free;
5113 } 5125 }
5114 5126
5115 /* the extent lock is ordered inside the running 5127 /* the extent lock is ordered inside the running
@@ -5170,6 +5182,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5170 GFP_NOFS); 5182 GFP_NOFS);
5171 5183
5172 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5184 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5185out_free:
5186 btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
5173out: 5187out:
5174 mutex_unlock(&inode->i_mutex); 5188 mutex_unlock(&inode->i_mutex);
5175 return ret; 5189 return ret;
@@ -5182,7 +5196,7 @@ static int btrfs_set_page_dirty(struct page *page)
5182 5196
5183static int btrfs_permission(struct inode *inode, int mask) 5197static int btrfs_permission(struct inode *inode, int mask)
5184{ 5198{
5185 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) 5199 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
5186 return -EACCES; 5200 return -EACCES;
5187 return generic_permission(inode, mask, btrfs_check_acl); 5201 return generic_permission(inode, mask, btrfs_check_acl);
5188} 5202}
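inode_tree_add()/inode_tree_del() maintain a new per-root red-black tree of in-memory inodes keyed by inode number. A hedged sketch of the matching lookup; inode_tree_search is a hypothetical helper mirroring the insert logic, and a real caller would also hold root->inode_lock and igrab() the result:

	static struct inode *inode_tree_search(struct btrfs_root *root, u64 ino)
	{
		struct rb_node *n = root->inode_tree.rb_node;
		struct btrfs_inode *entry;

		while (n) {
			entry = rb_entry(n, struct btrfs_inode, rb_node);

			if (ino < entry->vfs_inode.i_ino)
				n = n->rb_left;
			else if (ino > entry->vfs_inode.i_ino)
				n = n->rb_right;
			else
				return &entry->vfs_inode;
		}
		return NULL;
	}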
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2624b53ea783..bd88f25889f7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
32#include <linux/mount.h> 31#include <linux/mount.h>
33#include <linux/mpage.h> 32#include <linux/mpage.h>
@@ -50,7 +49,177 @@
50#include "volumes.h" 49#include "volumes.h"
51#include "locking.h" 50#include "locking.h"
52 51
52/* Mask out flags that are inappropriate for the given type of inode. */
53static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
54{
55 if (S_ISDIR(mode))
56 return flags;
57 else if (S_ISREG(mode))
58 return flags & ~FS_DIRSYNC_FL;
59 else
60 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
61}
62
63/*
64 * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
65 */
66static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
67{
68 unsigned int iflags = 0;
69
70 if (flags & BTRFS_INODE_SYNC)
71 iflags |= FS_SYNC_FL;
72 if (flags & BTRFS_INODE_IMMUTABLE)
73 iflags |= FS_IMMUTABLE_FL;
74 if (flags & BTRFS_INODE_APPEND)
75 iflags |= FS_APPEND_FL;
76 if (flags & BTRFS_INODE_NODUMP)
77 iflags |= FS_NODUMP_FL;
78 if (flags & BTRFS_INODE_NOATIME)
79 iflags |= FS_NOATIME_FL;
80 if (flags & BTRFS_INODE_DIRSYNC)
81 iflags |= FS_DIRSYNC_FL;
82
83 return iflags;
84}
85
86/*
87 * Update inode->i_flags based on the btrfs internal flags.
88 */
89void btrfs_update_iflags(struct inode *inode)
90{
91 struct btrfs_inode *ip = BTRFS_I(inode);
92
93 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
94
95 if (ip->flags & BTRFS_INODE_SYNC)
96 inode->i_flags |= S_SYNC;
97 if (ip->flags & BTRFS_INODE_IMMUTABLE)
98 inode->i_flags |= S_IMMUTABLE;
99 if (ip->flags & BTRFS_INODE_APPEND)
100 inode->i_flags |= S_APPEND;
101 if (ip->flags & BTRFS_INODE_NOATIME)
102 inode->i_flags |= S_NOATIME;
103 if (ip->flags & BTRFS_INODE_DIRSYNC)
104 inode->i_flags |= S_DIRSYNC;
105}
106
107/*
108 * Inherit flags from the parent inode.
109 *
 110 * Unlike extN, there are currently no flags we don't want to inherit.
111 */
112void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
113{
114 unsigned int flags;
115
116 if (!dir)
117 return;
118
119 flags = BTRFS_I(dir)->flags;
120
121 if (S_ISREG(inode->i_mode))
122 flags &= ~BTRFS_INODE_DIRSYNC;
123 else if (!S_ISDIR(inode->i_mode))
124 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
125
126 BTRFS_I(inode)->flags = flags;
127 btrfs_update_iflags(inode);
128}
129
130static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
131{
132 struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
133 unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
134
135 if (copy_to_user(arg, &flags, sizeof(flags)))
136 return -EFAULT;
137 return 0;
138}
139
140static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
141{
142 struct inode *inode = file->f_path.dentry->d_inode;
143 struct btrfs_inode *ip = BTRFS_I(inode);
144 struct btrfs_root *root = ip->root;
145 struct btrfs_trans_handle *trans;
146 unsigned int flags, oldflags;
147 int ret;
148
149 if (copy_from_user(&flags, arg, sizeof(flags)))
150 return -EFAULT;
151
152 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
153 FS_NOATIME_FL | FS_NODUMP_FL | \
154 FS_SYNC_FL | FS_DIRSYNC_FL))
155 return -EOPNOTSUPP;
53 156
157 if (!is_owner_or_cap(inode))
158 return -EACCES;
159
160 mutex_lock(&inode->i_mutex);
161
162 flags = btrfs_mask_flags(inode->i_mode, flags);
163 oldflags = btrfs_flags_to_ioctl(ip->flags);
164 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
165 if (!capable(CAP_LINUX_IMMUTABLE)) {
166 ret = -EPERM;
167 goto out_unlock;
168 }
169 }
170
171 ret = mnt_want_write(file->f_path.mnt);
172 if (ret)
173 goto out_unlock;
174
175 if (flags & FS_SYNC_FL)
176 ip->flags |= BTRFS_INODE_SYNC;
177 else
178 ip->flags &= ~BTRFS_INODE_SYNC;
179 if (flags & FS_IMMUTABLE_FL)
180 ip->flags |= BTRFS_INODE_IMMUTABLE;
181 else
182 ip->flags &= ~BTRFS_INODE_IMMUTABLE;
183 if (flags & FS_APPEND_FL)
184 ip->flags |= BTRFS_INODE_APPEND;
185 else
186 ip->flags &= ~BTRFS_INODE_APPEND;
187 if (flags & FS_NODUMP_FL)
188 ip->flags |= BTRFS_INODE_NODUMP;
189 else
190 ip->flags &= ~BTRFS_INODE_NODUMP;
191 if (flags & FS_NOATIME_FL)
192 ip->flags |= BTRFS_INODE_NOATIME;
193 else
194 ip->flags &= ~BTRFS_INODE_NOATIME;
195 if (flags & FS_DIRSYNC_FL)
196 ip->flags |= BTRFS_INODE_DIRSYNC;
197 else
198 ip->flags &= ~BTRFS_INODE_DIRSYNC;
199
200
201 trans = btrfs_join_transaction(root, 1);
202 BUG_ON(!trans);
203
204 ret = btrfs_update_inode(trans, root, inode);
205 BUG_ON(ret);
206
207 btrfs_update_iflags(inode);
208 inode->i_ctime = CURRENT_TIME;
209 btrfs_end_transaction(trans, root);
210
211 mnt_drop_write(file->f_path.mnt);
212 out_unlock:
213 mutex_unlock(&inode->i_mutex);
 214 return ret;
215}
216
217static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
218{
219 struct inode *inode = file->f_path.dentry->d_inode;
220
221 return put_user(inode->i_generation, arg);
222}
54 223
55static noinline int create_subvol(struct btrfs_root *root, 224static noinline int create_subvol(struct btrfs_root *root,
56 struct dentry *dentry, 225 struct dentry *dentry,
@@ -82,22 +251,25 @@ static noinline int create_subvol(struct btrfs_root *root,
82 if (ret) 251 if (ret)
83 goto fail; 252 goto fail;
84 253
85 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 254 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
86 objectid, trans->transid, 0, 0, 0); 255 0, objectid, NULL, 0, 0, 0);
87 if (IS_ERR(leaf)) { 256 if (IS_ERR(leaf)) {
88 ret = PTR_ERR(leaf); 257 ret = PTR_ERR(leaf);
89 goto fail; 258 goto fail;
90 } 259 }
91 260
92 btrfs_set_header_nritems(leaf, 0); 261 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
93 btrfs_set_header_level(leaf, 0);
94 btrfs_set_header_bytenr(leaf, leaf->start); 262 btrfs_set_header_bytenr(leaf, leaf->start);
95 btrfs_set_header_generation(leaf, trans->transid); 263 btrfs_set_header_generation(leaf, trans->transid);
264 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
96 btrfs_set_header_owner(leaf, objectid); 265 btrfs_set_header_owner(leaf, objectid);
97 266
98 write_extent_buffer(leaf, root->fs_info->fsid, 267 write_extent_buffer(leaf, root->fs_info->fsid,
99 (unsigned long)btrfs_header_fsid(leaf), 268 (unsigned long)btrfs_header_fsid(leaf),
100 BTRFS_FSID_SIZE); 269 BTRFS_FSID_SIZE);
270 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
271 (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
272 BTRFS_UUID_SIZE);
101 btrfs_mark_buffer_dirty(leaf); 273 btrfs_mark_buffer_dirty(leaf);
102 274
103 inode_item = &root_item.inode; 275 inode_item = &root_item.inode;
@@ -125,7 +297,7 @@ static noinline int create_subvol(struct btrfs_root *root,
125 btrfs_set_root_dirid(&root_item, new_dirid); 297 btrfs_set_root_dirid(&root_item, new_dirid);
126 298
127 key.objectid = objectid; 299 key.objectid = objectid;
128 key.offset = 1; 300 key.offset = 0;
129 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
130 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 302 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
131 &root_item); 303 &root_item);
@@ -855,7 +1027,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
855 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
856 comp = btrfs_file_extent_compression(leaf, extent); 1028 comp = btrfs_file_extent_compression(leaf, extent);
857 type = btrfs_file_extent_type(leaf, extent); 1029 type = btrfs_file_extent_type(leaf, extent);
858 if (type == BTRFS_FILE_EXTENT_REG) { 1030 if (type == BTRFS_FILE_EXTENT_REG ||
1031 type == BTRFS_FILE_EXTENT_PREALLOC) {
859 disko = btrfs_file_extent_disk_bytenr(leaf, 1032 disko = btrfs_file_extent_disk_bytenr(leaf,
860 extent); 1033 extent);
861 diskl = btrfs_file_extent_disk_num_bytes(leaf, 1034 diskl = btrfs_file_extent_disk_num_bytes(leaf,
@@ -878,7 +1051,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
878 new_key.objectid = inode->i_ino; 1051 new_key.objectid = inode->i_ino;
879 new_key.offset = key.offset + destoff - off; 1052 new_key.offset = key.offset + destoff - off;
880 1053
881 if (type == BTRFS_FILE_EXTENT_REG) { 1054 if (type == BTRFS_FILE_EXTENT_REG ||
1055 type == BTRFS_FILE_EXTENT_PREALLOC) {
882 ret = btrfs_insert_empty_item(trans, root, path, 1056 ret = btrfs_insert_empty_item(trans, root, path,
883 &new_key, size); 1057 &new_key, size);
884 if (ret) 1058 if (ret)
@@ -911,10 +1085,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
911 if (disko) { 1085 if (disko) {
912 inode_add_bytes(inode, datal); 1086 inode_add_bytes(inode, datal);
913 ret = btrfs_inc_extent_ref(trans, root, 1087 ret = btrfs_inc_extent_ref(trans, root,
914 disko, diskl, leaf->start, 1088 disko, diskl, 0,
915 root->root_key.objectid, 1089 root->root_key.objectid,
916 trans->transid, 1090 inode->i_ino,
917 inode->i_ino); 1091 new_key.offset - datao);
918 BUG_ON(ret); 1092 BUG_ON(ret);
919 } 1093 }
920 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 1094 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -1074,6 +1248,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1074 void __user *argp = (void __user *)arg; 1248 void __user *argp = (void __user *)arg;
1075 1249
1076 switch (cmd) { 1250 switch (cmd) {
1251 case FS_IOC_GETFLAGS:
1252 return btrfs_ioctl_getflags(file, argp);
1253 case FS_IOC_SETFLAGS:
1254 return btrfs_ioctl_setflags(file, argp);
1255 case FS_IOC_GETVERSION:
1256 return btrfs_ioctl_getversion(file, argp);
1077 case BTRFS_IOC_SNAP_CREATE: 1257 case BTRFS_IOC_SNAP_CREATE:
1078 return btrfs_ioctl_snap_create(file, argp, 0); 1258 return btrfs_ioctl_snap_create(file, argp, 0);
1079 case BTRFS_IOC_SUBVOL_CREATE: 1259 case BTRFS_IOC_SUBVOL_CREATE:
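Wiring up FS_IOC_GETFLAGS/SETFLAGS/GETVERSION makes btrfs work with chattr(1) and lsattr(1). A small userspace check of the new plumbing (the handler copies a 32-bit flags word; error handling trimmed):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		unsigned int flags = 0;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
			perror(argv[1]);
			return 1;
		}
		flags |= FS_NOATIME_FL;	/* one of the flags btrfs now accepts */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
			perror("FS_IOC_SETFLAGS");
			return 1;
		}
		close(fd);
		return 0;
	}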
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5f8f218c1005..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb,
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item), 45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); 46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47} 47}
48static void print_extent_data_ref(struct extent_buffer *eb,
49 struct btrfs_extent_data_ref *ref)
50{
51 printk(KERN_INFO "\t\textent data backref root %llu "
52 "objectid %llu offset %llu count %u\n",
53 (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
54 (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
55 (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
56 btrfs_extent_data_ref_count(eb, ref));
57}
58
59static void print_extent_item(struct extent_buffer *eb, int slot)
60{
61 struct btrfs_extent_item *ei;
62 struct btrfs_extent_inline_ref *iref;
63 struct btrfs_extent_data_ref *dref;
64 struct btrfs_shared_data_ref *sref;
65 struct btrfs_disk_key key;
66 unsigned long end;
67 unsigned long ptr;
68 int type;
69 u32 item_size = btrfs_item_size_nr(eb, slot);
70 u64 flags;
71 u64 offset;
72
73 if (item_size < sizeof(*ei)) {
74#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
75 struct btrfs_extent_item_v0 *ei0;
76 BUG_ON(item_size != sizeof(*ei0));
77 ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
78 printk(KERN_INFO "\t\textent refs %u\n",
79 btrfs_extent_refs_v0(eb, ei0));
80 return;
81#else
82 BUG();
83#endif
84 }
85
86 ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
87 flags = btrfs_extent_flags(eb, ei);
88
89 printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
90 (unsigned long long)btrfs_extent_refs(eb, ei),
91 (unsigned long long)btrfs_extent_generation(eb, ei),
92 (unsigned long long)flags);
93
94 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
95 struct btrfs_tree_block_info *info;
96 info = (struct btrfs_tree_block_info *)(ei + 1);
97 btrfs_tree_block_key(eb, info, &key);
98 printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
99 "level %d\n",
100 (unsigned long long)btrfs_disk_key_objectid(&key),
101 key.type,
102 (unsigned long long)btrfs_disk_key_offset(&key),
103 btrfs_tree_block_level(eb, info));
104 iref = (struct btrfs_extent_inline_ref *)(info + 1);
105 } else {
106 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
107 }
108
109 ptr = (unsigned long)iref;
110 end = (unsigned long)ei + item_size;
111 while (ptr < end) {
112 iref = (struct btrfs_extent_inline_ref *)ptr;
113 type = btrfs_extent_inline_ref_type(eb, iref);
114 offset = btrfs_extent_inline_ref_offset(eb, iref);
115 switch (type) {
116 case BTRFS_TREE_BLOCK_REF_KEY:
117 printk(KERN_INFO "\t\ttree block backref "
118 "root %llu\n", (unsigned long long)offset);
119 break;
120 case BTRFS_SHARED_BLOCK_REF_KEY:
121 printk(KERN_INFO "\t\tshared block backref "
122 "parent %llu\n", (unsigned long long)offset);
123 break;
124 case BTRFS_EXTENT_DATA_REF_KEY:
125 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
126 print_extent_data_ref(eb, dref);
127 break;
128 case BTRFS_SHARED_DATA_REF_KEY:
129 sref = (struct btrfs_shared_data_ref *)(iref + 1);
130 printk(KERN_INFO "\t\tshared data backref "
131 "parent %llu count %u\n",
132 (unsigned long long)offset,
133 btrfs_shared_data_ref_count(eb, sref));
134 break;
135 default:
136 BUG();
137 }
138 ptr += btrfs_extent_inline_ref_size(type);
139 }
140 WARN_ON(ptr > end);
141}
142
143#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
144static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
145{
146 struct btrfs_extent_ref_v0 *ref0;
147
148 ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
149 printk("\t\textent back ref root %llu gen %llu "
150 "owner %llu num_refs %lu\n",
151 (unsigned long long)btrfs_ref_root_v0(eb, ref0),
152 (unsigned long long)btrfs_ref_generation_v0(eb, ref0),
153 (unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
154 (unsigned long)btrfs_ref_count_v0(eb, ref0));
155}
156#endif
157
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{ 159{
50 int i; 160 int i;
161 u32 type;
51 u32 nr = btrfs_header_nritems(l); 162 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item; 163 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri; 164 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di; 165 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii; 166 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi; 167 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi; 168 struct btrfs_file_extent_item *fi;
169 struct btrfs_extent_data_ref *dref;
170 struct btrfs_shared_data_ref *sref;
171 struct btrfs_dev_extent *dev_extent;
59 struct btrfs_key key; 172 struct btrfs_key key;
60 struct btrfs_key found_key; 173 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64 174
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr, 176 (unsigned long long)btrfs_header_bytenr(l), nr,
@@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
100 btrfs_disk_root_refs(l, ri)); 210 btrfs_disk_root_refs(l, ri));
101 break; 211 break;
102 case BTRFS_EXTENT_ITEM_KEY: 212 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); 213 print_extent_item(l, i);
104 printk(KERN_INFO "\t\textent data refs %u\n", 214 break;
105 btrfs_extent_refs(l, ei)); 215 case BTRFS_TREE_BLOCK_REF_KEY:
106 break; 216 printk(KERN_INFO "\t\ttree block backref\n");
107 case BTRFS_EXTENT_REF_KEY: 217 break;
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); 218 case BTRFS_SHARED_BLOCK_REF_KEY:
109 printk(KERN_INFO "\t\textent back ref root %llu " 219 printk(KERN_INFO "\t\tshared block backref\n");
110 "gen %llu owner %llu num_refs %lu\n", 220 break;
111 (unsigned long long)btrfs_ref_root(l, ref), 221 case BTRFS_EXTENT_DATA_REF_KEY:
112 (unsigned long long)btrfs_ref_generation(l, ref), 222 dref = btrfs_item_ptr(l, i,
113 (unsigned long long)btrfs_ref_objectid(l, ref), 223 struct btrfs_extent_data_ref);
114 (unsigned long)btrfs_ref_num_refs(l, ref)); 224 print_extent_data_ref(l, dref);
225 break;
226 case BTRFS_SHARED_DATA_REF_KEY:
227 sref = btrfs_item_ptr(l, i,
228 struct btrfs_shared_data_ref);
229 printk(KERN_INFO "\t\tshared data backref count %u\n",
230 btrfs_shared_data_ref_count(l, sref));
115 break; 231 break;
116
117 case BTRFS_EXTENT_DATA_KEY: 232 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i, 233 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item); 234 struct btrfs_file_extent_item);
@@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
139 (unsigned long long) 254 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi)); 255 btrfs_file_extent_ram_bytes(l, fi));
141 break; 256 break;
257 case BTRFS_EXTENT_REF_V0_KEY:
258#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
259 print_extent_ref_v0(l, i);
260#else
261 BUG();
262#endif
142 case BTRFS_BLOCK_GROUP_ITEM_KEY: 263 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i, 264 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item); 265 struct btrfs_block_group_item);
@@ -188,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
188 } 309 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", 310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c), 311 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr, 312 level, nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); 313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) { 314 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i); 315 btrfs_node_key_to_cpu(c, &key, i);
@@ -205,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
205 btrfs_level_size(root, level - 1), 326 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i)); 327 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) && 328 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1) 329 level != 1)
209 BUG(); 330 BUG();
210 if (btrfs_header_level(next) != 331 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1) 332 level - 1)
212 BUG(); 333 BUG();
213 btrfs_print_tree(root, next); 334 btrfs_print_tree(root, next);
214 free_extent_buffer(next); 335 free_extent_buffer(next);
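print_extent_item() steps through the inline references by adding the per-type size to ptr. A sketch of that calculation, consistent with the casts above (&iref->offset for a data ref, iref + 1 for a shared data ref); the real helper lives in ctree.h of this series:

	static u32 inline_ref_size(int type)
	{
		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
		    type == BTRFS_SHARED_BLOCK_REF_KEY)
			return sizeof(struct btrfs_extent_inline_ref);
		if (type == BTRFS_SHARED_DATA_REF_KEY)
			return sizeof(struct btrfs_extent_inline_ref) +
			       sizeof(struct btrfs_shared_data_ref);
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			/* the data ref overlays the inline ref's offset field */
			return offsetof(struct btrfs_extent_inline_ref, offset) +
			       sizeof(struct btrfs_extent_data_ref);
		return 0;	/* unknown type; the printer BUG()s */
	}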
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 000000000000..c04f7f212602
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3716 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "volumes.h"
28#include "locking.h"
29#include "btrfs_inode.h"
30#include "async-thread.h"
31
32/*
33 * backref_node, mapping_node and tree_block start with this
34 */
35struct tree_entry {
36 struct rb_node rb_node;
37 u64 bytenr;
38};
39
40/*
 41 * represent a tree block in the backref cache
42 */
43struct backref_node {
44 struct rb_node rb_node;
45 u64 bytenr;
 46 /* objectid of the tree block owner */
47 u64 owner;
 48 /* list of upper level blocks that reference this block */
49 struct list_head upper;
50 /* list of child blocks in the cache */
51 struct list_head lower;
 52 /* NULL if this node is not a tree root */
53 struct btrfs_root *root;
 54 /* extent buffer obtained by COWing the block */
55 struct extent_buffer *eb;
56 /* level of tree block */
57 unsigned int level:8;
 58 /* 1 if the block is the root of an old snapshot */
59 unsigned int old_root:1;
60 /* 1 if no child blocks in the cache */
61 unsigned int lowest:1;
62 /* is the extent buffer locked */
63 unsigned int locked:1;
64 /* has the block been processed */
65 unsigned int processed:1;
66 /* have backrefs of this block been checked */
67 unsigned int checked:1;
68};
69
70/*
 71 * represent a block pointer in the backref cache
72 */
73struct backref_edge {
74 struct list_head list[2];
75 struct backref_node *node[2];
76 u64 blockptr;
77};
78
79#define LOWER 0
80#define UPPER 1
81
82struct backref_cache {
83 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */
86 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock;
88};
89
90/*
 91 * map the address of a tree root to its tree
92 */
93struct mapping_node {
94 struct rb_node rb_node;
95 u64 bytenr;
96 void *data;
97};
98
99struct mapping_tree {
100 struct rb_root rb_root;
101 spinlock_t lock;
102};
103
104/*
 105 * represent a tree block to process
106 */
107struct tree_block {
108 struct rb_node rb_node;
109 u64 bytenr;
110 struct btrfs_key key;
111 unsigned int level:8;
112 unsigned int key_ready:1;
113};
114
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124struct reloc_control {
125 /* block group to relocate */
126 struct btrfs_block_group_cache *block_group;
127 /* extent tree */
128 struct btrfs_root *extent_root;
129 /* inode for moving data */
130 struct inode *data_inode;
131 struct btrfs_workers workers;
 132 /* tree blocks that have been processed */
133 struct extent_io_tree processed_blocks;
134 /* map start of tree root to corresponding reloc tree */
135 struct mapping_tree reloc_root_tree;
136 /* list of reloc trees */
137 struct list_head reloc_roots;
138 u64 search_start;
139 u64 extents_found;
140 u64 extents_skipped;
141 int stage;
142 int create_reloc_root;
143 unsigned int found_file_extent:1;
144 unsigned int found_old_snapshot:1;
145};
146
147/* stages of data relocation */
148#define MOVE_DATA_EXTENTS 0
149#define UPDATE_DATA_PTRS 1
150
151/*
152 * merge the reloc tree into the corresponding fs tree in worker threads
153 */
154struct async_merge {
155 struct btrfs_work work;
156 struct reloc_control *rc;
157 struct btrfs_root *root;
158 struct completion *done;
159 atomic_t *num_pending;
160};
161
162static void mapping_tree_init(struct mapping_tree *tree)
163{
164 tree->rb_root.rb_node = NULL;
165 spin_lock_init(&tree->lock);
166}
167
168static void backref_cache_init(struct backref_cache *cache)
169{
170 int i;
171 cache->rb_root.rb_node = NULL;
172 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
173 INIT_LIST_HEAD(&cache->pending[i]);
174 spin_lock_init(&cache->lock);
175}
176
177static void backref_node_init(struct backref_node *node)
178{
179 memset(node, 0, sizeof(*node));
180 INIT_LIST_HEAD(&node->upper);
181 INIT_LIST_HEAD(&node->lower);
182 RB_CLEAR_NODE(&node->rb_node);
183}
184
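/*
 * insert a node keyed by bytenr. returns the existing node if the
 * bytenr is already in the tree, NULL on successful insertion.
 */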
185static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
186 struct rb_node *node)
187{
188 struct rb_node **p = &root->rb_node;
189 struct rb_node *parent = NULL;
190 struct tree_entry *entry;
191
192 while (*p) {
193 parent = *p;
194 entry = rb_entry(parent, struct tree_entry, rb_node);
195
196 if (bytenr < entry->bytenr)
197 p = &(*p)->rb_left;
198 else if (bytenr > entry->bytenr)
199 p = &(*p)->rb_right;
200 else
201 return parent;
202 }
203
204 rb_link_node(node, parent, p);
205 rb_insert_color(node, root);
206 return NULL;
207}
208
209static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
210{
211 struct rb_node *n = root->rb_node;
212 struct tree_entry *entry;
213
214 while (n) {
215 entry = rb_entry(n, struct tree_entry, rb_node);
216
217 if (bytenr < entry->bytenr)
218 n = n->rb_left;
219 else if (bytenr > entry->bytenr)
220 n = n->rb_right;
221 else
222 return n;
223 }
224 return NULL;
225}
226
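/*
 * the walk_up/walk_down pair below implement a depth-first traversal
 * of all reference paths: edges[0..*index-1] is the stack of edges
 * from the starting node up to the current position. walk_up pushes
 * edges until a root is reached, walk_down backtracks to the next
 * untried sibling edge.
 */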
227/*
228 * walk up backref nodes until we reach the node that represents the tree root
229 */
230static struct backref_node *walk_up_backref(struct backref_node *node,
231 struct backref_edge *edges[],
232 int *index)
233{
234 struct backref_edge *edge;
235 int idx = *index;
236
237 while (!list_empty(&node->upper)) {
238 edge = list_entry(node->upper.next,
239 struct backref_edge, list[LOWER]);
240 edges[idx++] = edge;
241 node = edge->node[UPPER];
242 }
243 *index = idx;
244 return node;
245}
246
247/*
248 * walk down backref nodes to find the start of the next reference path
249 */
250static struct backref_node *walk_down_backref(struct backref_edge *edges[],
251 int *index)
252{
253 struct backref_edge *edge;
254 struct backref_node *lower;
255 int idx = *index;
256
257 while (idx > 0) {
258 edge = edges[idx - 1];
259 lower = edge->node[LOWER];
260 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
261 idx--;
262 continue;
263 }
264 edge = list_entry(edge->list[LOWER].next,
265 struct backref_edge, list[LOWER]);
266 edges[idx - 1] = edge;
267 *index = idx;
268 return edge->node[UPPER];
269 }
270 *index = 0;
271 return NULL;
272}
273
274static void drop_node_buffer(struct backref_node *node)
275{
276 if (node->eb) {
277 if (node->locked) {
278 btrfs_tree_unlock(node->eb);
279 node->locked = 0;
280 }
281 free_extent_buffer(node->eb);
282 node->eb = NULL;
283 }
284}
285
286static void drop_backref_node(struct backref_cache *tree,
287 struct backref_node *node)
288{
289 BUG_ON(!node->lowest);
290 BUG_ON(!list_empty(&node->upper));
291
292 drop_node_buffer(node);
293 list_del(&node->lower);
294
295 rb_erase(&node->rb_node, &tree->rb_root);
296 kfree(node);
297}
298
299/*
300 * remove a backref node from the backref cache
301 */
302static void remove_backref_node(struct backref_cache *cache,
303 struct backref_node *node)
304{
305 struct backref_node *upper;
306 struct backref_edge *edge;
307
308 if (!node)
309 return;
310
311 BUG_ON(!node->lowest);
312 while (!list_empty(&node->upper)) {
313 edge = list_entry(node->upper.next, struct backref_edge,
314 list[LOWER]);
315 upper = edge->node[UPPER];
316 list_del(&edge->list[LOWER]);
317 list_del(&edge->list[UPPER]);
318 kfree(edge);
319 /*
320 * add the node to the pending list if no other
321 * child block is cached.
322 */
323 if (list_empty(&upper->lower)) {
324 list_add_tail(&upper->lower,
325 &cache->pending[upper->level]);
326 upper->lowest = 1;
327 }
328 }
329 drop_backref_node(cache, node);
330}
331
332/*
333 * find reloc tree by address of tree root
334 */
335static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
336 u64 bytenr)
337{
338 struct rb_node *rb_node;
339 struct mapping_node *node;
340 struct btrfs_root *root = NULL;
341
342 spin_lock(&rc->reloc_root_tree.lock);
343 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
344 if (rb_node) {
345 node = rb_entry(rb_node, struct mapping_node, rb_node);
346 root = (struct btrfs_root *)node->data;
347 }
348 spin_unlock(&rc->reloc_root_tree.lock);
349 return root;
350}
351
352static int is_cowonly_root(u64 root_objectid)
353{
354 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
355 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
356 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
357 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
358 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
359 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
360 return 1;
361 return 0;
362}
363
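/*
 * read a tree root. cow-only roots have a single root item with
 * offset 0, while reference counted (subvolume) roots are looked up
 * with offset (u64)-1 to find the latest root item.
 */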
364static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
365 u64 root_objectid)
366{
367 struct btrfs_key key;
368
369 key.objectid = root_objectid;
370 key.type = BTRFS_ROOT_ITEM_KEY;
371 if (is_cowonly_root(root_objectid))
372 key.offset = 0;
373 else
374 key.offset = (u64)-1;
375
376 return btrfs_read_fs_root_no_name(fs_info, &key);
377}
378
379#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
380static noinline_for_stack
381struct btrfs_root *find_tree_root(struct reloc_control *rc,
382 struct extent_buffer *leaf,
383 struct btrfs_extent_ref_v0 *ref0)
384{
385 struct btrfs_root *root;
386 u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
387 u64 generation = btrfs_ref_generation_v0(leaf, ref0);
388
389 BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
390
391 root = read_fs_root(rc->extent_root->fs_info, root_objectid);
392 BUG_ON(IS_ERR(root));
393
394 if (root->ref_cows &&
395 generation != btrfs_root_generation(&root->root_item))
396 return NULL;
397
398 return root;
399}
400#endif
401
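/*
 * locate the inline backrefs of a tree block's extent item.
 * returns 0 with *ptr/*end delimiting the inline ref area, or 1 if
 * all backrefs are stored as separate items (or the item is in the
 * v0 format).
 */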
402static noinline_for_stack
403int find_inline_backref(struct extent_buffer *leaf, int slot,
404 unsigned long *ptr, unsigned long *end)
405{
406 struct btrfs_extent_item *ei;
407 struct btrfs_tree_block_info *bi;
408 u32 item_size;
409
410 item_size = btrfs_item_size_nr(leaf, slot);
411#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
412 if (item_size < sizeof(*ei)) {
413 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
414 return 1;
415 }
416#endif
417 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
418 WARN_ON(!(btrfs_extent_flags(leaf, ei) &
419 BTRFS_EXTENT_FLAG_TREE_BLOCK));
420
421 if (item_size <= sizeof(*ei) + sizeof(*bi)) {
422 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
423 return 1;
424 }
425
426 bi = (struct btrfs_tree_block_info *)(ei + 1);
427 *ptr = (unsigned long)(bi + 1);
428 *end = (unsigned long)ei + item_size;
429 return 0;
430}
431
432/*
433 * build backref tree for a given tree block. root of the backref tree
434 * corresponds to the tree block, leaves of the backref tree
435 * correspond to roots of b-trees that reference the tree block.
436 *
437 * the basic idea of this function is to check backrefs of a given block
438 * to find upper level blocks that reference the block, and then check
439 * backrefs of these upper level blocks recursively. the recursion stops
440 * when the tree root is reached or backrefs for the block are cached.
441 *
442 * NOTE: if we find backrefs for a block are cached, we know backrefs
443 * for all upper level blocks that directly/indirectly reference the
444 * block are also cached.
445 */
446static struct backref_node *build_backref_tree(struct reloc_control *rc,
447 struct backref_cache *cache,
448 struct btrfs_key *node_key,
449 int level, u64 bytenr)
450{
451 struct btrfs_path *path1;
452 struct btrfs_path *path2;
453 struct extent_buffer *eb;
454 struct btrfs_root *root;
455 struct backref_node *cur;
456 struct backref_node *upper;
457 struct backref_node *lower;
458 struct backref_node *node = NULL;
459 struct backref_node *exist = NULL;
460 struct backref_edge *edge;
461 struct rb_node *rb_node;
462 struct btrfs_key key;
463 unsigned long end;
464 unsigned long ptr;
465 LIST_HEAD(list);
466 int ret;
467 int err = 0;
468
469 path1 = btrfs_alloc_path();
470 path2 = btrfs_alloc_path();
471 if (!path1 || !path2) {
472 err = -ENOMEM;
473 goto out;
474 }
475
476 node = kmalloc(sizeof(*node), GFP_NOFS);
477 if (!node) {
478 err = -ENOMEM;
479 goto out;
480 }
481
482 backref_node_init(node);
483 node->bytenr = bytenr;
484 node->owner = 0;
485 node->level = level;
486 node->lowest = 1;
487 cur = node;
488again:
489 end = 0;
490 ptr = 0;
491 key.objectid = cur->bytenr;
492 key.type = BTRFS_EXTENT_ITEM_KEY;
493 key.offset = (u64)-1;
494
495 path1->search_commit_root = 1;
496 path1->skip_locking = 1;
497 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
498 0, 0);
499 if (ret < 0) {
500 err = ret;
501 goto out;
502 }
503 BUG_ON(!ret || !path1->slots[0]);
504
505 path1->slots[0]--;
506
507 WARN_ON(cur->checked);
508 if (!list_empty(&cur->upper)) {
509 /*
510 * the backref was added previously when processing
511 * backref of type BTRFS_TREE_BLOCK_REF_KEY
512 */
513 BUG_ON(!list_is_singular(&cur->upper));
514 edge = list_entry(cur->upper.next, struct backref_edge,
515 list[LOWER]);
516 BUG_ON(!list_empty(&edge->list[UPPER]));
517 exist = edge->node[UPPER];
518 /*
519 * add the upper level block to the pending list if we
520 * need to check its backrefs
521 */
522 if (!exist->checked)
523 list_add_tail(&edge->list[UPPER], &list);
524 } else {
525 exist = NULL;
526 }
527
528 while (1) {
529 cond_resched();
530 eb = path1->nodes[0];
531
532 if (ptr >= end) {
533 if (path1->slots[0] >= btrfs_header_nritems(eb)) {
534 ret = btrfs_next_leaf(rc->extent_root, path1);
535 if (ret < 0) {
536 err = ret;
537 goto out;
538 }
539 if (ret > 0)
540 break;
541 eb = path1->nodes[0];
542 }
543
544 btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
545 if (key.objectid != cur->bytenr) {
546 WARN_ON(exist);
547 break;
548 }
549
550 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
551 ret = find_inline_backref(eb, path1->slots[0],
552 &ptr, &end);
553 if (ret)
554 goto next;
555 }
556 }
557
558 if (ptr < end) {
559 /* update key for inline back ref */
560 struct btrfs_extent_inline_ref *iref;
561 iref = (struct btrfs_extent_inline_ref *)ptr;
562 key.type = btrfs_extent_inline_ref_type(eb, iref);
563 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
564 WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
565 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
566 }
567
568 if (exist &&
569 ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
570 exist->owner == key.offset) ||
571 (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
572 exist->bytenr == key.offset))) {
573 exist = NULL;
574 goto next;
575 }
576
577#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
578 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
579 key.type == BTRFS_EXTENT_REF_V0_KEY) {
580 if (key.objectid == key.offset &&
581 key.type == BTRFS_EXTENT_REF_V0_KEY) {
582 struct btrfs_extent_ref_v0 *ref0;
583 ref0 = btrfs_item_ptr(eb, path1->slots[0],
584 struct btrfs_extent_ref_v0);
585 root = find_tree_root(rc, eb, ref0);
586 if (root)
587 cur->root = root;
588 else
589 cur->old_root = 1;
590 break;
591 }
592#else
593 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
594 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
595#endif
596 if (key.objectid == key.offset) {
597 /*
598 * only root blocks of reloc trees use
599 * backrefs of this type.
600 */
601 root = find_reloc_root(rc, cur->bytenr);
602 BUG_ON(!root);
603 cur->root = root;
604 break;
605 }
606
607 edge = kzalloc(sizeof(*edge), GFP_NOFS);
608 if (!edge) {
609 err = -ENOMEM;
610 goto out;
611 }
612 rb_node = tree_search(&cache->rb_root, key.offset);
613 if (!rb_node) {
614 upper = kmalloc(sizeof(*upper), GFP_NOFS);
615 if (!upper) {
616 kfree(edge);
617 err = -ENOMEM;
618 goto out;
619 }
620 backref_node_init(upper);
621 upper->bytenr = key.offset;
622 upper->owner = 0;
623 upper->level = cur->level + 1;
624 /*
625 * backrefs for the upper level block aren't
626 * cached, add the block to the pending list
627 */
628 list_add_tail(&edge->list[UPPER], &list);
629 } else {
630 upper = rb_entry(rb_node, struct backref_node,
631 rb_node);
632 INIT_LIST_HEAD(&edge->list[UPPER]);
633 }
634 list_add(&edge->list[LOWER], &cur->upper);
635 edge->node[UPPER] = upper;
636 edge->node[LOWER] = cur;
637
638 goto next;
639 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
640 goto next;
641 }
642
643 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
644 root = read_fs_root(rc->extent_root->fs_info, key.offset);
645 if (IS_ERR(root)) {
646 err = PTR_ERR(root);
647 goto out;
648 }
649
650 if (btrfs_root_level(&root->root_item) == cur->level) {
651 /* tree root */
652 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
653 cur->bytenr);
654 cur->root = root;
655 break;
656 }
657
658 level = cur->level + 1;
659
660 /*
661 * search the tree to find upper level blocks
662 * that reference the block.
663 */
664 path2->search_commit_root = 1;
665 path2->skip_locking = 1;
666 path2->lowest_level = level;
667 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
668 path2->lowest_level = 0;
669 if (ret < 0) {
670 err = ret;
671 goto out;
672 }
673 if (ret > 0 && path2->slots[level] > 0)
674 path2->slots[level]--;
675
676 eb = path2->nodes[level];
677 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
678 cur->bytenr);
679
680 lower = cur;
681 for (; level < BTRFS_MAX_LEVEL; level++) {
682 if (!path2->nodes[level]) {
683 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
684 lower->bytenr);
685 lower->root = root;
686 break;
687 }
688
689 edge = kzalloc(sizeof(*edge), GFP_NOFS);
690 if (!edge) {
691 err = -ENOMEM;
692 goto out;
693 }
694
695 eb = path2->nodes[level];
696 rb_node = tree_search(&cache->rb_root, eb->start);
697 if (!rb_node) {
698 upper = kmalloc(sizeof(*upper), GFP_NOFS);
699 if (!upper) {
700 kfree(edge);
701 err = -ENOMEM;
702 goto out;
703 }
704 backref_node_init(upper);
705 upper->bytenr = eb->start;
706 upper->owner = btrfs_header_owner(eb);
707 upper->level = lower->level + 1;
708
709 /*
710 * if we know the block isn't shared
711 * we can avoid checking its backrefs.
712 */
713 if (btrfs_block_can_be_shared(root, eb))
714 upper->checked = 0;
715 else
716 upper->checked = 1;
717
718 /*
719 * add the block to the pending list if we
720 * need to check its backrefs. only blocks
721 * at 'cur->level + 1' are added to the
722 * tail of the pending list. this guarantees
723 * we check backrefs from lower level
724 * blocks to upper level blocks.
725 */
726 if (!upper->checked &&
727 level == cur->level + 1) {
728 list_add_tail(&edge->list[UPPER],
729 &list);
730 } else
731 INIT_LIST_HEAD(&edge->list[UPPER]);
732 } else {
733 upper = rb_entry(rb_node, struct backref_node,
734 rb_node);
735 BUG_ON(!upper->checked);
736 INIT_LIST_HEAD(&edge->list[UPPER]);
737 }
738 list_add_tail(&edge->list[LOWER], &lower->upper);
739 edge->node[UPPER] = upper;
740 edge->node[LOWER] = lower;
741
742 if (rb_node)
743 break;
744 lower = upper;
745 upper = NULL;
746 }
747 btrfs_release_path(root, path2);
748next:
749 if (ptr < end) {
750 ptr += btrfs_extent_inline_ref_size(key.type);
751 if (ptr >= end) {
752 WARN_ON(ptr > end);
753 ptr = 0;
754 end = 0;
755 }
756 }
757 if (ptr >= end)
758 path1->slots[0]++;
759 }
760 btrfs_release_path(rc->extent_root, path1);
761
762 cur->checked = 1;
763 WARN_ON(exist);
764
765 /* the pending list isn't empty, take the first block to process */
766 if (!list_empty(&list)) {
767 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
768 list_del_init(&edge->list[UPPER]);
769 cur = edge->node[UPPER];
770 goto again;
771 }
772
773 /*
774 * everything went well, connect the backref nodes and insert
775 * them into the cache.
776 */
777 BUG_ON(!node->checked);
778 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
779 BUG_ON(rb_node);
780
781 list_for_each_entry(edge, &node->upper, list[LOWER])
782 list_add_tail(&edge->list[UPPER], &list);
783
784 while (!list_empty(&list)) {
785 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
786 list_del_init(&edge->list[UPPER]);
787 upper = edge->node[UPPER];
788
789 if (!RB_EMPTY_NODE(&upper->rb_node)) {
790 if (upper->lowest) {
791 list_del_init(&upper->lower);
792 upper->lowest = 0;
793 }
794
795 list_add_tail(&edge->list[UPPER], &upper->lower);
796 continue;
797 }
798
799 BUG_ON(!upper->checked);
800 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
801 &upper->rb_node);
802 BUG_ON(rb_node);
803
804 list_add_tail(&edge->list[UPPER], &upper->lower);
805
806 list_for_each_entry(edge, &upper->upper, list[LOWER])
807 list_add_tail(&edge->list[UPPER], &list);
808 }
809out:
810 btrfs_free_path(path1);
811 btrfs_free_path(path2);
812 if (err) {
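/*
 * error path: walk the chain of half-built nodes that were never
 * inserted into the cache and free them together with their edges.
 */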
813 INIT_LIST_HEAD(&list);
814 upper = node;
815 while (upper) {
816 if (RB_EMPTY_NODE(&upper->rb_node)) {
817 list_splice_tail(&upper->upper, &list);
818 kfree(upper);
819 }
820
821 if (list_empty(&list))
822 break;
823
824 edge = list_entry(list.next, struct backref_edge,
825 list[LOWER]);
826 upper = edge->node[UPPER];
827 kfree(edge);
828 }
829 return ERR_PTR(err);
830 }
831 return node;
832}
833
834/*
835 * helper to add 'address of tree root -> reloc tree' mapping
836 */
837static int __add_reloc_root(struct btrfs_root *root)
838{
839 struct rb_node *rb_node;
840 struct mapping_node *node;
841 struct reloc_control *rc = root->fs_info->reloc_ctl;
842
843 node = kmalloc(sizeof(*node), GFP_NOFS);
844 BUG_ON(!node);
845
846 node->bytenr = root->node->start;
847 node->data = root;
848
849 spin_lock(&rc->reloc_root_tree.lock);
850 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
851 node->bytenr, &node->rb_node);
852 spin_unlock(&rc->reloc_root_tree.lock);
853 BUG_ON(rb_node);
854
855 list_add_tail(&root->root_list, &rc->reloc_roots);
856 return 0;
857}
858
859/*
860 * helper to update/delete the 'address of tree root -> reloc tree'
861 * mapping
862 */
863static int __update_reloc_root(struct btrfs_root *root, int del)
864{
865 struct rb_node *rb_node;
866 struct mapping_node *node = NULL;
867 struct reloc_control *rc = root->fs_info->reloc_ctl;
868
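/*
 * the mapping is keyed by the address the root node had when the
 * mapping was last recorded, which is the current commit root.
 */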
869 spin_lock(&rc->reloc_root_tree.lock);
870 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
871 root->commit_root->start);
872 if (rb_node) {
873 node = rb_entry(rb_node, struct mapping_node, rb_node);
874 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
875 }
876 spin_unlock(&rc->reloc_root_tree.lock);
877
878 BUG_ON((struct btrfs_root *)node->data != root);
879
880 if (!del) {
881 spin_lock(&rc->reloc_root_tree.lock);
882 node->bytenr = root->node->start;
883 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
884 node->bytenr, &node->rb_node);
885 spin_unlock(&rc->reloc_root_tree.lock);
886 BUG_ON(rb_node);
887 } else {
888 list_del_init(&root->root_list);
889 kfree(node);
890 }
891 return 0;
892}
893
894/*
895 * create reloc tree for a given fs tree. reloc tree is just a
896 * snapshot of the fs tree with a special root objectid.
897 */
898int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
899 struct btrfs_root *root)
900{
901 struct btrfs_root *reloc_root;
902 struct extent_buffer *eb;
903 struct btrfs_root_item *root_item;
904 struct btrfs_key root_key;
905 int ret;
906
907 if (root->reloc_root) {
908 reloc_root = root->reloc_root;
909 reloc_root->last_trans = trans->transid;
910 return 0;
911 }
912
913 if (!root->fs_info->reloc_ctl ||
914 !root->fs_info->reloc_ctl->create_reloc_root ||
915 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
916 return 0;
917
918 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
919 BUG_ON(!root_item);
920
921 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
922 root_key.type = BTRFS_ROOT_ITEM_KEY;
923 root_key.offset = root->root_key.objectid;
924
925 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
926 BTRFS_TREE_RELOC_OBJECTID);
927 BUG_ON(ret);
928
929 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
930 memcpy(root_item, &root->root_item, sizeof(*root_item));
931 btrfs_set_root_refs(root_item, 1);
932 btrfs_set_root_bytenr(root_item, eb->start);
933 btrfs_set_root_level(root_item, btrfs_header_level(eb));
934 btrfs_set_root_generation(root_item, trans->transid);
935 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
936 root_item->drop_level = 0;
937
938 btrfs_tree_unlock(eb);
939 free_extent_buffer(eb);
940
941 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
942 &root_key, root_item);
943 BUG_ON(ret);
944 kfree(root_item);
945
946 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
947 &root_key);
948 BUG_ON(IS_ERR(reloc_root));
949 reloc_root->last_trans = trans->transid;
950
951 __add_reloc_root(reloc_root);
952 root->reloc_root = reloc_root;
953 return 0;
954}
955
956/*
957 * update root item of reloc tree
958 */
959int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
960 struct btrfs_root *root)
961{
962 struct btrfs_root *reloc_root;
963 struct btrfs_root_item *root_item;
964 int del = 0;
965 int ret;
966
967 if (!root->reloc_root)
968 return 0;
969
970 reloc_root = root->reloc_root;
971 root_item = &reloc_root->root_item;
972
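/*
 * a reloc tree whose root item refs have dropped to zero is no
 * longer needed: delete its mapping instead of updating it.
 */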
973 if (btrfs_root_refs(root_item) == 0) {
974 root->reloc_root = NULL;
975 del = 1;
976 }
977
978 __update_reloc_root(reloc_root, del);
979
980 if (reloc_root->commit_root != reloc_root->node) {
981 btrfs_set_root_node(root_item, reloc_root->node);
982 free_extent_buffer(reloc_root->commit_root);
983 reloc_root->commit_root = btrfs_root_node(reloc_root);
984 }
985
986 ret = btrfs_update_root(trans, root->fs_info->tree_root,
987 &reloc_root->root_key, root_item);
988 BUG_ON(ret);
989 return 0;
990}
991
992/*
993 * helper to find the first cached inode with inode number >= objectid
994 * in a subvolume
995 */
996static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
997{
998 struct rb_node *node;
999 struct rb_node *prev;
1000 struct btrfs_inode *entry;
1001 struct inode *inode;
1002
1003 spin_lock(&root->inode_lock);
1004again:
1005 node = root->inode_tree.rb_node;
1006 prev = NULL;
1007 while (node) {
1008 prev = node;
1009 entry = rb_entry(node, struct btrfs_inode, rb_node);
1010
1011 if (objectid < entry->vfs_inode.i_ino)
1012 node = node->rb_left;
1013 else if (objectid > entry->vfs_inode.i_ino)
1014 node = node->rb_right;
1015 else
1016 break;
1017 }
1018 if (!node) {
1019 while (prev) {
1020 entry = rb_entry(prev, struct btrfs_inode, rb_node);
1021 if (objectid <= entry->vfs_inode.i_ino) {
1022 node = prev;
1023 break;
1024 }
1025 prev = rb_next(prev);
1026 }
1027 }
1028 while (node) {
1029 entry = rb_entry(node, struct btrfs_inode, rb_node);
1030 inode = igrab(&entry->vfs_inode);
1031 if (inode) {
1032 spin_unlock(&root->inode_lock);
1033 return inode;
1034 }
1035
1036 objectid = entry->vfs_inode.i_ino + 1;
1037 if (cond_resched_lock(&root->inode_lock))
1038 goto again;
1039
1040 node = rb_next(node);
1041 }
1042 spin_unlock(&root->inode_lock);
1043 return NULL;
1044}
1045
1046static int in_block_group(u64 bytenr,
1047 struct btrfs_block_group_cache *block_group)
1048{
1049 if (bytenr >= block_group->key.objectid &&
1050 bytenr < block_group->key.objectid + block_group->key.offset)
1051 return 1;
1052 return 0;
1053}
1054
1055/*
1056 * get new location of data
1057 */
1058static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1059 u64 bytenr, u64 num_bytes)
1060{
1061 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1062 struct btrfs_path *path;
1063 struct btrfs_file_extent_item *fi;
1064 struct extent_buffer *leaf;
1065 int ret;
1066
1067 path = btrfs_alloc_path();
1068 if (!path)
1069 return -ENOMEM;
1070
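/*
 * index_cnt of the relocation inode is expected to hold the start
 * offset of the block group, so this maps the disk bytenr to the
 * file offset the data was copied to.
 */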
1071 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1072 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1073 bytenr, 0);
1074 if (ret < 0)
1075 goto out;
1076 if (ret > 0) {
1077 ret = -ENOENT;
1078 goto out;
1079 }
1080
1081 leaf = path->nodes[0];
1082 fi = btrfs_item_ptr(leaf, path->slots[0],
1083 struct btrfs_file_extent_item);
1084
1085 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1086 btrfs_file_extent_compression(leaf, fi) ||
1087 btrfs_file_extent_encryption(leaf, fi) ||
1088 btrfs_file_extent_other_encoding(leaf, fi));
1089
1090 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1091 ret = 1;
1092 goto out;
1093 }
1094
1095 if (new_bytenr)
1096 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1097 ret = 0;
1098out:
1099 btrfs_free_path(path);
1100 return ret;
1101}
1102
1103/*
1104 * update file extent items in the tree leaf to point to
1105 * the new locations.
1106 */
1107static int replace_file_extents(struct btrfs_trans_handle *trans,
1108 struct reloc_control *rc,
1109 struct btrfs_root *root,
1110 struct extent_buffer *leaf,
1111 struct list_head *inode_list)
1112{
1113 struct btrfs_key key;
1114 struct btrfs_file_extent_item *fi;
1115 struct inode *inode = NULL;
1116 struct inodevec *ivec = NULL;
1117 u64 parent;
1118 u64 bytenr;
1119 u64 new_bytenr;
1120 u64 num_bytes;
1121 u64 end;
1122 u32 nritems;
1123 u32 i;
1124 int ret;
1125 int first = 1;
1126 int dirty = 0;
1127
1128 if (rc->stage != UPDATE_DATA_PTRS)
1129 return 0;
1130
1131 /* reloc trees always use full backref */
1132 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1133 parent = leaf->start;
1134 else
1135 parent = 0;
1136
1137 nritems = btrfs_header_nritems(leaf);
1138 for (i = 0; i < nritems; i++) {
1139 cond_resched();
1140 btrfs_item_key_to_cpu(leaf, &key, i);
1141 if (key.type != BTRFS_EXTENT_DATA_KEY)
1142 continue;
1143 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1144 if (btrfs_file_extent_type(leaf, fi) ==
1145 BTRFS_FILE_EXTENT_INLINE)
1146 continue;
1147 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1148 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1149 if (bytenr == 0)
1150 continue;
1151 if (!in_block_group(bytenr, rc->block_group))
1152 continue;
1153
1154 /*
1155 * if we are modifying a block in the fs tree, wait for readpage
1156 * to complete and drop the extent cache
1157 */
1158 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1159 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1160 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1161 BUG_ON(!ivec);
1162 ivec->nr = 0;
1163 list_add_tail(&ivec->list, inode_list);
1164 }
1165 if (first) {
1166 inode = find_next_inode(root, key.objectid);
1167 if (inode)
1168 ivec->inode[ivec->nr++] = inode;
1169 first = 0;
1170 } else if (inode && inode->i_ino < key.objectid) {
1171 inode = find_next_inode(root, key.objectid);
1172 if (inode)
1173 ivec->inode[ivec->nr++] = inode;
1174 }
1175 if (inode && inode->i_ino == key.objectid) {
1176 end = key.offset +
1177 btrfs_file_extent_num_bytes(leaf, fi);
1178 WARN_ON(!IS_ALIGNED(key.offset,
1179 root->sectorsize));
1180 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1181 end--;
1182 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1183 key.offset, end,
1184 GFP_NOFS);
1185 if (!ret)
1186 continue;
1187
1188 btrfs_drop_extent_cache(inode, key.offset, end,
1189 1);
1190 unlock_extent(&BTRFS_I(inode)->io_tree,
1191 key.offset, end, GFP_NOFS);
1192 }
1193 }
1194
1195 ret = get_new_location(rc->data_inode, &new_bytenr,
1196 bytenr, num_bytes);
1197 if (ret > 0)
1198 continue;
1199 BUG_ON(ret < 0);
1200
1201 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1202 dirty = 1;
1203
1204 key.offset -= btrfs_file_extent_offset(leaf, fi);
1205 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1206 num_bytes, parent,
1207 btrfs_header_owner(leaf),
1208 key.objectid, key.offset);
1209 BUG_ON(ret);
1210
1211 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1212 parent, btrfs_header_owner(leaf),
1213 key.objectid, key.offset);
1214 BUG_ON(ret);
1215 }
1216 if (dirty)
1217 btrfs_mark_buffer_dirty(leaf);
1218 return 0;
1219}
1220
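/*
 * compare the node key at 'slot' in eb with the key at the current
 * position of 'path' at the given level.
 */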
1221static noinline_for_stack
1222int memcmp_node_keys(struct extent_buffer *eb, int slot,
1223 struct btrfs_path *path, int level)
1224{
1225 struct btrfs_disk_key key1;
1226 struct btrfs_disk_key key2;
1227 btrfs_node_key(eb, &key1, slot);
1228 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1229 return memcmp(&key1, &key2, sizeof(key1));
1230}
1231
1232/*
1233 * try to replace tree blocks in fs tree with the new blocks
1234 * in the reloc tree. tree blocks that haven't been modified since
1235 * the reloc tree was created can be replaced.
1236 *
1237 * if a block was replaced, level of the block + 1 is returned.
1238 * if no block got replaced, 0 is returned. if there are other
1239 * errors, a negative error number is returned.
1240 */
1241static int replace_path(struct btrfs_trans_handle *trans,
1242 struct btrfs_root *dest, struct btrfs_root *src,
1243 struct btrfs_path *path, struct btrfs_key *next_key,
1244 struct extent_buffer **leaf,
1245 int lowest_level, int max_level)
1246{
1247 struct extent_buffer *eb;
1248 struct extent_buffer *parent;
1249 struct btrfs_key key;
1250 u64 old_bytenr;
1251 u64 new_bytenr;
1252 u64 old_ptr_gen;
1253 u64 new_ptr_gen;
1254 u64 last_snapshot;
1255 u32 blocksize;
1256 int level;
1257 int ret;
1258 int slot;
1259
1260 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1261 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1262 BUG_ON(lowest_level > 1 && leaf);
1263
1264 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1265
1266 slot = path->slots[lowest_level];
1267 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1268
1269 eb = btrfs_lock_root_node(dest);
1270 btrfs_set_lock_blocking(eb);
1271 level = btrfs_header_level(eb);
1272
1273 if (level < lowest_level) {
1274 btrfs_tree_unlock(eb);
1275 free_extent_buffer(eb);
1276 return 0;
1277 }
1278
1279 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1280 BUG_ON(ret);
1281 btrfs_set_lock_blocking(eb);
1282
1283 if (next_key) {
1284 next_key->objectid = (u64)-1;
1285 next_key->type = (u8)-1;
1286 next_key->offset = (u64)-1;
1287 }
1288
1289 parent = eb;
1290 while (1) {
1291 level = btrfs_header_level(parent);
1292 BUG_ON(level < lowest_level);
1293
1294 ret = btrfs_bin_search(parent, &key, level, &slot);
1295 if (ret && slot > 0)
1296 slot--;
1297
1298 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1299 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1300
1301 old_bytenr = btrfs_node_blockptr(parent, slot);
1302 blocksize = btrfs_level_size(dest, level - 1);
1303 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1304
1305 if (level <= max_level) {
1306 eb = path->nodes[level];
1307 new_bytenr = btrfs_node_blockptr(eb,
1308 path->slots[level]);
1309 new_ptr_gen = btrfs_node_ptr_generation(eb,
1310 path->slots[level]);
1311 } else {
1312 new_bytenr = 0;
1313 new_ptr_gen = 0;
1314 }
1315
1316 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1317 WARN_ON(1);
1318 ret = level;
1319 break;
1320 }
1321
1322 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1323 memcmp_node_keys(parent, slot, path, level)) {
1324 if (level <= lowest_level && !leaf) {
1325 ret = 0;
1326 break;
1327 }
1328
1329 eb = read_tree_block(dest, old_bytenr, blocksize,
1330 old_ptr_gen);
1331 btrfs_tree_lock(eb);
1332 ret = btrfs_cow_block(trans, dest, eb, parent,
1333 slot, &eb);
1334 BUG_ON(ret);
1335 btrfs_set_lock_blocking(eb);
1336
1337 if (level <= lowest_level) {
1338 *leaf = eb;
1339 ret = 0;
1340 break;
1341 }
1342
1343 btrfs_tree_unlock(parent);
1344 free_extent_buffer(parent);
1345
1346 parent = eb;
1347 continue;
1348 }
1349
1350 btrfs_node_key_to_cpu(path->nodes[level], &key,
1351 path->slots[level]);
1352 btrfs_release_path(src, path);
1353
1354 path->lowest_level = level;
1355 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1356 path->lowest_level = 0;
1357 BUG_ON(ret);
1358
1359 /*
1360 * swap blocks in fs tree and reloc tree.
1361 */
1362 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1363 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1364 btrfs_mark_buffer_dirty(parent);
1365
1366 btrfs_set_node_blockptr(path->nodes[level],
1367 path->slots[level], old_bytenr);
1368 btrfs_set_node_ptr_generation(path->nodes[level],
1369 path->slots[level], old_ptr_gen);
1370 btrfs_mark_buffer_dirty(path->nodes[level]);
1371
1372 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1373 path->nodes[level]->start,
1374 src->root_key.objectid, level - 1, 0);
1375 BUG_ON(ret);
1376 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1377 0, dest->root_key.objectid, level - 1,
1378 0);
1379 BUG_ON(ret);
1380
1381 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1382 path->nodes[level]->start,
1383 src->root_key.objectid, level - 1, 0);
1384 BUG_ON(ret);
1385
1386 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1387 0, dest->root_key.objectid, level - 1,
1388 0);
1389 BUG_ON(ret);
1390
1391 btrfs_unlock_up_safe(path, 0);
1392
1393 ret = level;
1394 break;
1395 }
1396 btrfs_tree_unlock(parent);
1397 free_extent_buffer(parent);
1398 return ret;
1399}
1400
1401/*
1402 * helper to find the next relocated block in the reloc tree
1403 */
1404static noinline_for_stack
1405int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1406 int *level)
1407{
1408 struct extent_buffer *eb;
1409 int i;
1410 u64 last_snapshot;
1411 u32 nritems;
1412
1413 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1414
1415 for (i = 0; i < *level; i++) {
1416 free_extent_buffer(path->nodes[i]);
1417 path->nodes[i] = NULL;
1418 }
1419
1420 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1421 eb = path->nodes[i];
1422 nritems = btrfs_header_nritems(eb);
1423 while (path->slots[i] + 1 < nritems) {
1424 path->slots[i]++;
1425 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1426 last_snapshot)
1427 continue;
1428
1429 *level = i;
1430 return 0;
1431 }
1432 free_extent_buffer(path->nodes[i]);
1433 path->nodes[i] = NULL;
1434 }
1435 return 1;
1436}
1437
1438/*
1439 * walk down the reloc tree to find the relocated block of the lowest level
1440 */
1441static noinline_for_stack
1442int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1443 int *level)
1444{
1445 struct extent_buffer *eb = NULL;
1446 int i;
1447 u64 bytenr;
1448 u64 ptr_gen = 0;
1449 u64 last_snapshot;
1450 u32 blocksize;
1451 u32 nritems;
1452
1453 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1454
1455 for (i = *level; i > 0; i--) {
1456 eb = path->nodes[i];
1457 nritems = btrfs_header_nritems(eb);
1458 while (path->slots[i] < nritems) {
1459 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1460 if (ptr_gen > last_snapshot)
1461 break;
1462 path->slots[i]++;
1463 }
1464 if (path->slots[i] >= nritems) {
1465 if (i == *level)
1466 break;
1467 *level = i + 1;
1468 return 0;
1469 }
1470 if (i == 1) {
1471 *level = i;
1472 return 0;
1473 }
1474
1475 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1476 blocksize = btrfs_level_size(root, i - 1);
1477 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1478 BUG_ON(btrfs_header_level(eb) != i - 1);
1479 path->nodes[i - 1] = eb;
1480 path->slots[i - 1] = 0;
1481 }
1482 return 1;
1483}
1484
1485/*
1486 * invalidate the extent cache for file extents whose keys are in the range of
1487 * [min_key, max_key)
1488 */
1489static int invalidate_extent_cache(struct btrfs_root *root,
1490 struct btrfs_key *min_key,
1491 struct btrfs_key *max_key)
1492{
1493 struct inode *inode = NULL;
1494 u64 objectid;
1495 u64 start, end;
1496
1497 objectid = min_key->objectid;
1498 while (1) {
1499 cond_resched();
1500 iput(inode);
1501
1502 if (objectid > max_key->objectid)
1503 break;
1504
1505 inode = find_next_inode(root, objectid);
1506 if (!inode)
1507 break;
1508
1509 if (inode->i_ino > max_key->objectid) {
1510 iput(inode);
1511 break;
1512 }
1513
1514 objectid = inode->i_ino + 1;
1515 if (!S_ISREG(inode->i_mode))
1516 continue;
1517
1518 if (unlikely(min_key->objectid == inode->i_ino)) {
1519 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1520 continue;
1521 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1522 start = 0;
1523 else {
1524 start = min_key->offset;
1525 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1526 }
1527 } else {
1528 start = 0;
1529 }
1530
1531 if (unlikely(max_key->objectid == inode->i_ino)) {
1532 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1533 continue;
1534 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1535 end = (u64)-1;
1536 } else {
1537 if (max_key->offset == 0)
1538 continue;
1539 end = max_key->offset;
1540 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1541 end--;
1542 }
1543 } else {
1544 end = (u64)-1;
1545 }
1546
1547 /* the lock_extent waits for readpage to complete */
1548 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1549 btrfs_drop_extent_cache(inode, start, end, 1);
1550 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1551 }
1552 return 0;
1553}
1554
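/*
 * find the first key after the current path position, searching
 * upward through the levels. returns 1 if the path is at the end
 * of the tree.
 */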
1555static int find_next_key(struct btrfs_path *path, int level,
1556 struct btrfs_key *key)
1557
1558{
1559 while (level < BTRFS_MAX_LEVEL) {
1560 if (!path->nodes[level])
1561 break;
1562 if (path->slots[level] + 1 <
1563 btrfs_header_nritems(path->nodes[level])) {
1564 btrfs_node_key_to_cpu(path->nodes[level], key,
1565 path->slots[level] + 1);
1566 return 0;
1567 }
1568 level++;
1569 }
1570 return 1;
1571}
1572
1573/*
1574 * merge the relocated tree blocks in the reloc tree with the corresponding
1575 * fs tree.
1576 */
1577static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1578 struct btrfs_root *root)
1579{
1580 LIST_HEAD(inode_list);
1581 struct btrfs_key key;
1582 struct btrfs_key next_key;
1583 struct btrfs_trans_handle *trans;
1584 struct btrfs_root *reloc_root;
1585 struct btrfs_root_item *root_item;
1586 struct btrfs_path *path;
1587 struct extent_buffer *leaf = NULL;
1588 unsigned long nr;
1589 int level;
1590 int max_level;
1591 int replaced = 0;
1592 int ret;
1593 int err = 0;
1594
1595 path = btrfs_alloc_path();
1596 if (!path)
1597 return -ENOMEM;
1598
1599 reloc_root = root->reloc_root;
1600 root_item = &reloc_root->root_item;
1601
1602 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1603 level = btrfs_root_level(root_item);
1604 extent_buffer_get(reloc_root->node);
1605 path->nodes[level] = reloc_root->node;
1606 path->slots[level] = 0;
1607 } else {
1608 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1609
1610 level = root_item->drop_level;
1611 BUG_ON(level == 0);
1612 path->lowest_level = level;
1613 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1614 path->lowest_level = 0;
1615 if (ret < 0) {
1616 btrfs_free_path(path);
1617 return ret;
1618 }
1619
1620 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1621 path->slots[level]);
1622 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1623
1624 btrfs_unlock_up_safe(path, 0);
1625 }
1626
1627 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1628 trans = btrfs_start_transaction(root, 1);
1629
1630 leaf = path->nodes[0];
1631 btrfs_item_key_to_cpu(leaf, &key, 0);
1632 btrfs_release_path(reloc_root, path);
1633
1634 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1635 if (ret < 0) {
1636 err = ret;
1637 goto out;
1638 }
1639
1640 leaf = path->nodes[0];
1641 btrfs_unlock_up_safe(path, 1);
1642 ret = replace_file_extents(trans, rc, root, leaf,
1643 &inode_list);
1644 if (ret < 0)
1645 err = ret;
1646 goto out;
1647 }
1648
1649 memset(&next_key, 0, sizeof(next_key));
1650
1651 while (1) {
1652 leaf = NULL;
1653 replaced = 0;
1654 trans = btrfs_start_transaction(root, 1);
1655 max_level = level;
1656
1657 ret = walk_down_reloc_tree(reloc_root, path, &level);
1658 if (ret < 0) {
1659 err = ret;
1660 goto out;
1661 }
1662 if (ret > 0)
1663 break;
1664
1665 if (!find_next_key(path, level, &key) &&
1666 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1667 ret = 0;
1668 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1669 ret = replace_path(trans, root, reloc_root,
1670 path, &next_key, &leaf,
1671 level, max_level);
1672 } else {
1673 ret = replace_path(trans, root, reloc_root,
1674 path, &next_key, NULL,
1675 level, max_level);
1676 }
1677 if (ret < 0) {
1678 err = ret;
1679 goto out;
1680 }
1681
1682 if (ret > 0) {
1683 level = ret;
1684 btrfs_node_key_to_cpu(path->nodes[level], &key,
1685 path->slots[level]);
1686 replaced = 1;
1687 } else if (leaf) {
1688 /*
1689 * no block got replaced, try replacing file extents
1690 */
1691 btrfs_item_key_to_cpu(leaf, &key, 0);
1692 ret = replace_file_extents(trans, rc, root, leaf,
1693 &inode_list);
1694 btrfs_tree_unlock(leaf);
1695 free_extent_buffer(leaf);
1696 BUG_ON(ret < 0);
1697 }
1698
1699 ret = walk_up_reloc_tree(reloc_root, path, &level);
1700 if (ret > 0)
1701 break;
1702
1703 BUG_ON(level == 0);
1704 /*
1705 * save the merging progress in the drop_progress.
1706 * this is OK since root refs == 1 in this case.
1707 */
1708 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1709 path->slots[level]);
1710 root_item->drop_level = level;
1711
1712 nr = trans->blocks_used;
1713 btrfs_end_transaction(trans, root);
1714
1715 btrfs_btree_balance_dirty(root, nr);
1716
1717 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1718 invalidate_extent_cache(root, &key, &next_key);
1719 }
1720
1721 /*
1722 * handle the case where only one block in the fs tree needs to be
1723 * relocated and that block is the tree root.
1724 */
1725 leaf = btrfs_lock_root_node(root);
1726 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1727 btrfs_tree_unlock(leaf);
1728 free_extent_buffer(leaf);
1729 if (ret < 0)
1730 err = ret;
1731out:
1732 btrfs_free_path(path);
1733
1734 if (err == 0) {
1735 memset(&root_item->drop_progress, 0,
1736 sizeof(root_item->drop_progress));
1737 root_item->drop_level = 0;
1738 btrfs_set_root_refs(root_item, 0);
1739 }
1740
1741 nr = trans->blocks_used;
1742 btrfs_end_transaction(trans, root);
1743
1744 btrfs_btree_balance_dirty(root, nr);
1745
1746 /*
1747 * put inodes while we aren't holding the tree locks
1748 */
1749 while (!list_empty(&inode_list)) {
1750 struct inodevec *ivec;
1751 ivec = list_entry(inode_list.next, struct inodevec, list);
1752 list_del(&ivec->list);
1753 while (ivec->nr > 0) {
1754 ivec->nr--;
1755 iput(ivec->inode[ivec->nr]);
1756 }
1757 kfree(ivec);
1758 }
1759
1760 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1761 invalidate_extent_cache(root, &key, &next_key);
1762
1763 return err;
1764}
1765
1766/*
1767 * callback for the work threads.
1768 * this function merges the reloc tree with the corresponding fs tree,
1769 * and then drops the reloc tree.
1770 */
1771static void merge_func(struct btrfs_work *work)
1772{
1773 struct btrfs_trans_handle *trans;
1774 struct btrfs_root *root;
1775 struct btrfs_root *reloc_root;
1776 struct async_merge *async;
1777
1778 async = container_of(work, struct async_merge, work);
1779 reloc_root = async->root;
1780
1781 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1782 root = read_fs_root(reloc_root->fs_info,
1783 reloc_root->root_key.offset);
1784 BUG_ON(IS_ERR(root));
1785 BUG_ON(root->reloc_root != reloc_root);
1786
1787 merge_reloc_root(async->rc, root);
1788
1789 trans = btrfs_start_transaction(root, 1);
1790 btrfs_update_reloc_root(trans, root);
1791 btrfs_end_transaction(trans, root);
1792 }
1793
1794 btrfs_drop_snapshot(reloc_root, 0);
1795
1796 if (atomic_dec_and_test(async->num_pending))
1797 complete(async->done);
1798
1799 kfree(async);
1800}
1801
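/*
 * merge all reloc trees tracked by the reloc control, one async work
 * item per tree. num_pending starts at 1 so the completion can only
 * fire after the queueing loop has finished.
 */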
1802static int merge_reloc_roots(struct reloc_control *rc)
1803{
1804 struct async_merge *async;
1805 struct btrfs_root *root;
1806 struct completion done;
1807 atomic_t num_pending;
1808
1809 init_completion(&done);
1810 atomic_set(&num_pending, 1);
1811
1812 while (!list_empty(&rc->reloc_roots)) {
1813 root = list_entry(rc->reloc_roots.next,
1814 struct btrfs_root, root_list);
1815 list_del_init(&root->root_list);
1816
1817 async = kmalloc(sizeof(*async), GFP_NOFS);
1818 BUG_ON(!async);
1819 async->work.func = merge_func;
1820 async->work.flags = 0;
1821 async->rc = rc;
1822 async->root = root;
1823 async->done = &done;
1824 async->num_pending = &num_pending;
1825 atomic_inc(&num_pending);
1826 btrfs_queue_worker(&rc->workers, &async->work);
1827 }
1828
1829 if (!atomic_dec_and_test(&num_pending))
1830 wait_for_completion(&done);
1831
1832 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1833 return 0;
1834}
1835
1836static void free_block_list(struct rb_root *blocks)
1837{
1838 struct tree_block *block;
1839 struct rb_node *rb_node;
1840 while ((rb_node = rb_first(blocks))) {
1841 block = rb_entry(rb_node, struct tree_block, rb_node);
1842 rb_erase(rb_node, blocks);
1843 kfree(block);
1844 }
1845}
1846
1847static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1848 struct btrfs_root *reloc_root)
1849{
1850 struct btrfs_root *root;
1851
1852 if (reloc_root->last_trans == trans->transid)
1853 return 0;
1854
1855 root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1856 BUG_ON(IS_ERR(root));
1857 BUG_ON(root->reloc_root != reloc_root);
1858
1859 return btrfs_record_root_in_trans(trans, root);
1860}
1861
1862/*
1863 * select one tree from the trees that reference the block.
1864 * for blocks in reference counted trees, we prefer the reloc tree.
1865 * if no reloc tree is found and reloc_only is true, NULL is returned.
1866 */
1867static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1868 struct backref_node *node,
1869 struct backref_edge *edges[],
1870 int *nr, int reloc_only)
1871{
1872 struct backref_node *next;
1873 struct btrfs_root *root;
1874 int index;
1875 int loop = 0;
1876again:
1877 index = 0;
1878 next = node;
1879 while (1) {
1880 cond_resched();
1881 next = walk_up_backref(next, edges, &index);
1882 root = next->root;
1883 if (!root) {
1884 BUG_ON(!node->old_root);
1885 goto skip;
1886 }
1887
1888 /* no other choice for a non-reference counted tree */
1889 if (!root->ref_cows) {
1890 BUG_ON(reloc_only);
1891 break;
1892 }
1893
1894 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1895 record_reloc_root_in_trans(trans, root);
1896 break;
1897 }
1898
1899 if (loop) {
1900 btrfs_record_root_in_trans(trans, root);
1901 break;
1902 }
1903
1904 if (reloc_only || next != node) {
1905 if (!root->reloc_root)
1906 btrfs_record_root_in_trans(trans, root);
1907 root = root->reloc_root;
1908 /*
1909 * if the reloc tree was created in the current
1910 * transaction, there is no node in the backref
1911 * tree that corresponds to the root of the reloc tree.
1912 */
1913 if (btrfs_root_last_snapshot(&root->root_item) ==
1914 trans->transid - 1)
1915 break;
1916 }
1917skip:
1918 root = NULL;
1919 next = walk_down_backref(edges, &index);
1920 if (!next || next->level <= node->level)
1921 break;
1922 }
1923
1924 if (!root && !loop && !reloc_only) {
1925 loop = 1;
1926 goto again;
1927 }
1928
1929 if (root)
1930 *nr = index;
1931 else
1932 *nr = 0;
1933
1934 return root;
1935}
1936
1937static noinline_for_stack
1938struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1939 struct backref_node *node)
1940{
1941 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1942 int nr;
1943 return __select_one_root(trans, node, edges, &nr, 0);
1944}
1945
1946static noinline_for_stack
1947struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1948 struct backref_node *node,
1949 struct backref_edge *edges[], int *nr)
1950{
1951 return __select_one_root(trans, node, edges, nr, 1);
1952}
1953
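/*
 * hand the buffers and locks held in 'path' over to the backref
 * nodes along the chain of 'nr' edges starting at 'node'.
 */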
1954static void grab_path_buffers(struct btrfs_path *path,
1955 struct backref_node *node,
1956 struct backref_edge *edges[], int nr)
1957{
1958 int i = 0;
1959 while (1) {
1960 drop_node_buffer(node);
1961 node->eb = path->nodes[node->level];
1962 BUG_ON(!node->eb);
1963 if (path->locks[node->level])
1964 node->locked = 1;
1965 path->nodes[node->level] = NULL;
1966 path->locks[node->level] = 0;
1967
1968 if (i >= nr)
1969 break;
1970
1971 edges[i]->blockptr = node->eb->start;
1972 node = edges[i]->node[UPPER];
1973 i++;
1974 }
1975}
1976
1977/*
1978 * relocate a tree block, and then update pointers in upper level
1979 * blocks that reference the block to point to the new location.
1980 *
1981 * if called by link_to_upper, the block has already been relocated.
1982 * in that case this function just updates pointers.
1983 */
1984static int do_relocation(struct btrfs_trans_handle *trans,
1985 struct backref_node *node,
1986 struct btrfs_key *key,
1987 struct btrfs_path *path, int lowest)
1988{
1989 struct backref_node *upper;
1990 struct backref_edge *edge;
1991 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1992 struct btrfs_root *root;
1993 struct extent_buffer *eb;
1994 u32 blocksize;
1995 u64 bytenr;
1996 u64 generation;
1997 int nr;
1998 int slot;
1999 int ret;
2000 int err = 0;
2001
2002 BUG_ON(lowest && node->eb);
2003
2004 path->lowest_level = node->level + 1;
2005 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2006 cond_resched();
2007 if (node->eb && node->eb->start == edge->blockptr)
2008 continue;
2009
2010 upper = edge->node[UPPER];
2011 root = select_reloc_root(trans, upper, edges, &nr);
2012 if (!root)
2013 continue;
2014
2015 if (upper->eb && !upper->locked)
2016 drop_node_buffer(upper);
2017
2018 if (!upper->eb) {
2019 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2020 if (ret < 0) {
2021 err = ret;
2022 break;
2023 }
2024 BUG_ON(ret > 0);
2025
2026 slot = path->slots[upper->level];
2027
2028 btrfs_unlock_up_safe(path, upper->level + 1);
2029 grab_path_buffers(path, upper, edges, nr);
2030
2031 btrfs_release_path(NULL, path);
2032 } else {
2033 ret = btrfs_bin_search(upper->eb, key, upper->level,
2034 &slot);
2035 BUG_ON(ret);
2036 }
2037
2038 bytenr = btrfs_node_blockptr(upper->eb, slot);
2039 if (!lowest) {
2040 if (node->eb->start == bytenr) {
2041 btrfs_tree_unlock(upper->eb);
2042 upper->locked = 0;
2043 continue;
2044 }
2045 } else {
2046 BUG_ON(node->bytenr != bytenr);
2047 }
2048
2049 blocksize = btrfs_level_size(root, node->level);
2050 generation = btrfs_node_ptr_generation(upper->eb, slot);
2051 eb = read_tree_block(root, bytenr, blocksize, generation);
2052 btrfs_tree_lock(eb);
2053 btrfs_set_lock_blocking(eb);
2054
2055 if (!node->eb) {
2056 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2057 slot, &eb);
2058 if (ret < 0) {
2059 err = ret;
2060 break;
2061 }
2062 btrfs_set_lock_blocking(eb);
2063 node->eb = eb;
2064 node->locked = 1;
2065 } else {
2066 btrfs_set_node_blockptr(upper->eb, slot,
2067 node->eb->start);
2068 btrfs_set_node_ptr_generation(upper->eb, slot,
2069 trans->transid);
2070 btrfs_mark_buffer_dirty(upper->eb);
2071
2072 ret = btrfs_inc_extent_ref(trans, root,
2073 node->eb->start, blocksize,
2074 upper->eb->start,
2075 btrfs_header_owner(upper->eb),
2076 node->level, 0);
2077 BUG_ON(ret);
2078
2079 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2080 BUG_ON(ret);
2081 }
2082 if (!lowest) {
2083 btrfs_tree_unlock(upper->eb);
2084 upper->locked = 0;
2085 }
2086 }
2087 path->lowest_level = 0;
2088 return err;
2089}
2090
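/*
 * relink a block that has already been relocated: node->eb points at
 * the new copy, so do_relocation only needs to update the pointers
 * in the upper level blocks.
 */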
2091static int link_to_upper(struct btrfs_trans_handle *trans,
2092 struct backref_node *node,
2093 struct btrfs_path *path)
2094{
2095 struct btrfs_key key;
2096 if (!node->eb || list_empty(&node->upper))
2097 return 0;
2098
2099 btrfs_node_key_to_cpu(node->eb, &key, 0);
2100 return do_relocation(trans, node, &key, path, 0);
2101}
2102
2103static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2104 struct backref_cache *cache,
2105 struct btrfs_path *path)
2106{
2107 struct backref_node *node;
2108 int level;
2109 int ret;
2110 int err = 0;
2111
2112 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2113 while (!list_empty(&cache->pending[level])) {
2114 node = list_entry(cache->pending[level].next,
2115 struct backref_node, lower);
2116 BUG_ON(node->level != level);
2117
2118 ret = link_to_upper(trans, node, path);
2119 if (ret < 0)
2120 err = ret;
2121 /*
2122 * this removes the node from the pending list and
2123 * may add some other nodes to the level + 1
2124 * pending list
2125 */
2126 remove_backref_node(cache, node);
2127 }
2128 }
2129 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2130 return err;
2131}
2132
2133static void mark_block_processed(struct reloc_control *rc,
2134 struct backref_node *node)
2135{
2136 u32 blocksize;
2137 if (node->level == 0 ||
2138 in_block_group(node->bytenr, rc->block_group)) {
2139 blocksize = btrfs_level_size(rc->extent_root, node->level);
2140 set_extent_bits(&rc->processed_blocks, node->bytenr,
2141 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2142 GFP_NOFS);
2143 }
2144 node->processed = 1;
2145}
2146
2147/*
2148 * mark a block and all blocks that directly/indirectly reference it
2149 * as processed.
2150 */
2151static void update_processed_blocks(struct reloc_control *rc,
2152 struct backref_node *node)
2153{
2154 struct backref_node *next = node;
2155 struct backref_edge *edge;
2156 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2157 int index = 0;
2158
2159 while (next) {
2160 cond_resched();
2161 while (1) {
2162 if (next->processed)
2163 break;
2164
2165 mark_block_processed(rc, next);
2166
2167 if (list_empty(&next->upper))
2168 break;
2169
2170 edge = list_entry(next->upper.next,
2171 struct backref_edge, list[LOWER]);
2172 edges[index++] = edge;
2173 next = edge->node[UPPER];
2174 }
2175 next = walk_down_backref(edges, &index);
2176 }
2177}
2178
2179static int tree_block_processed(u64 bytenr, u32 blocksize,
2180 struct reloc_control *rc)
2181{
2182 if (test_range_bit(&rc->processed_blocks, bytenr,
2183 bytenr + blocksize - 1, EXTENT_DIRTY, 1))
2184 return 1;
2185 return 0;
2186}
2187
2188/*
2189 * check if any file extent pointers in the leaf point to
2190 * data that requires processing
2191 */
2192static int check_file_extents(struct reloc_control *rc,
2193 u64 bytenr, u32 blocksize, u64 ptr_gen)
2194{
2195 struct btrfs_key found_key;
2196 struct btrfs_file_extent_item *fi;
2197 struct extent_buffer *leaf;
2198 u32 nritems;
2199 int i;
2200 int ret = 0;
2201
2202 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2203
2204 nritems = btrfs_header_nritems(leaf);
2205 for (i = 0; i < nritems; i++) {
2206 cond_resched();
2207 btrfs_item_key_to_cpu(leaf, &found_key, i);
2208 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2209 continue;
2210 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2211 if (btrfs_file_extent_type(leaf, fi) ==
2212 BTRFS_FILE_EXTENT_INLINE)
2213 continue;
2214 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2215 if (bytenr == 0)
2216 continue;
2217 if (in_block_group(bytenr, rc->block_group)) {
2218 ret = 1;
2219 break;
2220 }
2221 }
2222 free_extent_buffer(leaf);
2223 return ret;
2224}
2225
2226/*
2227 * scan child blocks of a given block to find blocks that require processing
2228 */
2229static int add_child_blocks(struct btrfs_trans_handle *trans,
2230 struct reloc_control *rc,
2231 struct backref_node *node,
2232 struct rb_root *blocks)
2233{
2234 struct tree_block *block;
2235 struct rb_node *rb_node;
2236 u64 bytenr;
2237 u64 ptr_gen;
2238 u32 blocksize;
2239 u32 nritems;
2240 int i;
2241 int err = 0;
2242
2243 nritems = btrfs_header_nritems(node->eb);
2244 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2245 for (i = 0; i < nritems; i++) {
2246 cond_resched();
2247 bytenr = btrfs_node_blockptr(node->eb, i);
2248 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2249 if (ptr_gen == trans->transid)
2250 continue;
2251 if (!in_block_group(bytenr, rc->block_group) &&
2252 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2253 continue;
2254 if (tree_block_processed(bytenr, blocksize, rc))
2255 continue;
2256
2257 readahead_tree_block(rc->extent_root,
2258 bytenr, blocksize, ptr_gen);
2259 }
2260
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272 if (!in_block_group(bytenr, rc->block_group) &&
2273 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2274 continue;
2275
2276 block = kmalloc(sizeof(*block), GFP_NOFS);
2277 if (!block) {
2278 err = -ENOMEM;
2279 break;
2280 }
2281 block->bytenr = bytenr;
2282 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2283 block->level = node->level - 1;
2284 block->key_ready = 1;
2285 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2286 BUG_ON(rb_node);
2287 }
2288 if (err)
2289 free_block_list(blocks);
2290 return err;
2291}
2292
2293/*
2294 * find adjacent blocks that require processing
2295 */
2296static noinline_for_stack
2297int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2298 struct reloc_control *rc,
2299 struct backref_cache *cache,
2300 struct rb_root *blocks, int level,
2301 struct backref_node **upper)
2302{
2303 struct backref_node *node;
2304 int ret = 0;
2305
2306 WARN_ON(!list_empty(&cache->pending[level]));
2307
2308 if (list_empty(&cache->pending[level + 1]))
2309 return 1;
2310
2311 node = list_entry(cache->pending[level + 1].next,
2312 struct backref_node, lower);
2313 if (node->eb)
2314 ret = add_child_blocks(trans, rc, node, blocks);
2315
2316 *upper = node;
2317 return ret;
2318}
2319
2320static int get_tree_block_key(struct reloc_control *rc,
2321 struct tree_block *block)
2322{
2323 struct extent_buffer *eb;
2324
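	/*
	 * while key_ready is 0, block->key.objectid holds the blocksize
	 * and block->key.offset holds the generation (see add_tree_block),
	 * so they double as the read_tree_block() arguments below; read
	 * the block to recover its real first key
	 */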
2325 BUG_ON(block->key_ready);
2326 eb = read_tree_block(rc->extent_root, block->bytenr,
2327 block->key.objectid, block->key.offset);
2328 WARN_ON(btrfs_header_level(eb) != block->level);
2329 if (block->level == 0)
2330 btrfs_item_key_to_cpu(eb, &block->key, 0);
2331 else
2332 btrfs_node_key_to_cpu(eb, &block->key, 0);
2333 free_extent_buffer(eb);
2334 block->key_ready = 1;
2335 return 0;
2336}
2337
2338static int reada_tree_block(struct reloc_control *rc,
2339 struct tree_block *block)
2340{
2341 BUG_ON(block->key_ready);
2342 readahead_tree_block(rc->extent_root, block->bytenr,
2343 block->key.objectid, block->key.offset);
2344 return 0;
2345}
2346
2347/*
2348 * helper function to relocate a tree block
2349 */
2350static int relocate_tree_block(struct btrfs_trans_handle *trans,
2351 struct reloc_control *rc,
2352 struct backref_node *node,
2353 struct btrfs_key *key,
2354 struct btrfs_path *path)
2355{
2356 struct btrfs_root *root;
2357 int ret;
2358
2359 root = select_one_root(trans, node);
2360 if (unlikely(!root)) {
2361 rc->found_old_snapshot = 1;
2362 update_processed_blocks(rc, node);
2363 return 0;
2364 }
2365
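	/*
	 * three cases: the block is owned by a reloc tree and is COWed
	 * to its new location by do_relocation(); the owning tree does
	 * not do reference-counted COW, so a COWing tree search down to
	 * the block's level relocates it in place; otherwise the
	 * backref is stale and only a warning is emitted
	 */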
2366 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2367 ret = do_relocation(trans, node, key, path, 1);
2368 if (ret < 0)
2369 goto out;
2370 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2371 ret = replace_file_extents(trans, rc, root,
2372 node->eb, NULL);
2373 if (ret < 0)
2374 goto out;
2375 }
2376 drop_node_buffer(node);
2377 } else if (!root->ref_cows) {
2378 path->lowest_level = node->level;
2379 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2380 btrfs_release_path(root, path);
2381 if (ret < 0)
2382 goto out;
2383 } else if (root != node->root) {
2384 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2385 }
2386
2387 update_processed_blocks(rc, node);
2388 ret = 0;
2389out:
2390 drop_node_buffer(node);
2391 return ret;
2392}
2393
2394/*
2395 * relocate a list of blocks
2396 */
2397static noinline_for_stack
2398int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2399 struct reloc_control *rc, struct rb_root *blocks)
2400{
2401 struct backref_cache *cache;
2402 struct backref_node *node;
2403 struct btrfs_path *path;
2404 struct tree_block *block;
2405 struct rb_node *rb_node;
2406 int level = -1;
2407 int ret;
2408 int err = 0;
2409
2410 path = btrfs_alloc_path();
2411 if (!path)
2412 return -ENOMEM;
2413
2414 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2415 if (!cache) {
2416 btrfs_free_path(path);
2417 return -ENOMEM;
2418 }
2419
2420 backref_cache_init(cache);
2421
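	/*
	 * three passes over the rb-tree: readahead the blocks whose
	 * keys are not cached yet, read those keys in, then build
	 * backref trees and relocate each block
	 */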
2422 rb_node = rb_first(blocks);
2423 while (rb_node) {
2424 block = rb_entry(rb_node, struct tree_block, rb_node);
2425 if (level == -1)
2426 level = block->level;
2427 else
2428 BUG_ON(level != block->level);
2429 if (!block->key_ready)
2430 reada_tree_block(rc, block);
2431 rb_node = rb_next(rb_node);
2432 }
2433
2434 rb_node = rb_first(blocks);
2435 while (rb_node) {
2436 block = rb_entry(rb_node, struct tree_block, rb_node);
2437 if (!block->key_ready)
2438 get_tree_block_key(rc, block);
2439 rb_node = rb_next(rb_node);
2440 }
2441
2442 rb_node = rb_first(blocks);
2443 while (rb_node) {
2444 block = rb_entry(rb_node, struct tree_block, rb_node);
2445
2446 node = build_backref_tree(rc, cache, &block->key,
2447 block->level, block->bytenr);
2448 if (IS_ERR(node)) {
2449 err = PTR_ERR(node);
2450 goto out;
2451 }
2452
2453 ret = relocate_tree_block(trans, rc, node, &block->key,
2454 path);
2455 if (ret < 0) {
2456 err = ret;
2457 goto out;
2458 }
2459 remove_backref_node(cache, node);
2460 rb_node = rb_next(rb_node);
2461 }
2462
2463 if (level > 0)
2464 goto out;
2465
2466 free_block_list(blocks);
2467
2468 /*
2469 * now backrefs of some upper level tree blocks have been cached,
2470 * try relocating blocks referenced by these upper level blocks.
2471 */
2472 while (1) {
2473 struct backref_node *upper = NULL;
2474 if (trans->transaction->in_commit ||
2475 trans->transaction->delayed_refs.flushing)
2476 break;
2477
2478 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2479 &upper);
2480 if (ret < 0)
2481 err = ret;
2482 if (ret != 0)
2483 break;
2484
2485 rb_node = rb_first(blocks);
2486 while (rb_node) {
2487 block = rb_entry(rb_node, struct tree_block, rb_node);
2488 if (trans->transaction->in_commit ||
2489 trans->transaction->delayed_refs.flushing)
2490 goto out;
2491 BUG_ON(!block->key_ready);
2492 node = build_backref_tree(rc, cache, &block->key,
2493 level, block->bytenr);
2494 if (IS_ERR(node)) {
2495 err = PTR_ERR(node);
2496 goto out;
2497 }
2498
2499 ret = relocate_tree_block(trans, rc, node,
2500 &block->key, path);
2501 if (ret < 0) {
2502 err = ret;
2503 goto out;
2504 }
2505 remove_backref_node(cache, node);
2506 rb_node = rb_next(rb_node);
2507 }
2508 free_block_list(blocks);
2509
2510 if (upper) {
2511 ret = link_to_upper(trans, upper, path);
2512 if (ret < 0) {
2513 err = ret;
2514 break;
2515 }
2516 remove_backref_node(cache, upper);
2517 }
2518 }
2519out:
2520 free_block_list(blocks);
2521
2522 ret = finish_pending_nodes(trans, cache, path);
2523 if (ret < 0)
2524 err = ret;
2525
2526 kfree(cache);
2527 btrfs_free_path(path);
2528 return err;
2529}
2530
2531static noinline_for_stack
2532int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2533{
2534 u64 page_start;
2535 u64 page_end;
2536 unsigned long i;
2537 unsigned long first_index;
2538 unsigned long last_index;
2539 unsigned int total_read = 0;
2540 unsigned int total_dirty = 0;
2541 struct page *page;
2542 struct file_ra_state *ra;
2543 struct btrfs_ordered_extent *ordered;
2544 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2545 int ret = 0;
2546
2547 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2548 if (!ra)
2549 return -ENOMEM;
2550
2551 mutex_lock(&inode->i_mutex);
2552 first_index = start >> PAGE_CACHE_SHIFT;
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554
2555 /* make sure the dirty trick played by the caller works */
2556 while (1) {
2557 ret = invalidate_inode_pages2_range(inode->i_mapping,
2558 first_index, last_index);
2559 if (ret != -EBUSY)
2560 break;
2561 schedule_timeout(HZ/10);
2562 }
2563 if (ret)
2564 goto out_unlock;
2565
2566 file_ra_state_init(ra, inode->i_mapping);
2567
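	/*
	 * read every page in the range, then mark it delalloc and
	 * dirty; writeback will rewrite the data through the pinned
	 * extent mapping installed by relocate_data_extent()
	 */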
2568 for (i = first_index; i <= last_index; i++) {
2569 if (total_read % ra->ra_pages == 0) {
2570 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2571 min(last_index, ra->ra_pages + i - 1));
2572 }
2573 total_read++;
2574again:
2575 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2576 BUG_ON(1);
2577 page = grab_cache_page(inode->i_mapping, i);
2578 if (!page) {
2579 ret = -ENOMEM;
2580 goto out_unlock;
2581 }
2582 if (!PageUptodate(page)) {
2583 btrfs_readpage(NULL, page);
2584 lock_page(page);
2585 if (!PageUptodate(page)) {
2586 unlock_page(page);
2587 page_cache_release(page);
2588 ret = -EIO;
2589 goto out_unlock;
2590 }
2591 }
2592 wait_on_page_writeback(page);
2593
2594 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2595 page_end = page_start + PAGE_CACHE_SIZE - 1;
2596 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2597
2598 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2599 if (ordered) {
2600 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2601 unlock_page(page);
2602 page_cache_release(page);
2603 btrfs_start_ordered_extent(inode, ordered, 1);
2604 btrfs_put_ordered_extent(ordered);
2605 goto again;
2606 }
2607 set_page_extent_mapped(page);
2608
2609 if (i == first_index)
2610 set_extent_bits(io_tree, page_start, page_end,
2611 EXTENT_BOUNDARY, GFP_NOFS);
2612 btrfs_set_extent_delalloc(inode, page_start, page_end);
2613
2614 set_page_dirty(page);
2615 total_dirty++;
2616
2617 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2618 unlock_page(page);
2619 page_cache_release(page);
2620 }
2621out_unlock:
2622 mutex_unlock(&inode->i_mutex);
2623 kfree(ra);
2624 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
2625 return ret;
2626}
2627
2628static noinline_for_stack
2629int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
2630{
2631 struct btrfs_root *root = BTRFS_I(inode)->root;
2632 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2633 struct extent_map *em;
2634 u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
2635 u64 end = start + extent_key->offset - 1;
2636
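	/*
	 * index_cnt holds the block group start, so inside the reloc
	 * inode: file_offset = disk_bytenr - block_group_start (see
	 * create_reloc_inode() and btrfs_reloc_clone_csums())
	 */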
2637 em = alloc_extent_map(GFP_NOFS);
2638 em->start = start;
2639 em->len = extent_key->offset;
2640 em->block_len = extent_key->offset;
2641 em->block_start = extent_key->objectid;
2642 em->bdev = root->fs_info->fs_devices->latest_bdev;
2643 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2644
2645 /* set up an extent map to cheat btrfs_readpage */
2646 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2647 while (1) {
2648 int ret;
2649 spin_lock(&em_tree->lock);
2650 ret = add_extent_mapping(em_tree, em);
2651 spin_unlock(&em_tree->lock);
2652 if (ret != -EEXIST) {
2653 free_extent_map(em);
2654 break;
2655 }
2656 btrfs_drop_extent_cache(inode, start, end, 0);
2657 }
2658 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2659
2660 return relocate_inode_pages(inode, start, extent_key->offset);
2661}
2662
2663#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2664static int get_ref_objectid_v0(struct reloc_control *rc,
2665 struct btrfs_path *path,
2666 struct btrfs_key *extent_key,
2667 u64 *ref_objectid, int *path_change)
2668{
2669 struct btrfs_key key;
2670 struct extent_buffer *leaf;
2671 struct btrfs_extent_ref_v0 *ref0;
2672 int ret;
2673 int slot;
2674
2675 leaf = path->nodes[0];
2676 slot = path->slots[0];
2677 while (1) {
2678 if (slot >= btrfs_header_nritems(leaf)) {
2679 ret = btrfs_next_leaf(rc->extent_root, path);
2680 if (ret < 0)
2681 return ret;
2682 BUG_ON(ret > 0);
2683 leaf = path->nodes[0];
2684 slot = path->slots[0];
2685 if (path_change)
2686 *path_change = 1;
2687 }
2688 btrfs_item_key_to_cpu(leaf, &key, slot);
2689 if (key.objectid != extent_key->objectid)
2690 return -ENOENT;
2691
2692 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2693 slot++;
2694 continue;
2695 }
2696 ref0 = btrfs_item_ptr(leaf, slot,
2697 struct btrfs_extent_ref_v0);
2698 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2699 break;
2700 }
2701 return 0;
2702}
2703#endif
2704
2705/*
2706 * helper to add a tree block to the list.
2707 * the major work is getting the generation and level of the block
2708 */
2709static int add_tree_block(struct reloc_control *rc,
2710 struct btrfs_key *extent_key,
2711 struct btrfs_path *path,
2712 struct rb_root *blocks)
2713{
2714 struct extent_buffer *eb;
2715 struct btrfs_extent_item *ei;
2716 struct btrfs_tree_block_info *bi;
2717 struct tree_block *block;
2718 struct rb_node *rb_node;
2719 u32 item_size;
2720 int level = -1;
2721 int generation;
2722
2723 eb = path->nodes[0];
2724 item_size = btrfs_item_size_nr(eb, path->slots[0]);
2725
2726 if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2727 ei = btrfs_item_ptr(eb, path->slots[0],
2728 struct btrfs_extent_item);
2729 bi = (struct btrfs_tree_block_info *)(ei + 1);
2730 generation = btrfs_extent_generation(eb, ei);
2731 level = btrfs_tree_block_level(eb, bi);
2732 } else {
2733#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2734 u64 ref_owner;
2735 int ret;
2736
2737 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2738 ret = get_ref_objectid_v0(rc, path, extent_key,
2739 &ref_owner, NULL);
2740 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2741 level = (int)ref_owner;
2742 /* FIXME: get real generation */
2743 generation = 0;
2744#else
2745 BUG();
2746#endif
2747 }
2748
2749 btrfs_release_path(rc->extent_root, path);
2750
2751 BUG_ON(level == -1);
2752
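	/*
	 * the real block key is unknown here; stash the blocksize in
	 * key.objectid and the generation in key.offset until
	 * get_tree_block_key() reads the block
	 */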
2753 block = kmalloc(sizeof(*block), GFP_NOFS);
2754 if (!block)
2755 return -ENOMEM;
2756
2757 block->bytenr = extent_key->objectid;
2758 block->key.objectid = extent_key->offset;
2759 block->key.offset = generation;
2760 block->level = level;
2761 block->key_ready = 0;
2762
2763 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2764 BUG_ON(rb_node);
2765
2766 return 0;
2767}
2768
2769/*
2770 * helper to add tree blocks for backrefs of type BTRFS_SHARED_DATA_REF_KEY
2771 */
2772static int __add_tree_block(struct reloc_control *rc,
2773 u64 bytenr, u32 blocksize,
2774 struct rb_root *blocks)
2775{
2776 struct btrfs_path *path;
2777 struct btrfs_key key;
2778 int ret;
2779
2780 if (tree_block_processed(bytenr, blocksize, rc))
2781 return 0;
2782
2783 if (tree_search(blocks, bytenr))
2784 return 0;
2785
2786 path = btrfs_alloc_path();
2787 if (!path)
2788 return -ENOMEM;
2789
2790 key.objectid = bytenr;
2791 key.type = BTRFS_EXTENT_ITEM_KEY;
2792 key.offset = blocksize;
2793
2794 path->search_commit_root = 1;
2795 path->skip_locking = 1;
2796 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2797 if (ret < 0)
2798 goto out;
2799 BUG_ON(ret);
2800
2801 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2802 ret = add_tree_block(rc, &key, path, blocks);
2803out:
2804 btrfs_free_path(path);
2805 return ret;
2806}
2807
2808/*
2809 * helper to check if the block uses full backrefs for the pointers in it
2810 */
2811static int block_use_full_backref(struct reloc_control *rc,
2812 struct extent_buffer *eb)
2813{
2814 struct btrfs_path *path;
2815 struct btrfs_extent_item *ei;
2816 struct btrfs_key key;
2817 u64 flags;
2818 int ret;
2819
2820 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2821 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2822 return 1;
2823
2824 path = btrfs_alloc_path();
2825 BUG_ON(!path);
2826
2827 key.objectid = eb->start;
2828 key.type = BTRFS_EXTENT_ITEM_KEY;
2829 key.offset = eb->len;
2830
2831 path->search_commit_root = 1;
2832 path->skip_locking = 1;
2833 ret = btrfs_search_slot(NULL, rc->extent_root,
2834 &key, path, 0, 0);
2835 BUG_ON(ret);
2836
2837 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2838 struct btrfs_extent_item);
2839 flags = btrfs_extent_flags(path->nodes[0], ei);
2840 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2841 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2842 ret = 1;
2843 else
2844 ret = 0;
2845 btrfs_free_path(path);
2846 return ret;
2847}
2848
2849/*
2850 * helper to add tree blocks for backrefs of type BTRFS_EXTENT_DATA_REF_KEY.
2851 * this function scans the fs tree to find blocks that reference the data extent
2852 */
2853static int find_data_references(struct reloc_control *rc,
2854 struct btrfs_key *extent_key,
2855 struct extent_buffer *leaf,
2856 struct btrfs_extent_data_ref *ref,
2857 struct rb_root *blocks)
2858{
2859 struct btrfs_path *path;
2860 struct tree_block *block;
2861 struct btrfs_root *root;
2862 struct btrfs_file_extent_item *fi;
2863 struct rb_node *rb_node;
2864 struct btrfs_key key;
2865 u64 ref_root;
2866 u64 ref_objectid;
2867 u64 ref_offset;
2868 u32 ref_count;
2869 u32 nritems;
2870 int err = 0;
2871 int added = 0;
2872 int counted;
2873 int ret;
2874
2875 path = btrfs_alloc_path();
2876 if (!path)
2877 return -ENOMEM;
2878
2879 ref_root = btrfs_extent_data_ref_root(leaf, ref);
2880 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2881 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2882 ref_count = btrfs_extent_data_ref_count(leaf, ref);
2883
2884 root = read_fs_root(rc->extent_root->fs_info, ref_root);
2885 if (IS_ERR(root)) {
2886 err = PTR_ERR(root);
2887 goto out;
2888 }
2889
2890 key.objectid = ref_objectid;
2891 key.offset = ref_offset;
2892 key.type = BTRFS_EXTENT_DATA_KEY;
2893
2894 path->search_commit_root = 1;
2895 path->skip_locking = 1;
2896 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2897 if (ret < 0) {
2898 err = ret;
2899 goto out;
2900 }
2901
2902 leaf = path->nodes[0];
2903 nritems = btrfs_header_nritems(leaf);
2904 /*
2905 * the references in tree blocks that use full backrefs
2906 * are not counted here
2907 */
2908 if (block_use_full_backref(rc, leaf))
2909 counted = 0;
2910 else
2911 counted = 1;
2912 rb_node = tree_search(blocks, leaf->start);
2913 if (rb_node) {
2914 if (counted)
2915 added = 1;
2916 else
2917 path->slots[0] = nritems;
2918 }
2919
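	/*
	 * scan the owner inode's file extents, decrementing ref_count
	 * for each reference to the extent, and queue each leaf that
	 * holds such a reference once.  leaves that use full backrefs
	 * are reached through SHARED_DATA_REF backrefs instead, so
	 * their references are not counted here
	 */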
2920 while (ref_count > 0) {
2921 while (path->slots[0] >= nritems) {
2922 ret = btrfs_next_leaf(root, path);
2923 if (ret < 0) {
2924 err = ret;
2925 goto out;
2926 }
2927 if (ret > 0) {
2928 WARN_ON(1);
2929 goto out;
2930 }
2931
2932 leaf = path->nodes[0];
2933 nritems = btrfs_header_nritems(leaf);
2934 added = 0;
2935
2936 if (block_use_full_backref(rc, leaf))
2937 counted = 0;
2938 else
2939 counted = 1;
2940 rb_node = tree_search(blocks, leaf->start);
2941 if (rb_node) {
2942 if (counted)
2943 added = 1;
2944 else
2945 path->slots[0] = nritems;
2946 }
2947 }
2948
2949 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2950 if (key.objectid != ref_objectid ||
2951 key.type != BTRFS_EXTENT_DATA_KEY) {
2952 WARN_ON(1);
2953 break;
2954 }
2955
2956 fi = btrfs_item_ptr(leaf, path->slots[0],
2957 struct btrfs_file_extent_item);
2958
2959 if (btrfs_file_extent_type(leaf, fi) ==
2960 BTRFS_FILE_EXTENT_INLINE)
2961 goto next;
2962
2963 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
2964 extent_key->objectid)
2965 goto next;
2966
2967 key.offset -= btrfs_file_extent_offset(leaf, fi);
2968 if (key.offset != ref_offset)
2969 goto next;
2970
2971 if (counted)
2972 ref_count--;
2973 if (added)
2974 goto next;
2975
2976 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
2977 block = kmalloc(sizeof(*block), GFP_NOFS);
2978 if (!block) {
2979 err = -ENOMEM;
2980 break;
2981 }
2982 block->bytenr = leaf->start;
2983 btrfs_item_key_to_cpu(leaf, &block->key, 0);
2984 block->level = 0;
2985 block->key_ready = 1;
2986 rb_node = tree_insert(blocks, block->bytenr,
2987 &block->rb_node);
2988 BUG_ON(rb_node);
2989 }
2990 if (counted)
2991 added = 1;
2992 else
2993 path->slots[0] = nritems;
2994next:
2995 path->slots[0]++;
2996
2997 }
2998out:
2999 btrfs_free_path(path);
3000 return err;
3001}
3002
3003/*
3004 * helper to find all tree blocks that reference a given data extent
3005 */
3006static noinline_for_stack
3007int add_data_references(struct reloc_control *rc,
3008 struct btrfs_key *extent_key,
3009 struct btrfs_path *path,
3010 struct rb_root *blocks)
3011{
3012 struct btrfs_key key;
3013 struct extent_buffer *eb;
3014 struct btrfs_extent_data_ref *dref;
3015 struct btrfs_extent_inline_ref *iref;
3016 unsigned long ptr;
3017 unsigned long end;
3018 u32 blocksize;
3019 int ret;
3020 int err = 0;
3021
3022 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3023 extent_key->offset);
3024 BUG_ON(ret < 0);
3025 if (ret > 0) {
3026 /* the relocated data is fragmented */
3027 rc->extents_skipped++;
3028 btrfs_release_path(rc->extent_root, path);
3029 return 0;
3030 }
3031
3032 blocksize = btrfs_level_size(rc->extent_root, 0);
3033
3034 eb = path->nodes[0];
3035 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3036 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3037#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3038 if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3039 ptr = end;
3040 else
3041#endif
3042 ptr += sizeof(struct btrfs_extent_item);
3043
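	/* walk the inline backrefs first, then the keyed backref items */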
3044 while (ptr < end) {
3045 iref = (struct btrfs_extent_inline_ref *)ptr;
3046 key.type = btrfs_extent_inline_ref_type(eb, iref);
3047 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3048 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3049 ret = __add_tree_block(rc, key.offset, blocksize,
3050 blocks);
3051 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3052 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3053 ret = find_data_references(rc, extent_key,
3054 eb, dref, blocks);
3055 } else {
3056 BUG();
3057 }
3058 ptr += btrfs_extent_inline_ref_size(key.type);
3059 }
3060 WARN_ON(ptr > end);
3061
3062 while (1) {
3063 cond_resched();
3064 eb = path->nodes[0];
3065 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3066 ret = btrfs_next_leaf(rc->extent_root, path);
3067 if (ret < 0) {
3068 err = ret;
3069 break;
3070 }
3071 if (ret > 0)
3072 break;
3073 eb = path->nodes[0];
3074 }
3075
3076 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3077 if (key.objectid != extent_key->objectid)
3078 break;
3079
3080#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3081 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3082 key.type == BTRFS_EXTENT_REF_V0_KEY) {
3083#else
3084 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3085 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3086#endif
3087 ret = __add_tree_block(rc, key.offset, blocksize,
3088 blocks);
3089 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3090 dref = btrfs_item_ptr(eb, path->slots[0],
3091 struct btrfs_extent_data_ref);
3092 ret = find_data_references(rc, extent_key,
3093 eb, dref, blocks);
3094 } else {
3095 ret = 0;
3096 }
3097 if (ret) {
3098 err = ret;
3099 break;
3100 }
3101 path->slots[0]++;
3102 }
3103 btrfs_release_path(rc->extent_root, path);
3104 if (err)
3105 free_block_list(blocks);
3106 return err;
3107}
3108
3109/*
3110 * helper to find the next unprocessed extent
3111 */
3112static noinline_for_stack
3113int find_next_extent(struct btrfs_trans_handle *trans,
3114 struct reloc_control *rc, struct btrfs_path *path)
3115{
3116 struct btrfs_key key;
3117 struct extent_buffer *leaf;
3118 u64 start, end, last;
3119 int ret;
3120
3121 last = rc->block_group->key.objectid + rc->block_group->key.offset;
3122 while (1) {
3123 cond_resched();
3124 if (rc->search_start >= last) {
3125 ret = 1;
3126 break;
3127 }
3128
3129 key.objectid = rc->search_start;
3130 key.type = BTRFS_EXTENT_ITEM_KEY;
3131 key.offset = 0;
3132
3133 path->search_commit_root = 1;
3134 path->skip_locking = 1;
3135 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3136 0, 0);
3137 if (ret < 0)
3138 break;
3139next:
3140 leaf = path->nodes[0];
3141 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3142 ret = btrfs_next_leaf(rc->extent_root, path);
3143 if (ret != 0)
3144 break;
3145 leaf = path->nodes[0];
3146 }
3147
3148 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3149 if (key.objectid >= last) {
3150 ret = 1;
3151 break;
3152 }
3153
3154 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3155 key.objectid + key.offset <= rc->search_start) {
3156 path->slots[0]++;
3157 goto next;
3158 }
3159
3160 ret = find_first_extent_bit(&rc->processed_blocks,
3161 key.objectid, &start, &end,
3162 EXTENT_DIRTY);
3163
3164 if (ret == 0 && start <= key.objectid) {
3165 btrfs_release_path(rc->extent_root, path);
3166 rc->search_start = end + 1;
3167 } else {
3168 rc->search_start = key.objectid + key.offset;
3169 return 0;
3170 }
3171 }
3172 btrfs_release_path(rc->extent_root, path);
3173 return ret;
3174}
3175
3176static void set_reloc_control(struct reloc_control *rc)
3177{
3178 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3179 mutex_lock(&fs_info->trans_mutex);
3180 fs_info->reloc_ctl = rc;
3181 mutex_unlock(&fs_info->trans_mutex);
3182}
3183
3184static void unset_reloc_control(struct reloc_control *rc)
3185{
3186 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3187 mutex_lock(&fs_info->trans_mutex);
3188 fs_info->reloc_ctl = NULL;
3189 mutex_unlock(&fs_info->trans_mutex);
3190}
3191
3192static int check_extent_flags(u64 flags)
3193{
3194 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3195 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3196 return 1;
3197 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3198 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3199 return 1;
3200 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3201 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3202 return 1;
3203 return 0;
3204}
3205
3206static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3207{
3208 struct rb_root blocks = RB_ROOT;
3209 struct btrfs_key key;
3210 struct btrfs_trans_handle *trans = NULL;
3211 struct btrfs_path *path;
3212 struct btrfs_extent_item *ei;
3213 unsigned long nr;
3214 u64 flags;
3215 u32 item_size;
3216 int ret;
3217 int err = 0;
3218
3219 path = btrfs_alloc_path();
3220 if (!path)
3221 return -ENOMEM;
3222
3223 rc->search_start = rc->block_group->key.objectid;
3224 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3225 GFP_NOFS);
3226
3227 rc->create_reloc_root = 1;
3228 set_reloc_control(rc);
3229
3230 trans = btrfs_start_transaction(rc->extent_root, 1);
3231 btrfs_commit_transaction(trans, rc->extent_root);
3232
3233 while (1) {
3234 trans = btrfs_start_transaction(rc->extent_root, 1);
3235
3236 ret = find_next_extent(trans, rc, path);
3237 if (ret < 0)
3238 err = ret;
3239 if (ret != 0)
3240 break;
3241
3242 rc->extents_found++;
3243
3244 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3245 struct btrfs_extent_item);
3246 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3247 item_size = btrfs_item_size_nr(path->nodes[0],
3248 path->slots[0]);
3249 if (item_size >= sizeof(*ei)) {
3250 flags = btrfs_extent_flags(path->nodes[0], ei);
3251 ret = check_extent_flags(flags);
3252 BUG_ON(ret);
3253
3254 } else {
3255#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3256 u64 ref_owner;
3257 int path_change = 0;
3258
3259 BUG_ON(item_size !=
3260 sizeof(struct btrfs_extent_item_v0));
3261 ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3262 &path_change);
3263 if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3264 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3265 else
3266 flags = BTRFS_EXTENT_FLAG_DATA;
3267
3268 if (path_change) {
3269 btrfs_release_path(rc->extent_root, path);
3270
3271 path->search_commit_root = 1;
3272 path->skip_locking = 1;
3273 ret = btrfs_search_slot(NULL, rc->extent_root,
3274 &key, path, 0, 0);
3275 if (ret < 0) {
3276 err = ret;
3277 break;
3278 }
3279 BUG_ON(ret > 0);
3280 }
3281#else
3282 BUG();
3283#endif
3284 }
3285
3286 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3287 ret = add_tree_block(rc, &key, path, &blocks);
3288 } else if (rc->stage == UPDATE_DATA_PTRS &&
3289 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3290 ret = add_data_references(rc, &key, path, &blocks);
3291 } else {
3292 btrfs_release_path(rc->extent_root, path);
3293 ret = 0;
3294 }
3295 if (ret < 0) {
3296 err = ret;
3297 break;
3298 }
3299
3300 if (!RB_EMPTY_ROOT(&blocks)) {
3301 ret = relocate_tree_blocks(trans, rc, &blocks);
3302 if (ret < 0) {
3303 err = ret;
3304 break;
3305 }
3306 }
3307
3308 nr = trans->blocks_used;
3309 btrfs_end_transaction_throttle(trans, rc->extent_root);
3310 trans = NULL;
3311 btrfs_btree_balance_dirty(rc->extent_root, nr);
3312
3313 if (rc->stage == MOVE_DATA_EXTENTS &&
3314 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3315 rc->found_file_extent = 1;
3316 ret = relocate_data_extent(rc->data_inode, &key);
3317 if (ret < 0) {
3318 err = ret;
3319 break;
3320 }
3321 }
3322 }
3323 btrfs_free_path(path);
3324
3325 if (trans) {
3326 nr = trans->blocks_used;
3327 btrfs_end_transaction(trans, rc->extent_root);
3328 btrfs_btree_balance_dirty(rc->extent_root, nr);
3329 }
3330
3331 rc->create_reloc_root = 0;
3332 smp_mb();
3333
3334 if (rc->extents_found > 0) {
3335 trans = btrfs_start_transaction(rc->extent_root, 1);
3336 btrfs_commit_transaction(trans, rc->extent_root);
3337 }
3338
3339 merge_reloc_roots(rc);
3340
3341 unset_reloc_control(rc);
3342
3343 /* get rid of pinned extents */
3344 trans = btrfs_start_transaction(rc->extent_root, 1);
3345 btrfs_commit_transaction(trans, rc->extent_root);
3346
3347 return err;
3348}
3349
3350static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3351 struct btrfs_root *root,
3352 u64 objectid, u64 size)
3353{
3354 struct btrfs_path *path;
3355 struct btrfs_inode_item *item;
3356 struct extent_buffer *leaf;
3357 int ret;
3358
3359 path = btrfs_alloc_path();
3360 if (!path)
3361 return -ENOMEM;
3362
3363 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3364 if (ret)
3365 goto out;
3366
3367 leaf = path->nodes[0];
3368 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3369 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3370 btrfs_set_inode_generation(leaf, item, 1);
3371 btrfs_set_inode_size(leaf, item, size);
3372 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3373 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3374 btrfs_mark_buffer_dirty(leaf);
3375 btrfs_release_path(root, path);
3376out:
3377 btrfs_free_path(path);
3378 return ret;
3379}
3380
3381/*
3382 * helper to create an inode for data relocation.
3383 * the inode lives in the data relocation tree and its link count is 0
3384 */
3385static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3386 struct btrfs_block_group_cache *group)
3387{
3388 struct inode *inode = NULL;
3389 struct btrfs_trans_handle *trans;
3390 struct btrfs_root *root;
3391 struct btrfs_key key;
3392 unsigned long nr;
3393 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3394 int err = 0;
3395
3396 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3397 if (IS_ERR(root))
3398 return ERR_CAST(root);
3399
3400 trans = btrfs_start_transaction(root, 1);
3401 BUG_ON(!trans);
3402
3403 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3404 if (err)
3405 goto out;
3406
3407 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
3408 BUG_ON(err);
3409
3410 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
3411 group->key.offset, 0, group->key.offset,
3412 0, 0, 0);
3413 BUG_ON(err);
3414
3415 key.objectid = objectid;
3416 key.type = BTRFS_INODE_ITEM_KEY;
3417 key.offset = 0;
3418 inode = btrfs_iget(root->fs_info->sb, &key, root);
3419 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3420 BTRFS_I(inode)->index_cnt = group->key.objectid;
3421
3422 err = btrfs_orphan_add(trans, inode);
3423out:
3424 nr = trans->blocks_used;
3425 btrfs_end_transaction(trans, root);
3426
3427 btrfs_btree_balance_dirty(root, nr);
3428 if (err) {
3429 if (inode)
3430 iput(inode);
3431 inode = ERR_PTR(err);
3432 }
3433 return inode;
3434}
3435
3436/*
3437 * function to relocate all extents in a block group.
3438 */
3439int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3440{
3441 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3442 struct reloc_control *rc;
3443 int ret;
3444 int err = 0;
3445
3446 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3447 if (!rc)
3448 return -ENOMEM;
3449
3450 mapping_tree_init(&rc->reloc_root_tree);
3451 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3452 INIT_LIST_HEAD(&rc->reloc_roots);
3453
3454 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3455 BUG_ON(!rc->block_group);
3456
3457 btrfs_init_workers(&rc->workers, "relocate",
3458 fs_info->thread_pool_size);
3459
3460 rc->extent_root = extent_root;
3461 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3462
3463 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3464 if (IS_ERR(rc->data_inode)) {
3465 err = PTR_ERR(rc->data_inode);
3466 rc->data_inode = NULL;
3467 goto out;
3468 }
3469
3470 printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3471 (unsigned long long)rc->block_group->key.objectid,
3472 (unsigned long long)rc->block_group->flags);
3473
3474 btrfs_start_delalloc_inodes(fs_info->tree_root);
3475 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3476
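	/*
	 * relocation alternates between two stages: MOVE_DATA_EXTENTS
	 * copies data into the reloc inode, UPDATE_DATA_PTRS makes
	 * file extent pointers reference the new copies.  the loop
	 * repeats until no unprocessed extents remain in the group
	 */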
3477 while (1) {
3478 mutex_lock(&fs_info->cleaner_mutex);
3479 btrfs_clean_old_snapshots(fs_info->tree_root);
3480 mutex_unlock(&fs_info->cleaner_mutex);
3481
3482 rc->extents_found = 0;
3483 rc->extents_skipped = 0;
3484
3485 ret = relocate_block_group(rc);
3486 if (ret < 0) {
3487 err = ret;
3488 break;
3489 }
3490
3491 if (rc->extents_found == 0)
3492 break;
3493
3494 printk(KERN_INFO "btrfs: found %llu extents\n",
3495 (unsigned long long)rc->extents_found);
3496
3497 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3498 btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3499 invalidate_mapping_pages(rc->data_inode->i_mapping,
3500 0, -1);
3501 rc->stage = UPDATE_DATA_PTRS;
3502 } else if (rc->stage == UPDATE_DATA_PTRS &&
3503 rc->extents_skipped >= rc->extents_found) {
3504 iput(rc->data_inode);
3505 rc->data_inode = create_reloc_inode(fs_info,
3506 rc->block_group);
3507 if (IS_ERR(rc->data_inode)) {
3508 err = PTR_ERR(rc->data_inode);
3509 rc->data_inode = NULL;
3510 break;
3511 }
3512 rc->stage = MOVE_DATA_EXTENTS;
3513 rc->found_file_extent = 0;
3514 }
3515 }
3516
3517 filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
3518 rc->block_group->key.objectid,
3519 rc->block_group->key.objectid +
3520 rc->block_group->key.offset - 1);
3521
3522 WARN_ON(rc->block_group->pinned > 0);
3523 WARN_ON(rc->block_group->reserved > 0);
3524 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3525out:
3526 iput(rc->data_inode);
3527 btrfs_stop_workers(&rc->workers);
3528 btrfs_put_block_group(rc->block_group);
3529 kfree(rc);
3530 return err;
3531}
3532
3533/*
3534 * recover relocation interrupted by a system crash.
3535 *
3536 * this function resumes merging reloc trees with their corresponding fs trees.
3537 * this is important for preserving the sharing of tree blocks
3538 */
3539int btrfs_recover_relocation(struct btrfs_root *root)
3540{
3541 LIST_HEAD(reloc_roots);
3542 struct btrfs_key key;
3543 struct btrfs_root *fs_root;
3544 struct btrfs_root *reloc_root;
3545 struct btrfs_path *path;
3546 struct extent_buffer *leaf;
3547 struct reloc_control *rc = NULL;
3548 struct btrfs_trans_handle *trans;
3549 int ret;
3550 int err = 0;
3551
3552 path = btrfs_alloc_path();
3553 if (!path)
3554 return -ENOMEM;
3555
3556 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3557 key.type = BTRFS_ROOT_ITEM_KEY;
3558 key.offset = (u64)-1;
3559
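	/*
	 * walk the reloc tree root items from the highest key offset
	 * down; each item's offset is the objectid of the fs tree the
	 * reloc tree belongs to
	 */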
3560 while (1) {
3561 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3562 path, 0, 0);
3563 if (ret < 0) {
3564 err = ret;
3565 goto out;
3566 }
3567 if (ret > 0) {
3568 if (path->slots[0] == 0)
3569 break;
3570 path->slots[0]--;
3571 }
3572 leaf = path->nodes[0];
3573 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3574 btrfs_release_path(root->fs_info->tree_root, path);
3575
3576 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3577 key.type != BTRFS_ROOT_ITEM_KEY)
3578 break;
3579
3580 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3581 if (IS_ERR(reloc_root)) {
3582 err = PTR_ERR(reloc_root);
3583 goto out;
3584 }
3585
3586 list_add(&reloc_root->root_list, &reloc_roots);
3587
3588 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3589 fs_root = read_fs_root(root->fs_info,
3590 reloc_root->root_key.offset);
3591 if (IS_ERR(fs_root)) {
3592 err = PTR_ERR(fs_root);
3593 goto out;
3594 }
3595 }
3596
3597 if (key.offset == 0)
3598 break;
3599
3600 key.offset--;
3601 }
3602 btrfs_release_path(root->fs_info->tree_root, path);
3603
3604 if (list_empty(&reloc_roots))
3605 goto out;
3606
3607 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3608 if (!rc) {
3609 err = -ENOMEM;
3610 goto out;
3611 }
3612
3613 mapping_tree_init(&rc->reloc_root_tree);
3614 INIT_LIST_HEAD(&rc->reloc_roots);
3615 btrfs_init_workers(&rc->workers, "relocate",
3616 root->fs_info->thread_pool_size);
3617 rc->extent_root = root->fs_info->extent_root;
3618
3619 set_reloc_control(rc);
3620
3621 while (!list_empty(&reloc_roots)) {
3622 reloc_root = list_entry(reloc_roots.next,
3623 struct btrfs_root, root_list);
3624 list_del(&reloc_root->root_list);
3625
3626 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3627 list_add_tail(&reloc_root->root_list,
3628 &rc->reloc_roots);
3629 continue;
3630 }
3631
3632 fs_root = read_fs_root(root->fs_info,
3633 reloc_root->root_key.offset);
3634 BUG_ON(IS_ERR(fs_root));
3635
3636 __add_reloc_root(reloc_root);
3637 fs_root->reloc_root = reloc_root;
3638 }
3639
3640 trans = btrfs_start_transaction(rc->extent_root, 1);
3641 btrfs_commit_transaction(trans, rc->extent_root);
3642
3643 merge_reloc_roots(rc);
3644
3645 unset_reloc_control(rc);
3646
3647 trans = btrfs_start_transaction(rc->extent_root, 1);
3648 btrfs_commit_transaction(trans, rc->extent_root);
3649out:
3650 if (rc) {
3651 btrfs_stop_workers(&rc->workers);
3652 kfree(rc);
3653 }
3654 while (!list_empty(&reloc_roots)) {
3655 reloc_root = list_entry(reloc_roots.next,
3656 struct btrfs_root, root_list);
3657 list_del(&reloc_root->root_list);
3658 free_extent_buffer(reloc_root->node);
3659 free_extent_buffer(reloc_root->commit_root);
3660 kfree(reloc_root);
3661 }
3662 btrfs_free_path(path);
3663
3664 if (err == 0) {
3665 /* cleanup orphan inode in data relocation tree */
3666 fs_root = read_fs_root(root->fs_info,
3667 BTRFS_DATA_RELOC_TREE_OBJECTID);
3668 if (IS_ERR(fs_root))
3669 err = PTR_ERR(fs_root);
3670 }
3671 return err;
3672}
3673
3674/*
3675 * helper to add ordered checksums for data relocation.
3676 *
3677 * cloning the checksums properly handles nodatasum extents.
3678 * it also saves the CPU time of re-calculating the checksums.
3679 */
3680int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3681{
3682 struct btrfs_ordered_sum *sums;
3683 struct btrfs_sector_sum *sector_sum;
3684 struct btrfs_ordered_extent *ordered;
3685 struct btrfs_root *root = BTRFS_I(inode)->root;
3686 size_t offset;
3687 int ret;
3688 u64 disk_bytenr;
3689 LIST_HEAD(list);
3690
3691 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3692 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3693
3694 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3695 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3696 disk_bytenr + len - 1, &list);
3697
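	/*
	 * shift each checksum's bytenr from the old data location to
	 * the new location inside the ordered extent
	 */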
3698 while (!list_empty(&list)) {
3699 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3700 list_del_init(&sums->list);
3701
3702 sector_sum = sums->sums;
3703 sums->bytenr = ordered->start;
3704
3705 offset = 0;
3706 while (offset < sums->len) {
3707 sector_sum->bytenr += ordered->start - disk_bytenr;
3708 sector_sum++;
3709 offset += root->sectorsize;
3710 }
3711
3712 btrfs_add_ordered_sum(inode, ordered, sums);
3713 }
3714 btrfs_put_ordered_extent(ordered);
3715 return 0;
3716}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b48650de4472..0ddc6d61c55a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -111,6 +111,15 @@ out:
111 return ret; 111 return ret;
112} 112}
113 113
114int btrfs_set_root_node(struct btrfs_root_item *item,
115 struct extent_buffer *node)
116{
117 btrfs_set_root_bytenr(item, node->start);
118 btrfs_set_root_level(item, btrfs_header_level(node));
119 btrfs_set_root_generation(item, btrfs_header_generation(node));
120 return 0;
121}
122
114/* 123/*
115 * copy the data in 'item' into the btree 124 * copy the data in 'item' into the btree
116 */ 125 */
@@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
164 * offset lower than the latest root. They need to be queued for deletion to 173 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed. 174 * finish what was happening when we crashed.
166 */ 175 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 176int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
168 struct btrfs_root *latest)
169{ 177{
170 struct btrfs_root *dead_root; 178 struct btrfs_root *dead_root;
171 struct btrfs_item *item; 179 struct btrfs_item *item;
@@ -227,10 +235,7 @@ again:
227 goto err; 235 goto err;
228 } 236 }
229 237
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID) 238 ret = btrfs_add_dead_root(dead_root);
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret) 239 if (ret)
235 goto err; 240 goto err;
236 goto again; 241 goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2ff7cd2db25f..6d6d06cb6dfc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mount.h> 30#include <linux/mount.h>
32#include <linux/mpage.h> 31#include <linux/mpage.h>
@@ -52,7 +51,6 @@
52#include "export.h" 51#include "export.h"
53#include "compression.h" 52#include "compression.h"
54 53
55
56static struct super_operations btrfs_super_ops; 54static struct super_operations btrfs_super_ops;
57 55
58static void btrfs_put_super(struct super_block *sb) 56static void btrfs_put_super(struct super_block *sb)
@@ -67,8 +65,8 @@ static void btrfs_put_super(struct super_block *sb)
67enum { 65enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, 68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
71 Opt_ratio, Opt_flushoncommit, Opt_err, 69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
72}; 70};
73 71
74static match_table_t tokens = { 72static match_table_t tokens = {
@@ -84,6 +82,8 @@ static match_table_t tokens = {
84 {Opt_thread_pool, "thread_pool=%d"}, 82 {Opt_thread_pool, "thread_pool=%d"},
85 {Opt_compress, "compress"}, 83 {Opt_compress, "compress"},
86 {Opt_ssd, "ssd"}, 84 {Opt_ssd, "ssd"},
85 {Opt_ssd_spread, "ssd_spread"},
86 {Opt_nossd, "nossd"},
87 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"}, 88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"}, 89 {Opt_flushoncommit, "flushoncommit"},
@@ -158,7 +158,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
158 */ 158 */
159 break; 159 break;
160 case Opt_nodatasum: 160 case Opt_nodatasum:
161 printk(KERN_INFO "btrfs: setting nodatacsum\n"); 161 printk(KERN_INFO "btrfs: setting nodatasum\n");
162 btrfs_set_opt(info->mount_opt, NODATASUM); 162 btrfs_set_opt(info->mount_opt, NODATASUM);
163 break; 163 break;
164 case Opt_nodatacow: 164 case Opt_nodatacow:
@@ -174,6 +174,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
174 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 174 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
175 btrfs_set_opt(info->mount_opt, SSD); 175 btrfs_set_opt(info->mount_opt, SSD);
176 break; 176 break;
177 case Opt_ssd_spread:
178 printk(KERN_INFO "btrfs: use spread ssd "
179 "allocation scheme\n");
180 btrfs_set_opt(info->mount_opt, SSD);
181 btrfs_set_opt(info->mount_opt, SSD_SPREAD);
182 break;
183 case Opt_nossd:
184 printk(KERN_INFO "btrfs: not using ssd allocation "
185 "scheme\n");
186 btrfs_set_opt(info->mount_opt, NOSSD);
187 btrfs_clear_opt(info->mount_opt, SSD);
188 btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
189 break;
177 case Opt_nobarrier: 190 case Opt_nobarrier:
178 printk(KERN_INFO "btrfs: turning off barriers\n"); 191 printk(KERN_INFO "btrfs: turning off barriers\n");
179 btrfs_set_opt(info->mount_opt, NOBARRIER); 192 btrfs_set_opt(info->mount_opt, NOBARRIER);
@@ -322,7 +335,7 @@ static int btrfs_fill_super(struct super_block *sb,
322 struct dentry *root_dentry; 335 struct dentry *root_dentry;
323 struct btrfs_super_block *disk_super; 336 struct btrfs_super_block *disk_super;
324 struct btrfs_root *tree_root; 337 struct btrfs_root *tree_root;
325 struct btrfs_inode *bi; 338 struct btrfs_key key;
326 int err; 339 int err;
327 340
328 sb->s_maxbytes = MAX_LFS_FILESIZE; 341 sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -341,23 +354,15 @@ static int btrfs_fill_super(struct super_block *sb,
341 } 354 }
342 sb->s_fs_info = tree_root; 355 sb->s_fs_info = tree_root;
343 disk_super = &tree_root->fs_info->super_copy; 356 disk_super = &tree_root->fs_info->super_copy;
344 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
345 tree_root->fs_info->fs_root);
346 bi = BTRFS_I(inode);
347 bi->location.objectid = inode->i_ino;
348 bi->location.offset = 0;
349 bi->root = tree_root->fs_info->fs_root;
350
351 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
352 357
353 if (!inode) { 358 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
354 err = -ENOMEM; 359 key.type = BTRFS_INODE_ITEM_KEY;
360 key.offset = 0;
361 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
362 if (IS_ERR(inode)) {
363 err = PTR_ERR(inode);
355 goto fail_close; 364 goto fail_close;
356 } 365 }
357 if (inode->i_state & I_NEW) {
358 btrfs_read_locked_inode(inode);
359 unlock_new_inode(inode);
360 }
361 366
362 root_dentry = d_alloc_root(inode); 367 root_dentry = d_alloc_root(inode);
363 if (!root_dentry) { 368 if (!root_dentry) {
@@ -388,10 +393,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
388 struct btrfs_root *root = btrfs_sb(sb); 393 struct btrfs_root *root = btrfs_sb(sb);
389 int ret; 394 int ret;
390 395
391 if (sb->s_flags & MS_RDONLY)
392 return 0;
393
394 sb->s_dirt = 0;
395 if (!wait) { 396 if (!wait) {
396 filemap_flush(root->fs_info->btree_inode->i_mapping); 397 filemap_flush(root->fs_info->btree_inode->i_mapping);
397 return 0; 398 return 0;
@@ -402,7 +403,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
402 403
403 trans = btrfs_start_transaction(root, 1); 404 trans = btrfs_start_transaction(root, 1);
404 ret = btrfs_commit_transaction(trans, root); 405 ret = btrfs_commit_transaction(trans, root);
405 sb->s_dirt = 0;
406 return ret; 406 return ret;
407} 407}
408 408
@@ -433,7 +433,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
434 if (btrfs_test_opt(root, COMPRESS)) 434 if (btrfs_test_opt(root, COMPRESS))
435 seq_puts(seq, ",compress"); 435 seq_puts(seq, ",compress");
436 if (btrfs_test_opt(root, SSD)) 436 if (btrfs_test_opt(root, NOSSD))
437 seq_puts(seq, ",nossd");
438 if (btrfs_test_opt(root, SSD_SPREAD))
439 seq_puts(seq, ",ssd_spread");
440 else if (btrfs_test_opt(root, SSD))
437 seq_puts(seq, ",ssd"); 441 seq_puts(seq, ",ssd");
438 if (btrfs_test_opt(root, NOTREELOG)) 442 if (btrfs_test_opt(root, NOTREELOG))
439 seq_puts(seq, ",notreelog"); 443 seq_puts(seq, ",notreelog");
@@ -444,11 +448,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
444 return 0; 448 return 0;
445} 449}
446 450
447static void btrfs_write_super(struct super_block *sb)
448{
449 sb->s_dirt = 0;
450}
451
452static int btrfs_test_super(struct super_block *s, void *data) 451static int btrfs_test_super(struct super_block *s, void *data)
453{ 452{
454 struct btrfs_fs_devices *test_fs_devices = data; 453 struct btrfs_fs_devices *test_fs_devices = data;
@@ -584,7 +583,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
584 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 583 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
585 return -EINVAL; 584 return -EINVAL;
586 585
587 ret = btrfs_cleanup_reloc_trees(root); 586 /* recover relocation */
587 ret = btrfs_recover_relocation(root);
588 WARN_ON(ret); 588 WARN_ON(ret);
589 589
590 ret = btrfs_cleanup_fs_roots(root->fs_info); 590 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -678,7 +678,6 @@ static int btrfs_unfreeze(struct super_block *sb)
678static struct super_operations btrfs_super_ops = { 678static struct super_operations btrfs_super_ops = {
679 .delete_inode = btrfs_delete_inode, 679 .delete_inode = btrfs_delete_inode,
680 .put_super = btrfs_put_super, 680 .put_super = btrfs_put_super,
681 .write_super = btrfs_write_super,
682 .sync_fs = btrfs_sync_fs, 681 .sync_fs = btrfs_sync_fs,
683 .show_options = btrfs_show_options, 682 .show_options = btrfs_show_options,
684 .write_inode = btrfs_write_inode, 683 .write_inode = btrfs_write_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01b143605ec1..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,7 +25,6 @@
25#include "disk-io.h" 25#include "disk-io.h"
26#include "transaction.h" 26#include "transaction.h"
27#include "locking.h" 27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h" 28#include "tree-log.h"
30 29
31#define BTRFS_ROOT_TRANS_TAG 0 30#define BTRFS_ROOT_TRANS_TAG 0
@@ -41,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
41 } 40 }
42} 41}
43 42
43static noinline void switch_commit_root(struct btrfs_root *root)
44{
45 free_extent_buffer(root->commit_root);
46 root->commit_root = btrfs_root_node(root);
47}
48
44/* 49/*
45 * either allocate a new transaction or hop into the existing one 50 * either allocate a new transaction or hop into the existing one
46 */ 51 */
@@ -94,45 +99,37 @@ static noinline int join_transaction(struct btrfs_root *root)
94 * to make sure the old root from before we joined the transaction is deleted 99 * to make sure the old root from before we joined the transaction is deleted
95 * when the transaction commits 100 * when the transaction commits
96 */ 101 */
97noinline int btrfs_record_root_in_trans(struct btrfs_root *root) 102static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root)
98{ 104{
99 struct btrfs_dirty_root *dirty; 105 if (root->ref_cows && root->last_trans < trans->transid) {
100 u64 running_trans_id = root->fs_info->running_transaction->transid;
101 if (root->ref_cows && root->last_trans < running_trans_id) {
102 WARN_ON(root == root->fs_info->extent_root); 106 WARN_ON(root == root->fs_info->extent_root);
103 if (root->root_item.refs != 0) { 107 WARN_ON(root->root_item.refs == 0);
104 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 108 WARN_ON(root->commit_root != root->node);
105 (unsigned long)root->root_key.objectid, 109
106 BTRFS_ROOT_TRANS_TAG); 110 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
107 111 (unsigned long)root->root_key.objectid,
108 dirty = kmalloc(sizeof(*dirty), GFP_NOFS); 112 BTRFS_ROOT_TRANS_TAG);
109 BUG_ON(!dirty); 113 root->last_trans = trans->transid;
110 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); 114 btrfs_init_reloc_root(trans, root);
111 BUG_ON(!dirty->root); 115 }
112 dirty->latest_root = root; 116 return 0;
113 INIT_LIST_HEAD(&dirty->list); 117}
114 118
115 root->commit_root = btrfs_root_node(root); 119int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
116 120 struct btrfs_root *root)
117 memcpy(dirty->root, root, sizeof(*root)); 121{
118 spin_lock_init(&dirty->root->node_lock); 122 if (!root->ref_cows)
119 spin_lock_init(&dirty->root->list_lock); 123 return 0;
120 mutex_init(&dirty->root->objectid_mutex); 124
121 mutex_init(&dirty->root->log_mutex); 125 mutex_lock(&root->fs_info->trans_mutex);
122 INIT_LIST_HEAD(&dirty->root->dead_list); 126 if (root->last_trans == trans->transid) {
123 dirty->root->node = root->commit_root; 127 mutex_unlock(&root->fs_info->trans_mutex);
124 dirty->root->commit_root = NULL; 128 return 0;
125
126 spin_lock(&root->list_lock);
127 list_add(&dirty->root->dead_list, &root->dead_list);
128 spin_unlock(&root->list_lock);
129
130 root->dirty_root = dirty;
131 } else {
132 WARN_ON(1);
133 }
134 root->last_trans = running_trans_id;
135 } 129 }
130
131 record_root_in_trans(trans, root);
132 mutex_unlock(&root->fs_info->trans_mutex);
136 return 0; 133 return 0;
137} 134}
138 135
@@ -181,7 +178,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 ret = join_transaction(root); 178 ret = join_transaction(root);
182 BUG_ON(ret); 179 BUG_ON(ret);
183 180
184 btrfs_record_root_in_trans(root);
185 h->transid = root->fs_info->running_transaction->transid; 181 h->transid = root->fs_info->running_transaction->transid;
186 h->transaction = root->fs_info->running_transaction; 182 h->transaction = root->fs_info->running_transaction;
187 h->blocks_reserved = num_blocks; 183 h->blocks_reserved = num_blocks;
@@ -192,6 +188,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
192 h->delayed_ref_updates = 0; 188 h->delayed_ref_updates = 0;
193 189
194 root->fs_info->running_transaction->use_count++; 190 root->fs_info->running_transaction->use_count++;
191 record_root_in_trans(h, root);
195 mutex_unlock(&root->fs_info->trans_mutex); 192 mutex_unlock(&root->fs_info->trans_mutex);
196 return h; 193 return h;
197} 194}
@@ -233,6 +230,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
233 return 0; 230 return 0;
234} 231}
235 232
233#if 0
236/* 234/*
237 * rate limit against the drop_snapshot code. This helps to slow down new 235 * rate limit against the drop_snapshot code. This helps to slow down new
238 * operations if the drop_snapshot code isn't able to keep up. 236 * operations if the drop_snapshot code isn't able to keep up.
@@ -273,6 +271,7 @@ harder:
273 goto harder; 271 goto harder;
274 } 272 }
275} 273}
274#endif
276 275
277void btrfs_throttle(struct btrfs_root *root) 276void btrfs_throttle(struct btrfs_root *root)
278{ 277{
@@ -280,7 +279,6 @@ void btrfs_throttle(struct btrfs_root *root)
280 if (!root->fs_info->open_ioctl_trans) 279 if (!root->fs_info->open_ioctl_trans)
281 wait_current_trans(root); 280 wait_current_trans(root);
282 mutex_unlock(&root->fs_info->trans_mutex); 281 mutex_unlock(&root->fs_info->trans_mutex);
283 throttle_on_drops(root);
284} 282}
285 283
286static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 284static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -323,9 +321,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
323 memset(trans, 0, sizeof(*trans)); 321 memset(trans, 0, sizeof(*trans));
324 kmem_cache_free(btrfs_trans_handle_cachep, trans); 322 kmem_cache_free(btrfs_trans_handle_cachep, trans);
325 323
326 if (throttle)
327 throttle_on_drops(root);
328
329 return 0; 324 return 0;
330} 325}
331 326
@@ -455,36 +450,32 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
455 450
456 btrfs_write_dirty_block_groups(trans, root); 451 btrfs_write_dirty_block_groups(trans, root);
457 452
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
460
461 while (1) { 453 while (1) {
462 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 454 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
463 if (old_root_bytenr == root->node->start) 455 if (old_root_bytenr == root->node->start)
464 break; 456 break;
465 btrfs_set_root_bytenr(&root->root_item,
466 root->node->start);
467 btrfs_set_root_level(&root->root_item,
468 btrfs_header_level(root->node));
469 btrfs_set_root_generation(&root->root_item, trans->transid);
470 457
458 btrfs_set_root_node(&root->root_item, root->node);
471 ret = btrfs_update_root(trans, tree_root, 459 ret = btrfs_update_root(trans, tree_root,
472 &root->root_key, 460 &root->root_key,
473 &root->root_item); 461 &root->root_item);
474 BUG_ON(ret); 462 BUG_ON(ret);
475 btrfs_write_dirty_block_groups(trans, root);
476 463
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 464 ret = btrfs_write_dirty_block_groups(trans, root);
478 BUG_ON(ret); 465 BUG_ON(ret);
479 } 466 }
467
468 if (root != root->fs_info->extent_root)
469 switch_commit_root(root);
470
480 return 0; 471 return 0;
481} 472}
482 473
483/* 474/*
484 * update all the cowonly tree roots on disk 475 * update all the cowonly tree roots on disk
485 */ 476 */
486int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 477static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
487 struct btrfs_root *root) 478 struct btrfs_root *root)
488{ 479{
489 struct btrfs_fs_info *fs_info = root->fs_info; 480 struct btrfs_fs_info *fs_info = root->fs_info;
490 struct list_head *next; 481 struct list_head *next;
@@ -508,10 +499,12 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
508 root = list_entry(next, struct btrfs_root, dirty_list); 499 root = list_entry(next, struct btrfs_root, dirty_list);
509 500
510 update_cowonly_root(trans, root); 501 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
514 } 502 }
503
504 down_write(&fs_info->extent_commit_sem);
505 switch_commit_root(fs_info->extent_root);
506 up_write(&fs_info->extent_commit_sem);
507
515 return 0; 508 return 0;
516} 509}
517 510
@@ -520,118 +513,53 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
520 * a dirty root struct and adds it into the list of dead roots that need to 513 * a dirty root struct and adds it into the list of dead roots that need to
521 * be deleted 514 * be deleted
522 */ 515 */
523int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) 516int btrfs_add_dead_root(struct btrfs_root *root)
524{ 517{
525 struct btrfs_dirty_root *dirty;
526
527 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
528 if (!dirty)
529 return -ENOMEM;
530 dirty->root = root;
531 dirty->latest_root = latest;
532
533 mutex_lock(&root->fs_info->trans_mutex); 518 mutex_lock(&root->fs_info->trans_mutex);
534 list_add(&dirty->list, &latest->fs_info->dead_roots); 519 list_add(&root->root_list, &root->fs_info->dead_roots);
535 mutex_unlock(&root->fs_info->trans_mutex); 520 mutex_unlock(&root->fs_info->trans_mutex);
536 return 0; 521 return 0;
537} 522}
538 523
539/* 524/*
540 * at transaction commit time we need to schedule the old roots for 525 * update all the dirty fs tree roots on disk
541 * deletion via btrfs_drop_snapshot. This runs through all the
542 * reference counted roots that were modified in the current
543 * transaction and puts them into the drop list
544 */ 526 */
545static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, 527static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
546 struct radix_tree_root *radix, 528 struct btrfs_root *root)
547 struct list_head *list)
548{ 529{
549 struct btrfs_dirty_root *dirty;
550 struct btrfs_root *gang[8]; 530 struct btrfs_root *gang[8];
551 struct btrfs_root *root; 531 struct btrfs_fs_info *fs_info = root->fs_info;
552 int i; 532 int i;
553 int ret; 533 int ret;
554 int err = 0; 534 int err = 0;
555 u32 refs;
556 535
557 while (1) { 536 while (1) {
558 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, 537 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
538 (void **)gang, 0,
559 ARRAY_SIZE(gang), 539 ARRAY_SIZE(gang),
560 BTRFS_ROOT_TRANS_TAG); 540 BTRFS_ROOT_TRANS_TAG);
561 if (ret == 0) 541 if (ret == 0)
562 break; 542 break;
563 for (i = 0; i < ret; i++) { 543 for (i = 0; i < ret; i++) {
564 root = gang[i]; 544 root = gang[i];
565 radix_tree_tag_clear(radix, 545 radix_tree_tag_clear(&fs_info->fs_roots_radix,
566 (unsigned long)root->root_key.objectid, 546 (unsigned long)root->root_key.objectid,
567 BTRFS_ROOT_TRANS_TAG); 547 BTRFS_ROOT_TRANS_TAG);
568
569 BUG_ON(!root->ref_tree);
570 dirty = root->dirty_root;
571 548
572 btrfs_free_log(trans, root); 549 btrfs_free_log(trans, root);
573 btrfs_free_reloc_root(trans, root); 550 btrfs_update_reloc_root(trans, root);
574
575 if (root->commit_root == root->node) {
576 WARN_ON(root->node->start !=
577 btrfs_root_bytenr(&root->root_item));
578
579 free_extent_buffer(root->commit_root);
580 root->commit_root = NULL;
581 root->dirty_root = NULL;
582
583 spin_lock(&root->list_lock);
584 list_del_init(&dirty->root->dead_list);
585 spin_unlock(&root->list_lock);
586
587 kfree(dirty->root);
588 kfree(dirty);
589 551
590 /* make sure to update the root on disk 552 if (root->commit_root != root->node) {
591 * so we get any updates to the block used 553 switch_commit_root(root);
592 * counts 554 btrfs_set_root_node(&root->root_item,
593 */ 555 root->node);
594 err = btrfs_update_root(trans,
595 root->fs_info->tree_root,
596 &root->root_key,
597 &root->root_item);
598 continue;
599 } 556 }
600 557
601 memset(&root->root_item.drop_progress, 0, 558 err = btrfs_update_root(trans, fs_info->tree_root,
602 sizeof(struct btrfs_disk_key));
603 root->root_item.drop_level = 0;
604 root->commit_root = NULL;
605 root->dirty_root = NULL;
606 root->root_key.offset = root->fs_info->generation;
607 btrfs_set_root_bytenr(&root->root_item,
608 root->node->start);
609 btrfs_set_root_level(&root->root_item,
610 btrfs_header_level(root->node));
611 btrfs_set_root_generation(&root->root_item,
612 root->root_key.offset);
613
614 err = btrfs_insert_root(trans, root->fs_info->tree_root,
615 &root->root_key, 559 &root->root_key,
616 &root->root_item); 560 &root->root_item);
617 if (err) 561 if (err)
618 break; 562 break;
619
620 refs = btrfs_root_refs(&dirty->root->root_item);
621 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
622 err = btrfs_update_root(trans, root->fs_info->tree_root,
623 &dirty->root->root_key,
624 &dirty->root->root_item);
625
626 BUG_ON(err);
627 if (refs == 1) {
628 list_add(&dirty->list, list);
629 } else {
630 WARN_ON(1);
631 free_extent_buffer(dirty->root->node);
632 kfree(dirty->root);
633 kfree(dirty);
634 }
635 } 563 }
636 } 564 }
637 return err; 565 return err;
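
Editor's note: the gang-lookup loop in commit_fs_roots() above is the stock idiom for draining tagged entries out of a radix tree in batches: look up a gang, clear each entry's tag so it is not returned again, process it, and repeat until the lookup comes back empty. A minimal self-contained sketch of the same idiom (the item type, tag value, and per-item work are illustrative, not btrfs code):

	#include <linux/radix-tree.h>
	#include <linux/kernel.h>

	struct tagged_item {
		unsigned long index;	/* where the item lives in the tree */
		/* ... payload ... */
	};

	static void drain_tagged(struct radix_tree_root *tree, unsigned int tag)
	{
		struct tagged_item *gang[8];
		int i, nr;

		while ((nr = radix_tree_gang_lookup_tag(tree, (void **)gang, 0,
							ARRAY_SIZE(gang), tag)) > 0) {
			for (i = 0; i < nr; i++) {
				/* clear the tag first so the next gang lookup
				 * does not hand the same item back */
				radix_tree_tag_clear(tree, gang[i]->index, tag);
				/* ... process gang[i] ... */
			}
		}
	}
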
@@ -670,6 +598,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
670 return 0; 598 return 0;
671} 599}
672 600
601#if 0
673/* 602/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes 603 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current 604 * sense not to join the transaction while it is trying to flush the current
@@ -688,12 +617,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
688 TASK_UNINTERRUPTIBLE); 617 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex); 618 mutex_unlock(&info->trans_mutex);
690 619
691 atomic_dec(&info->throttles);
692 wake_up(&info->transaction_throttle);
693
694 schedule(); 620 schedule();
695 621
696 atomic_inc(&info->throttles);
697 mutex_lock(&info->trans_mutex); 622 mutex_lock(&info->trans_mutex);
698 finish_wait(&info->transaction_wait, &wait); 623 finish_wait(&info->transaction_wait, &wait);
699 } 624 }
@@ -705,113 +630,64 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
705 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 630 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
706 * all of them 631 * all of them
707 */ 632 */
708static noinline int drop_dirty_roots(struct btrfs_root *tree_root, 633int btrfs_drop_dead_root(struct btrfs_root *root)
709 struct list_head *list)
710{ 634{
711 struct btrfs_dirty_root *dirty;
712 struct btrfs_trans_handle *trans; 635 struct btrfs_trans_handle *trans;
636 struct btrfs_root *tree_root = root->fs_info->tree_root;
713 unsigned long nr; 637 unsigned long nr;
714 u64 num_bytes; 638 int ret;
715 u64 bytes_used;
716 u64 max_useless;
717 int ret = 0;
718 int err;
719
720 while (!list_empty(list)) {
721 struct btrfs_root *root;
722
723 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
724 list_del_init(&dirty->list);
725
726 num_bytes = btrfs_root_used(&dirty->root->root_item);
727 root = dirty->latest_root;
728 atomic_inc(&root->fs_info->throttles);
729
730 while (1) {
731 /*
732 * we don't want to jump in and create a bunch of
733 * delayed refs if the transaction is starting to close
734 */
735 wait_transaction_pre_flush(tree_root->fs_info);
736 trans = btrfs_start_transaction(tree_root, 1);
737
738 /*
739 * we've joined a transaction, make sure it isn't
740 * closing right now
741 */
742 if (trans->transaction->delayed_refs.flushing) {
743 btrfs_end_transaction(trans, tree_root);
744 continue;
745 }
746
747 mutex_lock(&root->fs_info->drop_mutex);
748 ret = btrfs_drop_snapshot(trans, dirty->root);
749 if (ret != -EAGAIN)
750 break;
751 mutex_unlock(&root->fs_info->drop_mutex);
752 639
753 err = btrfs_update_root(trans, 640 while (1) {
754 tree_root, 641 /*
755 &dirty->root->root_key, 642 * we don't want to jump in and create a bunch of
756 &dirty->root->root_item); 643 * delayed refs if the transaction is starting to close
757 if (err) 644 */
758 ret = err; 645 wait_transaction_pre_flush(tree_root->fs_info);
759 nr = trans->blocks_used; 646 trans = btrfs_start_transaction(tree_root, 1);
760 ret = btrfs_end_transaction(trans, tree_root);
761 BUG_ON(ret);
762 647
763 btrfs_btree_balance_dirty(tree_root, nr); 648 /*
764 cond_resched(); 649 * we've joined a transaction, make sure it isn't
650 * closing right now
651 */
652 if (trans->transaction->delayed_refs.flushing) {
653 btrfs_end_transaction(trans, tree_root);
654 continue;
765 } 655 }
766 BUG_ON(ret);
767 atomic_dec(&root->fs_info->throttles);
768 wake_up(&root->fs_info->transaction_throttle);
769 656
770 num_bytes -= btrfs_root_used(&dirty->root->root_item); 657 ret = btrfs_drop_snapshot(trans, root);
771 bytes_used = btrfs_root_used(&root->root_item); 658 if (ret != -EAGAIN)
772 if (num_bytes) { 659 break;
773 mutex_lock(&root->fs_info->trans_mutex);
774 btrfs_record_root_in_trans(root);
775 mutex_unlock(&root->fs_info->trans_mutex);
776 btrfs_set_root_used(&root->root_item,
777 bytes_used - num_bytes);
778 }
779 660
780 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); 661 ret = btrfs_update_root(trans, tree_root,
781 if (ret) { 662 &root->root_key,
782 BUG(); 663 &root->root_item);
664 if (ret)
783 break; 665 break;
784 }
785 mutex_unlock(&root->fs_info->drop_mutex);
786
787 spin_lock(&root->list_lock);
788 list_del_init(&dirty->root->dead_list);
789 if (!list_empty(&root->dead_list)) {
790 struct btrfs_root *oldest;
791 oldest = list_entry(root->dead_list.prev,
792 struct btrfs_root, dead_list);
793 max_useless = oldest->root_key.offset - 1;
794 } else {
795 max_useless = root->root_key.offset - 1;
796 }
797 spin_unlock(&root->list_lock);
798 666
799 nr = trans->blocks_used; 667 nr = trans->blocks_used;
800 ret = btrfs_end_transaction(trans, tree_root); 668 ret = btrfs_end_transaction(trans, tree_root);
801 BUG_ON(ret); 669 BUG_ON(ret);
802 670
803 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
804 BUG_ON(ret);
805
806 free_extent_buffer(dirty->root->node);
807 kfree(dirty->root);
808 kfree(dirty);
809
810 btrfs_btree_balance_dirty(tree_root, nr); 671 btrfs_btree_balance_dirty(tree_root, nr);
811 cond_resched(); 672 cond_resched();
812 } 673 }
674 BUG_ON(ret);
675
676 ret = btrfs_del_root(trans, tree_root, &root->root_key);
677 BUG_ON(ret);
678
679 nr = trans->blocks_used;
680 ret = btrfs_end_transaction(trans, tree_root);
681 BUG_ON(ret);
682
683 free_extent_buffer(root->node);
684 free_extent_buffer(root->commit_root);
685 kfree(root);
686
687 btrfs_btree_balance_dirty(tree_root, nr);
813 return ret; 688 return ret;
814} 689}
690#endif
815 691
816/* 692/*
817 * new snapshots need to be created at a very specific time in the 693 * new snapshots need to be created at a very specific time in the
@@ -839,24 +715,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
839 if (ret) 715 if (ret)
840 goto fail; 716 goto fail;
841 717
842 btrfs_record_root_in_trans(root); 718 record_root_in_trans(trans, root);
843 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 719 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
844 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 720 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
845 721
846 key.objectid = objectid; 722 key.objectid = objectid;
847 key.offset = trans->transid; 723 key.offset = 0;
848 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 724 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
849 725
850 old = btrfs_lock_root_node(root); 726 old = btrfs_lock_root_node(root);
851 btrfs_cow_block(trans, root, old, NULL, 0, &old); 727 btrfs_cow_block(trans, root, old, NULL, 0, &old);
728 btrfs_set_lock_blocking(old);
852 729
853 btrfs_copy_root(trans, root, old, &tmp, objectid); 730 btrfs_copy_root(trans, root, old, &tmp, objectid);
854 btrfs_tree_unlock(old); 731 btrfs_tree_unlock(old);
855 free_extent_buffer(old); 732 free_extent_buffer(old);
856 733
857 btrfs_set_root_bytenr(new_root_item, tmp->start); 734 btrfs_set_root_node(new_root_item, tmp);
858 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
859 btrfs_set_root_generation(new_root_item, trans->transid);
860 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 735 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
861 new_root_item); 736 new_root_item);
862 btrfs_tree_unlock(tmp); 737 btrfs_tree_unlock(tmp);
@@ -964,6 +839,34 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
964 return 0; 839 return 0;
965} 840}
966 841
842static void update_super_roots(struct btrfs_root *root)
843{
844 struct btrfs_root_item *root_item;
845 struct btrfs_super_block *super;
846
847 super = &root->fs_info->super_copy;
848
849 root_item = &root->fs_info->chunk_root->root_item;
850 super->chunk_root = root_item->bytenr;
851 super->chunk_root_generation = root_item->generation;
852 super->chunk_root_level = root_item->level;
853
854 root_item = &root->fs_info->tree_root->root_item;
855 super->root = root_item->bytenr;
856 super->generation = root_item->generation;
857 super->root_level = root_item->level;
858}
859
860int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
861{
862 int ret = 0;
863 spin_lock(&info->new_trans_lock);
864 if (info->running_transaction)
865 ret = info->running_transaction->in_commit;
866 spin_unlock(&info->new_trans_lock);
867 return ret;
868}
869
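
Editor's note: btrfs_transaction_in_commit() above gives callers a spinlock-protected peek at the running transaction's in_commit flag without joining the transaction. A hedged sketch of how a writeback-style caller might use it to back off while a commit is in flight (the caller and its backoff policy are hypothetical, not part of this patch):

	/* hypothetical caller: yield instead of piling work onto a commit */
	static void wait_out_commit(struct btrfs_fs_info *info)
	{
		while (btrfs_transaction_in_commit(info))
			cond_resched();
	}
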
967int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 870int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
968 struct btrfs_root *root) 871 struct btrfs_root *root)
969{ 872{
@@ -971,8 +874,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
971 unsigned long timeout = 1; 874 unsigned long timeout = 1;
972 struct btrfs_transaction *cur_trans; 875 struct btrfs_transaction *cur_trans;
973 struct btrfs_transaction *prev_trans = NULL; 876 struct btrfs_transaction *prev_trans = NULL;
974 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
975 struct list_head dirty_fs_roots;
976 struct extent_io_tree *pinned_copy; 877 struct extent_io_tree *pinned_copy;
977 DEFINE_WAIT(wait); 878 DEFINE_WAIT(wait);
978 int ret; 879 int ret;
@@ -999,7 +900,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
999 BUG_ON(ret); 900 BUG_ON(ret);
1000 901
1001 mutex_lock(&root->fs_info->trans_mutex); 902 mutex_lock(&root->fs_info->trans_mutex);
1002 INIT_LIST_HEAD(&dirty_fs_roots);
1003 if (cur_trans->in_commit) { 903 if (cur_trans->in_commit) {
1004 cur_trans->use_count++; 904 cur_trans->use_count++;
1005 mutex_unlock(&root->fs_info->trans_mutex); 905 mutex_unlock(&root->fs_info->trans_mutex);
@@ -1058,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 958
1059 mutex_unlock(&root->fs_info->trans_mutex); 959 mutex_unlock(&root->fs_info->trans_mutex);
1060 960
1061 if (flush_on_commit || snap_pending) { 961 if (flush_on_commit) {
1062 if (flush_on_commit) 962 btrfs_start_delalloc_inodes(root);
1063 btrfs_start_delalloc_inodes(root); 963 ret = btrfs_wait_ordered_extents(root, 0);
964 BUG_ON(ret);
965 } else if (snap_pending) {
1064 ret = btrfs_wait_ordered_extents(root, 1); 966 ret = btrfs_wait_ordered_extents(root, 1);
1065 BUG_ON(ret); 967 BUG_ON(ret);
1066 } 968 }
@@ -1105,41 +1007,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1105 * with the tree-log code. 1007 * with the tree-log code.
1106 */ 1008 */
1107 mutex_lock(&root->fs_info->tree_log_mutex); 1009 mutex_lock(&root->fs_info->tree_log_mutex);
1108 /*
1109 * keep tree reloc code from adding new reloc trees
1110 */
1111 mutex_lock(&root->fs_info->tree_reloc_mutex);
1112 1010
1113 1011 ret = commit_fs_roots(trans, root);
1114 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
1115 &dirty_fs_roots);
1116 BUG_ON(ret); 1012 BUG_ON(ret);
1117 1013
1118 /* add_dirty_roots gets rid of all the tree log roots, it is now 1014 /* commit_fs_roots gets rid of all the tree log roots, it is now
1119 * safe to free the root of tree log roots 1015 * safe to free the root of tree log roots
1120 */ 1016 */
1121 btrfs_free_log_root_tree(trans, root->fs_info); 1017 btrfs_free_log_root_tree(trans, root->fs_info);
1122 1018
1123 ret = btrfs_commit_tree_roots(trans, root); 1019 ret = commit_cowonly_roots(trans, root);
1124 BUG_ON(ret); 1020 BUG_ON(ret);
1125 1021
1126 cur_trans = root->fs_info->running_transaction; 1022 cur_trans = root->fs_info->running_transaction;
1127 spin_lock(&root->fs_info->new_trans_lock); 1023 spin_lock(&root->fs_info->new_trans_lock);
1128 root->fs_info->running_transaction = NULL; 1024 root->fs_info->running_transaction = NULL;
1129 spin_unlock(&root->fs_info->new_trans_lock); 1025 spin_unlock(&root->fs_info->new_trans_lock);
1130 btrfs_set_super_generation(&root->fs_info->super_copy, 1026
1131 cur_trans->transid); 1027 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1132 btrfs_set_super_root(&root->fs_info->super_copy, 1028 root->fs_info->tree_root->node);
1133 root->fs_info->tree_root->node->start); 1029 switch_commit_root(root->fs_info->tree_root);
1134 btrfs_set_super_root_level(&root->fs_info->super_copy, 1030
1135 btrfs_header_level(root->fs_info->tree_root->node)); 1031 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1136 1032 root->fs_info->chunk_root->node);
1137 btrfs_set_super_chunk_root(&root->fs_info->super_copy, 1033 switch_commit_root(root->fs_info->chunk_root);
1138 chunk_root->node->start); 1034
1139 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, 1035 update_super_roots(root);
1140 btrfs_header_level(chunk_root->node));
1141 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1142 btrfs_header_generation(chunk_root->node));
1143 1036
1144 if (!root->fs_info->log_root_recovering) { 1037 if (!root->fs_info->log_root_recovering) {
1145 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1038 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
@@ -1153,7 +1046,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1153 1046
1154 trans->transaction->blocked = 0; 1047 trans->transaction->blocked = 0;
1155 1048
1156 wake_up(&root->fs_info->transaction_throttle);
1157 wake_up(&root->fs_info->transaction_wait); 1049 wake_up(&root->fs_info->transaction_wait);
1158 1050
1159 mutex_unlock(&root->fs_info->trans_mutex); 1051 mutex_unlock(&root->fs_info->trans_mutex);
@@ -1170,9 +1062,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1170 btrfs_finish_extent_commit(trans, root, pinned_copy); 1062 btrfs_finish_extent_commit(trans, root, pinned_copy);
1171 kfree(pinned_copy); 1063 kfree(pinned_copy);
1172 1064
1173 btrfs_drop_dead_reloc_roots(root);
1174 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1175
1176 /* do the directory inserts of any pending snapshot creations */ 1065 /* do the directory inserts of any pending snapshot creations */
1177 finish_pending_snapshots(trans, root->fs_info); 1066 finish_pending_snapshots(trans, root->fs_info);
1178 1067
@@ -1181,21 +1070,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1181 cur_trans->commit_done = 1; 1070 cur_trans->commit_done = 1;
1182 1071
1183 root->fs_info->last_trans_committed = cur_trans->transid; 1072 root->fs_info->last_trans_committed = cur_trans->transid;
1073
1184 wake_up(&cur_trans->commit_wait); 1074 wake_up(&cur_trans->commit_wait);
1185 1075
1186 put_transaction(cur_trans); 1076 put_transaction(cur_trans);
1187 put_transaction(cur_trans); 1077 put_transaction(cur_trans);
1188 1078
1189 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1190 if (root->fs_info->closing)
1191 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1192
1193 mutex_unlock(&root->fs_info->trans_mutex); 1079 mutex_unlock(&root->fs_info->trans_mutex);
1194 1080
1195 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1081 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1196
1197 if (root->fs_info->closing)
1198 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1199 return ret; 1082 return ret;
1200} 1083}
1201 1084
@@ -1204,16 +1087,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1204 */ 1087 */
1205int btrfs_clean_old_snapshots(struct btrfs_root *root) 1088int btrfs_clean_old_snapshots(struct btrfs_root *root)
1206{ 1089{
1207 struct list_head dirty_roots; 1090 LIST_HEAD(list);
1208 INIT_LIST_HEAD(&dirty_roots); 1091 struct btrfs_fs_info *fs_info = root->fs_info;
1209again: 1092
1210 mutex_lock(&root->fs_info->trans_mutex); 1093 mutex_lock(&fs_info->trans_mutex);
1211 list_splice_init(&root->fs_info->dead_roots, &dirty_roots); 1094 list_splice_init(&fs_info->dead_roots, &list);
1212 mutex_unlock(&root->fs_info->trans_mutex); 1095 mutex_unlock(&fs_info->trans_mutex);
1213 1096
1214 if (!list_empty(&dirty_roots)) { 1097 while (!list_empty(&list)) {
1215 drop_dirty_roots(root, &dirty_roots); 1098 root = list_entry(list.next, struct btrfs_root, root_list);
1216 goto again; 1099 list_del_init(&root->root_list);
1100 btrfs_drop_snapshot(root, 0);
1217 } 1101 }
1218 return 0; 1102 return 0;
1219} 1103}
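
Editor's note: the rewritten btrfs_clean_old_snapshots() is an instance of the splice-under-lock, drain-outside-lock idiom: move the whole shared dead_roots list onto a private list while holding trans_mutex, then walk the private copy with no lock held. A generic sketch of that pattern (the entry type and the per-entry work are illustrative):

	#include <linux/list.h>
	#include <linux/mutex.h>

	struct dead_entry {
		struct list_head link;
	};

	static void drain_dead_list(struct list_head *shared, struct mutex *lock)
	{
		LIST_HEAD(local);
		struct dead_entry *e;

		/* take everything off the shared list in one shot */
		mutex_lock(lock);
		list_splice_init(shared, &local);
		mutex_unlock(lock);

		/* process privately, with no lock held */
		while (!list_empty(&local)) {
			e = list_entry(local.next, struct dead_entry, link);
			list_del_init(&e->link);
			/* ... expensive per-entry work ... */
		}
	}
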
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94f5bde2b58d..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,12 +62,6 @@ struct btrfs_pending_snapshot {
62 struct list_head list; 62 struct list_head list;
63}; 63};
64 64
65struct btrfs_dirty_root {
66 struct list_head list;
67 struct btrfs_root *root;
68 struct btrfs_root *latest_root;
69};
70
71static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, 65static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
72 struct inode *inode) 66 struct inode *inode)
73{ 67{
@@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
100int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 94int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root); 95 struct btrfs_root *root);
102 96
103int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); 97int btrfs_add_dead_root(struct btrfs_root *root);
98int btrfs_drop_dead_root(struct btrfs_root *root);
104int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 99int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
105int btrfs_clean_old_snapshots(struct btrfs_root *root); 100int btrfs_clean_old_snapshots(struct btrfs_root *root);
106int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 101int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -108,7 +103,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
108int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 103int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root); 104 struct btrfs_root *root);
110void btrfs_throttle(struct btrfs_root *root); 105void btrfs_throttle(struct btrfs_root *root);
111int btrfs_record_root_in_trans(struct btrfs_root *root); 106int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root);
112int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
113 struct extent_io_tree *dirty_pages); 109 struct extent_io_tree *dirty_pages);
110int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
114#endif 111#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212e8445..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -430,18 +430,16 @@ no_copy:
430static noinline struct inode *read_one_inode(struct btrfs_root *root, 430static noinline struct inode *read_one_inode(struct btrfs_root *root,
431 u64 objectid) 431 u64 objectid)
432{ 432{
433 struct btrfs_key key;
433 struct inode *inode; 434 struct inode *inode;
434 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
435 if (inode->i_state & I_NEW) {
436 BTRFS_I(inode)->root = root;
437 BTRFS_I(inode)->location.objectid = objectid;
438 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
439 BTRFS_I(inode)->location.offset = 0;
440 btrfs_read_locked_inode(inode);
441 unlock_new_inode(inode);
442 435
443 } 436 key.objectid = objectid;
444 if (is_bad_inode(inode)) { 437 key.type = BTRFS_INODE_ITEM_KEY;
438 key.offset = 0;
439 inode = btrfs_iget(root->fs_info->sb, &key, root);
440 if (IS_ERR(inode)) {
441 inode = NULL;
442 } else if (is_bad_inode(inode)) {
445 iput(inode); 443 iput(inode);
446 inode = NULL; 444 inode = NULL;
447 } 445 }
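
Editor's note: the new read_one_inode() delegates the lookup to btrfs_iget(), so failure arrives either as an ERR_PTR or as a bad inode, and both are folded into the NULL that callers already test for. The contract, condensed into a standalone sketch that mirrors the hunk above without adding behavior:

	static struct inode *lookup_or_null(struct btrfs_root *root, u64 objectid)
	{
		struct btrfs_key key;
		struct inode *inode;

		key.objectid = objectid;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;
		inode = btrfs_iget(root->fs_info->sb, &key, root);
		if (IS_ERR(inode))
			return NULL;		/* callers only check for NULL */
		if (is_bad_inode(inode)) {
			iput(inode);
			return NULL;
		}
		return inode;
	}
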
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
541 539
542 if (found_type == BTRFS_FILE_EXTENT_REG || 540 if (found_type == BTRFS_FILE_EXTENT_REG ||
543 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 541 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
542 u64 offset;
544 unsigned long dest_offset; 543 unsigned long dest_offset;
545 struct btrfs_key ins; 544 struct btrfs_key ins;
546 545
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
555 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 554 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
556 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 555 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
557 ins.type = BTRFS_EXTENT_ITEM_KEY; 556 ins.type = BTRFS_EXTENT_ITEM_KEY;
557 offset = key->offset - btrfs_file_extent_offset(eb, item);
558 558
559 if (ins.objectid > 0) { 559 if (ins.objectid > 0) {
560 u64 csum_start; 560 u64 csum_start;
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
569 if (ret == 0) { 569 if (ret == 0) {
570 ret = btrfs_inc_extent_ref(trans, root, 570 ret = btrfs_inc_extent_ref(trans, root,
571 ins.objectid, ins.offset, 571 ins.objectid, ins.offset,
572 path->nodes[0]->start, 572 0, root->root_key.objectid,
573 root->root_key.objectid, 573 key->objectid, offset);
574 trans->transid, key->objectid);
575 } else { 574 } else {
576 /* 575 /*
577 * insert the extent pointer in the extent 576 * insert the extent pointer in the extent
578 * allocation tree 577 * allocation tree
579 */ 578 */
580 ret = btrfs_alloc_logged_extent(trans, root, 579 ret = btrfs_alloc_logged_file_extent(trans,
581 path->nodes[0]->start, 580 root, root->root_key.objectid,
582 root->root_key.objectid, 581 key->objectid, offset, &ins);
583 trans->transid, key->objectid,
584 &ins);
585 BUG_ON(ret); 582 BUG_ON(ret);
586 } 583 }
587 btrfs_release_path(root, path); 584 btrfs_release_path(root, path);
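
Editor's note: the new offset variable records the logical file position at which the disk extent begins: the item's key offset minus how far into the extent the item points. That pair (owner inode, file position) is what the reworked back references key on, replacing the old (block, generation) pair. A worked micro-example under those definitions: an extent item at file offset 1 MiB whose btrfs_file_extent_offset is 256 KiB gives

	offset = 1048576 - 262144 = 786432;	/* 768 KiB */

so both the ref-increment and the logged-extent insert above describe the extent by where it logically starts in the file.
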
@@ -800,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
800 return -ENOENT; 797 return -ENOENT;
801 798
802 inode = read_one_inode(root, key->objectid); 799 inode = read_one_inode(root, key->objectid);
803 BUG_ON(!dir); 800 BUG_ON(!inode);
804 801
805 ref_ptr = btrfs_item_ptr_offset(eb, slot); 802 ref_ptr = btrfs_item_ptr_offset(eb, slot);
806 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1706 btrfs_wait_tree_block_writeback(next); 1703 btrfs_wait_tree_block_writeback(next);
1707 btrfs_tree_unlock(next); 1704 btrfs_tree_unlock(next);
1708 1705
1709 ret = btrfs_drop_leaf_ref(trans, root, next);
1710 BUG_ON(ret);
1711
1712 WARN_ON(root_owner != 1706 WARN_ON(root_owner !=
1713 BTRFS_TREE_LOG_OBJECTID); 1707 BTRFS_TREE_LOG_OBJECTID);
1714 ret = btrfs_free_reserved_extent(root, 1708 ret = btrfs_free_reserved_extent(root,
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1753 btrfs_wait_tree_block_writeback(next); 1747 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1748 btrfs_tree_unlock(next);
1755 1749
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1750 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1751 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret); 1752 BUG_ON(ret);
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1811 btrfs_wait_tree_block_writeback(next); 1801 btrfs_wait_tree_block_writeback(next);
1812 btrfs_tree_unlock(next); 1802 btrfs_tree_unlock(next);
1813 1803
1814 if (*level == 0) {
1815 ret = btrfs_drop_leaf_ref(trans, root,
1816 next);
1817 BUG_ON(ret);
1818 }
1819
1820 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1804 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1821 ret = btrfs_free_reserved_extent(root, 1805 ret = btrfs_free_reserved_extent(root,
1822 path->nodes[*level]->start, 1806 path->nodes[*level]->start,
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1884 btrfs_wait_tree_block_writeback(next); 1868 btrfs_wait_tree_block_writeback(next);
1885 btrfs_tree_unlock(next); 1869 btrfs_tree_unlock(next);
1886 1870
1887 if (orig_level == 0) {
1888 ret = btrfs_drop_leaf_ref(trans, log,
1889 next);
1890 BUG_ON(ret);
1891 }
1892 WARN_ON(log->root_key.objectid != 1871 WARN_ON(log->root_key.objectid !=
1893 BTRFS_TREE_LOG_OBJECTID); 1872 BTRFS_TREE_LOG_OBJECTID);
1894 ret = btrfs_free_reserved_extent(log, next->start, 1873 ret = btrfs_free_reserved_extent(log, next->start,
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
2028 BUG_ON(ret); 2007 BUG_ON(ret);
2029 2008
2030 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2009 btrfs_set_root_node(&log->root_item, log->node);
2031 btrfs_set_root_generation(&log->root_item, trans->transid);
2032 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2033 2010
2034 root->log_batch = 0; 2011 root->log_batch = 0;
2035 root->log_transid++; 2012 root->log_transid++;
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2581 ins_keys, ins_sizes, nr); 2558 ins_keys, ins_sizes, nr);
2582 BUG_ON(ret); 2559 BUG_ON(ret);
2583 2560
2584 for (i = 0; i < nr; i++) { 2561 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2585 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2562 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2586 dst_path->slots[0]); 2563 dst_path->slots[0]);
2587 2564
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2617 found_type = btrfs_file_extent_type(src, extent); 2594 found_type = btrfs_file_extent_type(src, extent);
2618 if (found_type == BTRFS_FILE_EXTENT_REG || 2595 if (found_type == BTRFS_FILE_EXTENT_REG ||
2619 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2596 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2620 u64 ds = btrfs_file_extent_disk_bytenr(src, 2597 u64 ds, dl, cs, cl;
2621 extent); 2598 ds = btrfs_file_extent_disk_bytenr(src,
2622 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2599 extent);
2623 extent); 2600 /* ds == 0 is a hole */
2624 u64 cs = btrfs_file_extent_offset(src, extent); 2601 if (ds == 0)
2625 u64 cl = btrfs_file_extent_num_bytes(src, 2602 continue;
2626 extent); 2603
2604 dl = btrfs_file_extent_disk_num_bytes(src,
2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);
2627 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2628 extent)) { 2610 extent)) {
2629 cs = 0; 2611 cs = 0;
2630 cl = dl; 2612 cl = dl;
2631 } 2613 }
2632 /* ds == 0 is a hole */ 2614
2633 if (ds != 0) { 2615 ret = btrfs_lookup_csums_range(
2634 ret = btrfs_inc_extent_ref(trans, log, 2616 log->fs_info->csum_root,
2635 ds, dl, 2617 ds + cs, ds + cs + cl - 1,
2636 dst_path->nodes[0]->start, 2618 &ordered_sums);
2637 BTRFS_TREE_LOG_OBJECTID, 2619 BUG_ON(ret);
2638 trans->transid,
2639 ins_keys[i].objectid);
2640 BUG_ON(ret);
2641 ret = btrfs_lookup_csums_range(
2642 log->fs_info->csum_root,
2643 ds + cs, ds + cs + cl - 1,
2644 &ordered_sums);
2645 BUG_ON(ret);
2646 }
2647 } 2620 }
2648 } 2621 }
2649 dst_path->slots[0]++;
2650 } 2622 }
2651 2623
2652 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2624 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
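
Editor's note: two coupled changes in this hunk: dst_path->slots[0]++ moved from the loop tail into the for header, which is what makes the new early continue on holes (ds == 0) safe: the slot still advances even when the csum lookup is skipped. The bug class being avoided, in miniature (is_hole() and use_slot() are stand-ins):

	/* buggy shape: 'continue' would skip a tail increment */
	static void walk_buggy(int nr, int (*is_hole)(int), void (*use_slot)(int))
	{
		int i, slot = 0;

		for (i = 0; i < nr; i++) {
			if (is_hole(i))
				continue;	/* slot never advances here */
			use_slot(slot);
			slot++;
		}
	}

	/* fixed shape, as in the hunk: the header increment always runs */
	static void walk_fixed(int nr, int (*is_hole)(int), void (*use_slot)(int))
	{
		int i, slot;

		for (i = 0, slot = 0; i < nr; i++, slot++) {
			if (is_hole(i))
				continue;
			use_slot(slot);
		}
	}
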
@@ -3029,9 +3001,7 @@ again:
3029 BUG_ON(!wc.replay_dest); 3001 BUG_ON(!wc.replay_dest);
3030 3002
3031 wc.replay_dest->log_root = log; 3003 wc.replay_dest->log_root = log;
3032 mutex_lock(&fs_info->trans_mutex); 3004 btrfs_record_root_in_trans(trans, wc.replay_dest);
3033 btrfs_record_root_in_trans(wc.replay_dest);
3034 mutex_unlock(&fs_info->trans_mutex);
3035 ret = walk_log_tree(trans, log, &wc); 3005 ret = walk_log_tree(trans, log, &wc);
3036 BUG_ON(ret); 3006 BUG_ON(ret);
3037 3007
@@ -3049,6 +3019,7 @@ again:
3049 key.offset = found_key.offset - 1; 3019 key.offset = found_key.offset - 1;
3050 wc.replay_dest->log_root = NULL; 3020 wc.replay_dest->log_root = NULL;
3051 free_extent_buffer(log->node); 3021 free_extent_buffer(log->node);
3022 free_extent_buffer(log->commit_root);
3052 kfree(log); 3023 kfree(log);
3053 3024
3054 if (found_key.offset == 0) 3025 if (found_key.offset == 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d35b0054ca..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -161,8 +161,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
161 int again = 0; 161 int again = 0;
162 unsigned long num_run; 162 unsigned long num_run;
163 unsigned long num_sync_run; 163 unsigned long num_sync_run;
164 unsigned long batch_run = 0;
164 unsigned long limit; 165 unsigned long limit;
165 unsigned long last_waited = 0; 166 unsigned long last_waited = 0;
167 int force_reg = 0;
166 168
167 bdi = blk_get_backing_dev_info(device->bdev); 169 bdi = blk_get_backing_dev_info(device->bdev);
168 fs_info = device->dev_root->fs_info; 170 fs_info = device->dev_root->fs_info;
@@ -176,19 +178,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
176 178
177loop: 179loop:
178 spin_lock(&device->io_lock); 180 spin_lock(&device->io_lock);
179 num_run = 0;
180 181
181loop_lock: 182loop_lock:
183 num_run = 0;
182 184
183 /* take all the bios off the list at once and process them 185 /* take all the bios off the list at once and process them
184 * later on (without the lock held). But, remember the 186 * later on (without the lock held). But, remember the
185 * tail and other pointers so the bios can be properly reinserted 187 * tail and other pointers so the bios can be properly reinserted
186 * into the list if we hit congestion 188 * into the list if we hit congestion
187 */ 189 */
188 if (device->pending_sync_bios.head) 190 if (!force_reg && device->pending_sync_bios.head) {
189 pending_bios = &device->pending_sync_bios; 191 pending_bios = &device->pending_sync_bios;
190 else 192 force_reg = 1;
193 } else {
191 pending_bios = &device->pending_bios; 194 pending_bios = &device->pending_bios;
195 force_reg = 0;
196 }
192 197
193 pending = pending_bios->head; 198 pending = pending_bios->head;
194 tail = pending_bios->tail; 199 tail = pending_bios->tail;
@@ -228,10 +233,14 @@ loop_lock:
228 while (pending) { 233 while (pending) {
229 234
230 rmb(); 235 rmb();
231 if (pending_bios != &device->pending_sync_bios && 236 /* we want to work on both lists, but do more bios on the
232 device->pending_sync_bios.head && 237 * sync list than the regular list
233 num_run > 16) { 238 */
234 cond_resched(); 239 if ((num_run > 32 &&
240 pending_bios != &device->pending_sync_bios &&
241 device->pending_sync_bios.head) ||
242 (num_run > 64 && pending_bios == &device->pending_sync_bios &&
243 device->pending_bios.head)) {
235 spin_lock(&device->io_lock); 244 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail); 245 requeue_list(pending_bios, pending, tail);
237 goto loop_lock; 246 goto loop_lock;
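
Editor's note: together, force_reg and batch_run implement a small two-queue fairness policy: service the sync queue first, but after a burst, requeue and force a pass over the other queue, with the sync queue allowed roughly twice the burst length. The decision predicate, distilled under those assumptions (thresholds mirror the hunk above):

	/* illustrative distillation of the requeue test */
	static int should_requeue(int on_sync_queue, unsigned long num_run,
				  int other_queue_has_work)
	{
		if (!other_queue_has_work)
			return 0;
		/* regular queue yields after 32 bios, sync queue after 64 */
		return on_sync_queue ? num_run > 64 : num_run > 32;
	}
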
@@ -249,6 +258,8 @@ loop_lock:
249 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
250 submit_bio(cur->bi_rw, cur); 259 submit_bio(cur->bi_rw, cur);
251 num_run++; 260 num_run++;
261 batch_run++;
262
252 if (bio_sync(cur)) 263 if (bio_sync(cur))
253 num_sync_run++; 264 num_sync_run++;
254 265
@@ -265,7 +276,7 @@ loop_lock:
265 * is now congested. Back off and let other work structs 276 * is now congested. Back off and let other work structs
266 * run instead 277 * run instead
267 */ 278 */
268 if (pending && bdi_write_congested(bdi) && num_run > 16 && 279 if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
269 fs_info->fs_devices->open_devices > 1) { 280 fs_info->fs_devices->open_devices > 1) {
270 struct io_context *ioc; 281 struct io_context *ioc;
271 282
@@ -366,6 +377,7 @@ static noinline int device_list_add(const char *path,
366 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 377 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
367 fs_devices->latest_devid = devid; 378 fs_devices->latest_devid = devid;
368 fs_devices->latest_trans = found_transid; 379 fs_devices->latest_trans = found_transid;
380 mutex_init(&fs_devices->device_list_mutex);
369 device = NULL; 381 device = NULL;
370 } else { 382 } else {
371 device = __find_device(&fs_devices->devices, devid, 383 device = __find_device(&fs_devices->devices, devid,
@@ -392,7 +404,11 @@ static noinline int device_list_add(const char *path,
392 return -ENOMEM; 404 return -ENOMEM;
393 } 405 }
394 INIT_LIST_HEAD(&device->dev_alloc_list); 406 INIT_LIST_HEAD(&device->dev_alloc_list);
407
408 mutex_lock(&fs_devices->device_list_mutex);
395 list_add(&device->dev_list, &fs_devices->devices); 409 list_add(&device->dev_list, &fs_devices->devices);
410 mutex_unlock(&fs_devices->device_list_mutex);
411
396 device->fs_devices = fs_devices; 412 device->fs_devices = fs_devices;
397 fs_devices->num_devices++; 413 fs_devices->num_devices++;
398 } 414 }
@@ -418,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
418 INIT_LIST_HEAD(&fs_devices->devices); 434 INIT_LIST_HEAD(&fs_devices->devices);
419 INIT_LIST_HEAD(&fs_devices->alloc_list); 435 INIT_LIST_HEAD(&fs_devices->alloc_list);
420 INIT_LIST_HEAD(&fs_devices->list); 436 INIT_LIST_HEAD(&fs_devices->list);
437 mutex_init(&fs_devices->device_list_mutex);
421 fs_devices->latest_devid = orig->latest_devid; 438 fs_devices->latest_devid = orig->latest_devid;
422 fs_devices->latest_trans = orig->latest_trans; 439 fs_devices->latest_trans = orig->latest_trans;
423 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 440 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
424 441
442 mutex_lock(&orig->device_list_mutex);
425 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 443 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
426 device = kzalloc(sizeof(*device), GFP_NOFS); 444 device = kzalloc(sizeof(*device), GFP_NOFS);
427 if (!device) 445 if (!device)
@@ -443,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
443 device->fs_devices = fs_devices; 461 device->fs_devices = fs_devices;
444 fs_devices->num_devices++; 462 fs_devices->num_devices++;
445 } 463 }
464 mutex_unlock(&orig->device_list_mutex);
446 return fs_devices; 465 return fs_devices;
447error: 466error:
467 mutex_unlock(&orig->device_list_mutex);
448 free_fs_devices(fs_devices); 468 free_fs_devices(fs_devices);
449 return ERR_PTR(-ENOMEM); 469 return ERR_PTR(-ENOMEM);
450} 470}
@@ -455,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
455 475
456 mutex_lock(&uuid_mutex); 476 mutex_lock(&uuid_mutex);
457again: 477again:
478 mutex_lock(&fs_devices->device_list_mutex);
458 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 479 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
459 if (device->in_fs_metadata) 480 if (device->in_fs_metadata)
460 continue; 481 continue;
@@ -474,6 +495,7 @@ again:
474 kfree(device->name); 495 kfree(device->name);
475 kfree(device); 496 kfree(device);
476 } 497 }
498 mutex_unlock(&fs_devices->device_list_mutex);
477 499
478 if (fs_devices->seed) { 500 if (fs_devices->seed) {
479 fs_devices = fs_devices->seed; 501 fs_devices = fs_devices->seed;
@@ -594,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
594 device->in_fs_metadata = 0; 616 device->in_fs_metadata = 0;
595 device->mode = flags; 617 device->mode = flags;
596 618
619 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
620 fs_devices->rotating = 1;
621
597 fs_devices->open_devices++; 622 fs_devices->open_devices++;
598 if (device->writeable) { 623 if (device->writeable) {
599 fs_devices->rw_devices++; 624 fs_devices->rw_devices++;
@@ -696,7 +721,8 @@ error:
696 */ 721 */
697static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
698 struct btrfs_device *device, 723 struct btrfs_device *device,
699 u64 num_bytes, u64 *start) 724 u64 num_bytes, u64 *start,
725 u64 *max_avail)
700{ 726{
701 struct btrfs_key key; 727 struct btrfs_key key;
702 struct btrfs_root *root = device->dev_root; 728 struct btrfs_root *root = device->dev_root;
@@ -733,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
733 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
734 if (ret < 0) 760 if (ret < 0)
735 goto error; 761 goto error;
736 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret > 0) {
737 if (ret < 0) 763 ret = btrfs_previous_item(root, path, key.objectid, key.type);
738 goto error; 764 if (ret < 0)
765 goto error;
766 if (ret > 0)
767 start_found = 1;
768 }
739 l = path->nodes[0]; 769 l = path->nodes[0];
740 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
741 while (1) { 771 while (1) {
@@ -778,6 +808,10 @@ no_more_items:
778 if (last_byte < search_start) 808 if (last_byte < search_start)
779 last_byte = search_start; 809 last_byte = search_start;
780 hole_size = key.offset - last_byte; 810 hole_size = key.offset - last_byte;
811
812 if (hole_size > *max_avail)
813 *max_avail = hole_size;
814
781 if (key.offset > last_byte && 815 if (key.offset > last_byte &&
782 hole_size >= num_bytes) { 816 hole_size >= num_bytes) {
783 *start = last_byte; 817 *start = last_byte;
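
Editor's note: find_free_dev_extent() now also reports, via *max_avail, the largest hole it saw even when no hole satisfies the request, so the chunk allocator can retry with a smaller target instead of guessing. The bookkeeping, reduced to a self-contained sketch (the array-of-holes form is illustrative):

	#include <linux/errno.h>
	#include <linux/types.h>

	static int scan_holes(const u64 *hole, int n, u64 need,
			      int *pick, u64 *max_avail)
	{
		int i;

		*max_avail = 0;
		for (i = 0; i < n; i++) {
			if (hole[i] > *max_avail)
				*max_avail = hole[i];	/* remember best hole */
			if (hole[i] >= need) {
				*pick = i;
				return 0;
			}
		}
		return -ENOSPC;	/* caller may retry with a smaller 'need' */
	}
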
@@ -1121,12 +1155,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1121 1155
1122 device = NULL; 1156 device = NULL;
1123 devices = &root->fs_info->fs_devices->devices; 1157 devices = &root->fs_info->fs_devices->devices;
1158 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1124 list_for_each_entry(tmp, devices, dev_list) { 1159 list_for_each_entry(tmp, devices, dev_list) {
1125 if (tmp->in_fs_metadata && !tmp->bdev) { 1160 if (tmp->in_fs_metadata && !tmp->bdev) {
1126 device = tmp; 1161 device = tmp;
1127 break; 1162 break;
1128 } 1163 }
1129 } 1164 }
1165 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1130 bdev = NULL; 1166 bdev = NULL;
1131 bh = NULL; 1167 bh = NULL;
1132 disk_super = NULL; 1168 disk_super = NULL;
@@ -1181,7 +1217,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1181 goto error_brelse; 1217 goto error_brelse;
1182 1218
1183 device->in_fs_metadata = 0; 1219 device->in_fs_metadata = 0;
1220
1221 /*
1222 * the device list mutex makes sure that we don't change
1223 * the device list while someone else is writing out all
1224 * the device supers.
1225 */
1226 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1184 list_del_init(&device->dev_list); 1227 list_del_init(&device->dev_list);
1228 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1229
1185 device->fs_devices->num_devices--; 1230 device->fs_devices->num_devices--;
1186 1231
1187 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1232 next_device = list_entry(root->fs_info->fs_devices->devices.next,
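
Editor's note: the comment above states the invariant: device_list_mutex serializes list membership changes against the super-block writer that walks the same list. A hedged sketch of the walker side of that contract (the function name is hypothetical; the real writer lives in disk-io.c):

	static void write_supers_sketch(struct btrfs_fs_devices *fs_devices)
	{
		struct btrfs_device *dev;

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(dev, &fs_devices->devices, dev_list) {
			if (!dev->bdev || !dev->in_fs_metadata)
				continue;	/* skip missing devices */
			/* ... write this device's superblock copies ... */
		}
		mutex_unlock(&fs_devices->device_list_mutex);
	}
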
@@ -1275,6 +1320,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1275 seed_devices->opened = 1; 1320 seed_devices->opened = 1;
1276 INIT_LIST_HEAD(&seed_devices->devices); 1321 INIT_LIST_HEAD(&seed_devices->devices);
1277 INIT_LIST_HEAD(&seed_devices->alloc_list); 1322 INIT_LIST_HEAD(&seed_devices->alloc_list);
1323 mutex_init(&seed_devices->device_list_mutex);
1278 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1324 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1279 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1325 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1280 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1326 list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1400,6 +1446,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1400 mutex_lock(&root->fs_info->volume_mutex); 1446 mutex_lock(&root->fs_info->volume_mutex);
1401 1447
1402 devices = &root->fs_info->fs_devices->devices; 1448 devices = &root->fs_info->fs_devices->devices;
1449 /*
1450 * we have the volume lock, so we don't need the extra
1451 * device list mutex while reading the list here.
1452 */
1403 list_for_each_entry(device, devices, dev_list) { 1453 list_for_each_entry(device, devices, dev_list) {
1404 if (device->bdev == bdev) { 1454 if (device->bdev == bdev) {
1405 ret = -EEXIST; 1455 ret = -EEXIST;
@@ -1454,6 +1504,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1454 } 1504 }
1455 1505
1456 device->fs_devices = root->fs_info->fs_devices; 1506 device->fs_devices = root->fs_info->fs_devices;
1507
1508 /*
1509 * we don't want write_supers to jump in here with our device
1510 * half setup
1511 */
1512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1457 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1513 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1458 list_add(&device->dev_alloc_list, 1514 list_add(&device->dev_alloc_list,
1459 &root->fs_info->fs_devices->alloc_list); 1515 &root->fs_info->fs_devices->alloc_list);
@@ -1462,6 +1518,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1462 root->fs_info->fs_devices->rw_devices++; 1518 root->fs_info->fs_devices->rw_devices++;
1463 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1519 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1464 1520
1521 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1522 root->fs_info->fs_devices->rotating = 1;
1523
1465 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1524 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1466 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1525 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1467 total_bytes + device->total_bytes); 1526 total_bytes + device->total_bytes);
@@ -1469,6 +1528,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1469 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1528 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1470 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1529 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1471 total_bytes + 1); 1530 total_bytes + 1);
1531 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1472 1532
1473 if (seeding_dev) { 1533 if (seeding_dev) {
1474 ret = init_first_rw_device(trans, root, device); 1534 ret = init_first_rw_device(trans, root, device);
@@ -1570,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1570 device->fs_devices->total_rw_bytes += diff; 1630 device->fs_devices->total_rw_bytes += diff;
1571 1631
1572 device->total_bytes = new_size; 1632 device->total_bytes = new_size;
1633 device->disk_total_bytes = new_size;
1573 btrfs_clear_space_info_full(device->dev_root->fs_info); 1634 btrfs_clear_space_info_full(device->dev_root->fs_info);
1574 1635
1575 return btrfs_update_device(trans, device); 1636 return btrfs_update_device(trans, device);
@@ -1671,8 +1732,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1671 int ret; 1732 int ret;
1672 int i; 1733 int i;
1673 1734
1674 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1675 (unsigned long long)chunk_offset);
1676 root = root->fs_info->chunk_root; 1735 root = root->fs_info->chunk_root;
1677 extent_root = root->fs_info->extent_root; 1736 extent_root = root->fs_info->extent_root;
1678 em_tree = &root->fs_info->mapping_tree.map_tree; 1737 em_tree = &root->fs_info->mapping_tree.map_tree;
@@ -1958,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1958 goto done; 2017 goto done;
1959 if (ret) { 2018 if (ret) {
1960 ret = 0; 2019 ret = 0;
1961 goto done; 2020 break;
1962 } 2021 }
1963 2022
1964 l = path->nodes[0]; 2023 l = path->nodes[0];
@@ -1966,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1966 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1967 2026
1968 if (key.objectid != device->devid) 2027 if (key.objectid != device->devid)
1969 goto done; 2028 break;
1970 2029
1971 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1972 length = btrfs_dev_extent_length(l, dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent);
@@ -2122,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2122 max_chunk_size); 2181 max_chunk_size);
2123 2182
2124again: 2183again:
2184 max_avail = 0;
2125 if (!map || map->num_stripes != num_stripes) { 2185 if (!map || map->num_stripes != num_stripes) {
2126 kfree(map); 2186 kfree(map);
2127 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2170,7 +2230,8 @@ again:
2170 2230
2171 if (device->in_fs_metadata && avail >= min_free) { 2231 if (device->in_fs_metadata && avail >= min_free) {
2172 ret = find_free_dev_extent(trans, device, 2232 ret = find_free_dev_extent(trans, device,
2173 min_free, &dev_offset); 2233 min_free, &dev_offset,
2234 &max_avail);
2174 if (ret == 0) { 2235 if (ret == 0) {
2175 list_move_tail(&device->dev_alloc_list, 2236 list_move_tail(&device->dev_alloc_list,
2176 &private_devs); 2237 &private_devs);
@@ -2746,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2746 } 2807 }
2747 } 2808 }
2748 2809
2749 for (i = 0; i > nr; i++) {
2750 struct btrfs_multi_bio *multi;
2751 struct btrfs_bio_stripe *stripe;
2752 int ret;
2753
2754 length = 1;
2755 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2756 &length, &multi, 0);
2757 BUG_ON(ret);
2758
2759 stripe = multi->stripes;
2760 for (j = 0; j < multi->num_stripes; j++) {
2761 if (stripe->physical >= physical &&
2762 physical < stripe->physical + length)
2763 break;
2764 }
2765 BUG_ON(j >= multi->num_stripes);
2766 kfree(multi);
2767 }
2768
2769 *logical = buf; 2810 *logical = buf;
2770 *naddrs = nr; 2811 *naddrs = nr;
2771 *stripe_len = map->stripe_len; 2812 *stripe_len = map->stripe_len;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5c3ff6d02fd7..5139a833f721 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
96 u64 rw_devices; 96 u64 rw_devices;
97 u64 total_rw_bytes; 97 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 98 struct block_device *latest_bdev;
99 /* all of the devices in the FS */ 99
100 /* all of the devices in the FS, protected by a mutex
101 * so we can safely walk it to write out the supers without
102 * worrying about add/remove by the multi-device code
103 */
104 struct mutex device_list_mutex;
100 struct list_head devices; 105 struct list_head devices;
101 106
102 /* devices not currently being allocated */ 107 /* devices not currently being allocated */
@@ -107,6 +112,11 @@ struct btrfs_fs_devices {
107 int seeding; 112 int seeding;
108 113
109 int opened; 114 int opened;
115
116 /* set when we find or add a device that doesn't have the
117 * nonrot flag set
118 */
119 int rotating;
110}; 120};
111 121
112struct btrfs_bio_stripe { 122struct btrfs_bio_stripe {
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
208 *total_in = 0; 208 *total_in = 0;
209 209
210 workspace = find_zlib_workspace(); 210 workspace = find_zlib_workspace();
211 if (!workspace) 211 if (IS_ERR(workspace))
212 return -1; 212 return -1;
213 213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
366 char *kaddr; 366 char *kaddr;
367 367
368 workspace = find_zlib_workspace(); 368 workspace = find_zlib_workspace();
369 if (!workspace) 369 if (IS_ERR(workspace))
370 return -ENOMEM; 370 return -ENOMEM;
371 371
372 data_in = kmap(pages_in[page_in_index]); 372 data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
547 return -ENOMEM; 547 return -ENOMEM;
548 548
549 workspace = find_zlib_workspace(); 549 workspace = find_zlib_workspace();
550 if (!workspace) 550 if (IS_ERR(workspace))
551 return -ENOMEM; 551 return -ENOMEM;
552 552
553 workspace->inf_strm.next_in = data_in; 553 workspace->inf_strm.next_in = data_in;
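
Editor's note: all three zlib call sites had the same latent bug: find_zlib_workspace() signals failure with ERR_PTR(), for which a NULL test is always false. The general idiom the fix restores, as a standalone sketch (the allocator here is hypothetical):

	#include <linux/err.h>
	#include <linux/slab.h>

	struct ws { int scratch; };

	static struct ws *get_ws(void)
	{
		struct ws *w = kzalloc(sizeof(*w), GFP_NOFS);

		if (!w)
			return ERR_PTR(-ENOMEM);	/* not NULL! */
		return w;
	}

	static int use_ws(void)
	{
		struct ws *w = get_ws();

		if (IS_ERR(w))		/* a NULL check would miss this */
			return PTR_ERR(w);
		/* ... use w ... */
		kfree(w);
		return 0;
	}
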
diff --git a/fs/buffer.c b/fs/buffer.c
index 49106127a4aa..28f320fac4d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
1085__getblk_slow(struct block_device *bdev, sector_t block, int size) 1085__getblk_slow(struct block_device *bdev, sector_t block, int size)
1086{ 1086{
1087 /* Size must be multiple of hard sectorsize */ 1087 /* Size must be multiple of hard sectorsize */
1088 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1088 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1089 (size < 512 || size > PAGE_SIZE))) { 1089 (size < 512 || size > PAGE_SIZE))) {
1090 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1090 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1091 size); 1091 size);
1092 printk(KERN_ERR "hardsect size: %d\n", 1092 printk(KERN_ERR "logical block size: %d\n",
1093 bdev_hardsect_size(bdev)); 1093 bdev_logical_block_size(bdev));
1094 1094
1095 dump_stack(); 1095 dump_stack();
1096 return NULL; 1096 return NULL;
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
1165 1165
1166 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1167 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1168 if (!TestSetPageDirty(page)) 1168 if (!TestSetPageDirty(page)) {
1169 __set_page_dirty(page, page_mapping(page), 0); 1169 struct address_space *mapping = page_mapping(page);
1170 if (mapping)
1171 __set_page_dirty(page, mapping, 0);
1172 }
1170 } 1173 }
1171} 1174}
1172 1175
@@ -2935,6 +2938,8 @@ int submit_bh(int rw, struct buffer_head * bh)
2935 BUG_ON(!buffer_locked(bh)); 2938 BUG_ON(!buffer_locked(bh));
2936 BUG_ON(!buffer_mapped(bh)); 2939 BUG_ON(!buffer_mapped(bh));
2937 BUG_ON(!bh->b_end_io); 2940 BUG_ON(!bh->b_end_io);
2941 BUG_ON(buffer_delay(bh));
2942 BUG_ON(buffer_unwritten(bh));
2938 2943
2939 /* 2944 /*
2940 * Mask in barrier bit for a write (could be either a WRITE or a 2945 * Mask in barrier bit for a write (could be either a WRITE or a
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1e962348d111..431accd475a7 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
354 /* make sure all pages pinned by operations on behalf of the netfs are 354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */ 355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred); 356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb); 357 down_read(&cache->mnt->mnt_sb->s_umount);
358 ret = sync_filesystem(cache->mnt->mnt_sb);
359 up_read(&cache->mnt->mnt_sb->s_umount);
358 cachefiles_end_secure(cache, saved_cred); 360 cachefiles_end_secure(cache, saved_cred);
359 361
360 if (ret == -EIO) 362 if (ret == -EIO)
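
Editor's note: fsync_super() is gone; its replacement, sync_filesystem(), expects the caller to hold the superblock's s_umount, which is why the call is now bracketed by down_read/up_read. The locking contract, condensed (the helper name is illustrative):

	static int sync_sb(struct super_block *sb)
	{
		int ret;

		/* sync_filesystem() must run with s_umount held */
		down_read(&sb->s_umount);
		ret = sync_filesystem(sb);
		up_read(&sb->s_umount);
		return ret;
	}
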
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 38f71222a552..a173551e19d7 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -13,7 +13,6 @@
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/smp_lock.h>
17#include <linux/seq_file.h> 16#include <linux/seq_file.h>
18 17
19#include <linux/kobject.h> 18#include <linux/kobject.h>
@@ -375,7 +374,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
375 p = inode->i_cdev; 374 p = inode->i_cdev;
376 if (!p) { 375 if (!p) {
377 inode->i_cdev = p = new; 376 inode->i_cdev = p = new;
378 inode->i_cindex = idx;
379 list_add(&inode->i_devices, &p->list); 377 list_add(&inode->i_devices, &p->list);
380 new = NULL; 378 new = NULL;
381 } else if (!cdev_get(p)) 379 } else if (!cdev_get(p))
@@ -405,6 +403,18 @@ static int chrdev_open(struct inode *inode, struct file *filp)
405 return ret; 403 return ret;
406} 404}
407 405
406int cdev_index(struct inode *inode)
407{
408 int idx;
409 struct kobject *kobj;
410
411 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
412 if (!kobj)
413 return -1;
414 kobject_put(kobj);
415 return idx;
416}
417
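
The new cdev_index() replaces the i_cindex caching removed from chrdev_open() above: kobj_lookup() walks the cdev map for inode->i_rdev and returns the inode's offset into the matching device's minor range, or -1 if nothing is registered. A hedged sketch of a hypothetical caller (kernel context, not standalone):

/* hypothetical caller: recover the 0-based index of a character
 * device inode within its cdev's minor range; assumes the inode
 * really is a char-device node */
static int example_get_index(struct inode *inode)
{
        int idx = cdev_index(inode);

        if (idx < 0)
                return -ENODEV;   /* no cdev registered for i_rdev */
        return idx;
}
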
408void cd_forget(struct inode *inode) 418void cd_forget(struct inode *inode)
409{ 419{
410 spin_lock(&cdev_lock); 420 spin_lock(&cdev_lock);
@@ -557,6 +567,7 @@ EXPORT_SYMBOL(cdev_init);
557EXPORT_SYMBOL(cdev_alloc); 567EXPORT_SYMBOL(cdev_alloc);
558EXPORT_SYMBOL(cdev_del); 568EXPORT_SYMBOL(cdev_del);
559EXPORT_SYMBOL(cdev_add); 569EXPORT_SYMBOL(cdev_add);
570EXPORT_SYMBOL(cdev_index);
560EXPORT_SYMBOL(register_chrdev); 571EXPORT_SYMBOL(register_chrdev);
561EXPORT_SYMBOL(unregister_chrdev); 572EXPORT_SYMBOL(unregister_chrdev);
562EXPORT_SYMBOL(directly_mappable_cdev_bdi); 573EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c4069c220..e85b1e4389e0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,23 @@
1Version 1.60
2-------------
3Fix memory leak in reconnect. Fix oops in DFS mount error path.
4Set s_maxbytes to a smaller value (the max that the VFS can handle) so
5that sendfile works over cifs mounts again. Add noforcegid
6and noforceuid mount parameters.
7
8Version 1.59
9------------
10Client uses server inode numbers (which are persistent) rather than
11client-generated ones by default (mount option "serverino" turned
12on by default if the server supports it). Add forceuid and forcegid
13mount options (so that, when unix extensions are negotiated, specifying
14which uid mounted does not immediately force the server's reported
15uids to be overridden). Add support for the scope mount parameter. Improve
16hard link detection to use the same inode for both links. Do not set
17read-only dos attribute on directories (for chmod) since Windows
18explorer special cases this attribute bit for directories for
19a different purpose.
20
1Version 1.58 21Version 1.58
2------------ 22------------
3Guard against buffer overruns in various UCS-2 to UTF-8 string conversions 23Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +30,8 @@ we converted from). Fix endianness of the vcnum field used during
10session setup to distinguish multiple mounts to same server from different 30session setup to distinguish multiple mounts to same server from different
11userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental 31userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
12flag to be set to 2, and mount must enable krb5 to turn on extended security). 32flag to be set to 2, and mount must enable krb5 to turn on extended security).
33Performance of file create to Samba improved (posix create on lookup
34removes 1 of 2 network requests sent on file create).
13 35
14Version 1.57 36Version 1.57
15------------ 37------------
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb9899..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,10 +262,11 @@ A partial list of the supported mount options follows:
262 mount. 262 mount.
263 domain Set the SMB/CIFS workgroup name prepended to the 263 domain Set the SMB/CIFS workgroup name prepended to the
264 username during CIFS session establishment 264 username during CIFS session establishment
265 uid Set the default uid for inodes. For mounts to servers 265 forceuid Set the default uid for inodes to the uid
266 passed in on mount. For mounts to servers
266 which do support the CIFS Unix extensions, such as a 267 which do support the CIFS Unix extensions, such as a
267 properly configured Samba server, the server provides 268 properly configured Samba server, the server provides
268 the uid, gid and mode so this parameter should not be 269 the uid, gid and mode so this parameter should not be
269 specified unless the server and client uid and gid 270 specified unless the server and client uid and gid
270 numbering differ. If the server and client are in the 271 numbering differ. If the server and client are in the
271 same domain (e.g. running winbind or nss_ldap) and 272 same domain (e.g. running winbind or nss_ldap) and
@@ -277,11 +278,7 @@ A partial list of the supported mount options follows:
277 of existing files will be the uid (gid) of the person 278 of existing files will be the uid (gid) of the person
278 who executed the mount (root, except when mount.cifs 279 who executed the mount (root, except when mount.cifs
279 is configured setuid for user mounts) unless the "uid=" 280 is configured setuid for user mounts) unless the "uid="
280 (gid) mount option is specified. For the uid (gid) of newly 281 (gid) mount option is specified. Also note that permission
281 created files and directories, ie files created since
282 the last mount of the server share, the expected uid
283 (gid) is cached as long as the inode remains in
284 memory on the client. Also note that permission
285 checks (authorization checks) on accesses to a file occur 282 checks (authorization checks) on accesses to a file occur
286 at the server, but there are cases in which an administrator 283 at the server, but there are cases in which an administrator
287 may want to restrict at the client as well. For those 284 may want to restrict at the client as well. For those
@@ -289,9 +286,18 @@ A partial list of the supported mount options follows:
289 (such as Windows), permissions can also be checked at the 286 (such as Windows), permissions can also be checked at the
290 client, and a crude form of client side permission checking 287 client, and a crude form of client side permission checking
291 can be enabled by specifying file_mode and dir_mode on 288 can be enabled by specifying file_mode and dir_mode on
292 the client. Note that the mount.cifs helper must be 289 the client. (default)
293 at version 1.10 or higher to support specifying the uid 290 forcegid (similar to above but for the groupid instead of uid) (default)
294 (or gid) in non-numeric form. 291 noforceuid Fill in file owner information (uid) by requesting it from
292 the server if possible. With this option, the value given in
293 the uid= option (on mount) will only be used if the server
294 cannot support returning uids on inodes.
295 noforcegid (similar to above but for the group owner, gid, instead of uid)
296 uid Set the default uid for inodes, and indicate to the
297 cifs kernel driver which local user mounted. If the server
298 supports the unix extensions, the default uid is
299 not used to fill in the owner fields of inodes (files)
300 unless the "forceuid" parameter is specified.
295 gid Set the default gid for inodes (similar to above). 301 gid Set the default gid for inodes (similar to above).
296 file_mode If CIFS Unix extensions are not supported by the server 302 file_mode If CIFS Unix extensions are not supported by the server
297 this overrides the default mode for file inodes. 303 this overrides the default mode for file inodes.
@@ -388,8 +394,13 @@ A partial list of the supported mount options follows:
388 or the CIFS Unix Extensions equivalent and for those 394 or the CIFS Unix Extensions equivalent and for those
389 this mount option will have no effect. Exporting cifs mounts 395 this mount option will have no effect. Exporting cifs mounts
390 under nfsd requires this mount option on the cifs mount. 396 under nfsd requires this mount option on the cifs mount.
397 This is now the default if the server supports the
398 required network operation.
391 noserverino Client generates inode numbers (rather than using the actual one 399 noserverino Client generates inode numbers (rather than using the actual one
392 from the server) by default. 400 from the server). These inode numbers will vary after
401 unmount or reboot, which can confuse some applications;
402 however, not all server filesystems support unique inode
403 numbers.
393 setuids If the CIFS Unix extensions are negotiated with the server 404 setuids If the CIFS Unix extensions are negotiated with the server
394 the client will attempt to set the effective uid and gid of 405 the client will attempt to set the effective uid and gid of
395 the local process on newly created files, directories, and 406 the local process on newly created files, directories, and
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 1b09f1670061..20692fbfdb24 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -49,6 +49,7 @@
49#define ASN1_OJI 6 /* Object Identifier */ 49#define ASN1_OJI 6 /* Object Identifier */
50#define ASN1_OJD 7 /* Object Description */ 50#define ASN1_OJD 7 /* Object Description */
51#define ASN1_EXT 8 /* External */ 51#define ASN1_EXT 8 /* External */
52#define ASN1_ENUM 10 /* Enumerated */
52#define ASN1_SEQ 16 /* Sequence */ 53#define ASN1_SEQ 16 /* Sequence */
53#define ASN1_SET 17 /* Set */ 54#define ASN1_SET 17 /* Set */
54#define ASN1_NUMSTR 18 /* Numerical String */ 55#define ASN1_NUMSTR 18 /* Numerical String */
@@ -78,10 +79,12 @@
78#define SPNEGO_OID_LEN 7 79#define SPNEGO_OID_LEN 7
79#define NTLMSSP_OID_LEN 10 80#define NTLMSSP_OID_LEN 10
80#define KRB5_OID_LEN 7 81#define KRB5_OID_LEN 7
82#define KRB5U2U_OID_LEN 8
81#define MSKRB5_OID_LEN 7 83#define MSKRB5_OID_LEN 7
82static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; 84static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
83static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; 85static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
84static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; 86static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
87static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
85static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; 88static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
86 89
87/* 90/*
@@ -122,6 +125,28 @@ asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
122 return 1; 125 return 1;
123} 126}
124 127
128#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */
129static unsigned char
130asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
131{
132 unsigned char ch;
133
134 if (ctx->pointer >= ctx->end) {
135 ctx->error = ASN1_ERR_DEC_EMPTY;
136 return 0;
137 }
138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else
143 return 0;
144
145 ctx->pointer++;
146 return 1;
147}
148#endif
149
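
The disabled helper above assumes the minimal DER encoding of an ASN.1 ENUMERATED value: one tag octet (0x0a), one length octet, then the value itself. A standalone illustration of that byte layout, using a made-up blob:

#include <stdio.h>

int main(void)
{
        /* DER ENUMERATED: tag 0x0a, length 0x01, one value octet */
        unsigned char blob[] = { 0x0a, 0x01, 0x03 };

        if (blob[0] == 0x0a && blob[1] == 1)
                printf("enum value = %u\n", blob[2]);
        return 0;
}
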
125static unsigned char 150static unsigned char
126asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) 151asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
127{ 152{
@@ -476,10 +501,9 @@ decode_negTokenInit(unsigned char *security_blob, int length,
476 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
477 bool use_ntlmssp = false; 502 bool use_ntlmssp = false;
478 bool use_kerberos = false; 503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
479 bool use_mskerberos = false; 505 bool use_mskerberos = false;
480 506
481 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
482
483 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
484 508
485 asn1_open(&ctx, security_blob, length); 509 asn1_open(&ctx, security_blob, length);
@@ -515,6 +539,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
515 return 0; 539 return 0;
516 } 540 }
517 541
542 /* SPNEGO */
518 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
519 cFYI(1, ("Error decoding negTokenInit")); 544 cFYI(1, ("Error decoding negTokenInit"));
520 return 0; 545 return 0;
@@ -526,6 +551,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
526 return 0; 551 return 0;
527 } 552 }
528 553
554 /* negTokenInit */
529 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
530 cFYI(1, ("Error decoding negTokenInit")); 556 cFYI(1, ("Error decoding negTokenInit"));
531 return 0; 557 return 0;
@@ -537,6 +563,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
537 return 0; 563 return 0;
538 } 564 }
539 565
566 /* sequence */
540 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
541 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 568 cFYI(1, ("Error decoding 2nd part of negTokenInit"));
542 return 0; 569 return 0;
@@ -548,6 +575,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
548 return 0; 575 return 0;
549 } 576 }
550 577
578 /* sequence of */
551 if (asn1_header_decode 579 if (asn1_header_decode
552 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
553 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 581 cFYI(1, ("Error decoding 2nd part of negTokenInit"));
@@ -560,6 +588,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
560 return 0; 588 return 0;
561 } 589 }
562 590
591 /* list of security mechanisms */
563 while (!asn1_eoc_decode(&ctx, sequence_end)) { 592 while (!asn1_eoc_decode(&ctx, sequence_end)) {
564 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
565 if (!rc) { 594 if (!rc) {
@@ -576,11 +605,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
576 605
577 if (compare_oid(oid, oidlen, MSKRB5_OID, 606 if (compare_oid(oid, oidlen, MSKRB5_OID,
578 MSKRB5_OID_LEN) && 607 MSKRB5_OID_LEN) &&
579 !use_kerberos) 608 !use_mskerberos)
580 use_mskerberos = true; 609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) &&
612 !use_kerberosu2u)
613 use_kerberosu2u = true;
581 else if (compare_oid(oid, oidlen, KRB5_OID, 614 else if (compare_oid(oid, oidlen, KRB5_OID,
582 KRB5_OID_LEN) && 615 KRB5_OID_LEN) &&
583 !use_mskerberos) 616 !use_kerberos)
584 use_kerberos = true; 617 use_kerberos = true;
585 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 618 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
586 NTLMSSP_OID_LEN)) 619 NTLMSSP_OID_LEN))
@@ -593,7 +626,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,
593 } 626 }
594 } 627 }
595 628
629 /* mechlistMIC */
596 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 630 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
631 /* Check if we have reached the end of the blob, but with
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit;
597 cFYI(1, ("Error decoding last part negTokenInit exit3")); 635 cFYI(1, ("Error decoding last part negTokenInit exit3"));
598 return 0; 636 return 0;
599 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
@@ -602,6 +640,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
602 cls, con, tag, end, *end)); 640 cls, con, tag, end, *end));
603 return 0; 641 return 0;
604 } 642 }
643
644 /* sequence */
605 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
606 cFYI(1, ("Error decoding last part negTokenInit exit5")); 646 cFYI(1, ("Error decoding last part negTokenInit exit5"));
607 return 0; 647 return 0;
@@ -611,6 +651,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
611 cls, con, tag, end, *end)); 651 cls, con, tag, end, *end));
612 } 652 }
613 653
654 /* sequence of */
614 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
615 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 656 cFYI(1, ("Error decoding last part negTokenInit exit 7"));
616 return 0; 657 return 0;
@@ -619,6 +660,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
619 cls, con, tag, end, *end)); 660 cls, con, tag, end, *end));
620 return 0; 661 return 0;
621 } 662 }
663
664 /* general string */
622 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
623 cFYI(1, ("Error decoding last part negTokenInit exit9")); 666 cFYI(1, ("Error decoding last part negTokenInit exit9"));
624 return 0; 667 return 0;
@@ -630,13 +673,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
630 } 673 }
631 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 674 cFYI(1, ("Need to call asn1_octets_decode() function for %s",
632 ctx.pointer)); /* is this UTF-8 or ASCII? */ 675 ctx.pointer)); /* is this UTF-8 or ASCII? */
633 676decode_negtoken_exit:
634 if (use_kerberos) 677 if (use_kerberos)
635 *secType = Kerberos; 678 *secType = Kerberos;
636 else if (use_mskerberos) 679 else if (use_mskerberos)
637 *secType = MSKerberos; 680 *secType = MSKerberos;
638 else if (use_ntlmssp) 681 else if (use_ntlmssp)
639 *secType = NTLMSSP; 682 *secType = RawNTLMSSP;
640 683
641 return 1; 684 return 1;
642} 685}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7f19fefd3d45..42cec2a7c0cf 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
261 atomic_set(&tcon->num_reads, 0); 261 atomic_set(&tcon->num_reads, 0);
262 atomic_set(&tcon->num_oplock_brks, 0); 262 atomic_set(&tcon->num_oplock_brks, 0);
263 atomic_set(&tcon->num_opens, 0); 263 atomic_set(&tcon->num_opens, 0);
264 atomic_set(&tcon->num_posixopens, 0);
265 atomic_set(&tcon->num_posixmkdirs, 0);
264 atomic_set(&tcon->num_closes, 0); 266 atomic_set(&tcon->num_closes, 0);
265 atomic_set(&tcon->num_deletes, 0); 267 atomic_set(&tcon->num_deletes, 0);
266 atomic_set(&tcon->num_mkdirs, 0); 268 atomic_set(&tcon->num_mkdirs, 0);
@@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
347 atomic_read(&tcon->num_locks), 349 atomic_read(&tcon->num_locks),
348 atomic_read(&tcon->num_hardlinks), 350 atomic_read(&tcon->num_hardlinks),
349 atomic_read(&tcon->num_symlinks)); 351 atomic_read(&tcon->num_symlinks));
350 seq_printf(m, "\nOpens: %d Closes: %d" 352 seq_printf(m, "\nOpens: %d Closes: %d "
351 "Deletes: %d", 353 "Deletes: %d",
352 atomic_read(&tcon->num_opens), 354 atomic_read(&tcon->num_opens),
353 atomic_read(&tcon->num_closes), 355 atomic_read(&tcon->num_closes),
354 atomic_read(&tcon->num_deletes)); 356 atomic_read(&tcon->num_deletes));
357 seq_printf(m, "\nPosix Opens: %d "
358 "Posix Mkdirs: %d",
359 atomic_read(&tcon->num_posixopens),
360 atomic_read(&tcon->num_posixmkdirs));
355 seq_printf(m, "\nMkdirs: %d Rmdirs: %d", 361 seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
356 atomic_read(&tcon->num_mkdirs), 362 atomic_read(&tcon->num_mkdirs),
357 atomic_read(&tcon->num_rmdirs)); 363 atomic_read(&tcon->num_rmdirs));
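
The added trailing space in "Closes: %d " fixes run-together output: C concatenates adjacent string literals with nothing in between, so the old pair of literals printed "Closes: NDeletes: M". A two-line demonstration:

#include <stdio.h>

int main(void)
{
        printf("Opens: %d Closes: %d"
               "Deletes: %d\n", 1, 2, 3);    /* Closes: 2Deletes: 3 */
        printf("Opens: %d Closes: %d "
               "Deletes: %d\n", 1, 2, 3);    /* Closes: 2 Deletes: 3 */
        return 0;
}
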
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83d62759c7c7..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
55 * i.e. strips from UNC trailing path that is not part of share 55 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the beginning of DFS node referral 56 * name and fixup missing '\' in the beginning of DFS node referral
57 * if necessary. 57 * if necessary.
58 * Returns pointer to share name on success or NULL on error. 58 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 59 * Caller is responsible for freeing returned string.
60 */ 60 */
61static char *cifs_get_share_name(const char *node_name) 61static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, 68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
69 GFP_KERNEL); 69 GFP_KERNEL);
70 if (!UNC) 70 if (!UNC)
71 return NULL; 71 return ERR_PTR(-ENOMEM);
72 72
73 /* get share name and server name */ 73 /* get share name and server name */
74 if (node_name[1] != '\\') { 74 if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
87 cERROR(1, ("%s: no server name end in node name: %s", 87 cERROR(1, ("%s: no server name end in node name: %s",
88 __func__, node_name)); 88 __func__, node_name));
89 kfree(UNC); 89 kfree(UNC);
90 return NULL; 90 return ERR_PTR(-EINVAL);
91 } 91 }
92 92
93 /* find sharename end */ 93 /* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
133 return ERR_PTR(-EINVAL); 133 return ERR_PTR(-EINVAL);
134 134
135 *devname = cifs_get_share_name(ref->node_name); 135 *devname = cifs_get_share_name(ref->node_name);
136 if (IS_ERR(*devname)) {
137 rc = PTR_ERR(*devname);
138 *devname = NULL;
139 goto compose_mount_options_err;
140 }
141
136 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
137 if (rc != 0) { 143 if (rc != 0) {
138 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
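
cifs_get_share_name() now reports failure with the kernel's ERR_PTR convention: a negative errno is encoded in the pointer value itself, so the caller above can distinguish -ENOMEM from -EINVAL via IS_ERR()/PTR_ERR(). A simplified, self-contained restatement of the idiom (the real macros live in include/linux/err.h):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095  /* errnos fit in the top page of the address space */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static const void *get_share_name(const char *node_name)
{
        if (!node_name)
                return ERR_PTR(-EINVAL);  /* bad input, not out of memory */
        return node_name;
}

int main(void)
{
        const void *p = get_share_name(NULL);

        if (IS_ERR(p))
                printf("failed: %ld\n", PTR_ERR(p));  /* failed: -22 */
        return 0;
}
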
@@ -275,7 +281,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
275 case -EBUSY: 281 case -EBUSY:
276 /* someone else made a mount here whilst we were busy */ 282 /* someone else made a mount here whilst we were busy */
277 while (d_mountpoint(nd->path.dentry) && 283 while (d_mountpoint(nd->path.dentry) &&
278 follow_down(&nd->path.mnt, &nd->path.dentry)) 284 follow_down(&nd->path))
279 ; 285 ;
280 err = 0; 286 err = 0;
281 default: 287 default:
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a40d2e..051caecf7d67 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <keys/user-type.h> 24#include <keys/user-type.h>
25#include <linux/key-type.h> 25#include <linux/key-type.h>
26#include <linux/inet.h>
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifs_spnego.h" 28#include "cifs_spnego.h"
28#include "cifs_debug.h" 29#include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
73 * strlen(";sec=ntlmsspi") */ 74 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13 75#define MAX_MECH_STR_LEN 13
75 76
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
77#define MAX_IPV6_ADDR_LEN 43
78
79/* strlen of "host=" */ 77/* strlen of "host=" */
80#define HOST_KEY_LEN 5 78#define HOST_KEY_LEN 5
81 79
@@ -88,6 +86,9 @@ struct key_type cifs_spnego_key_type = {
88/* strlen of ";user=" */ 86/* strlen of ";user=" */
89#define USER_KEY_LEN 6 87#define USER_KEY_LEN 6
90 88
89/* strlen of ";pid=0x" */
90#define PID_KEY_LEN 7
91
91/* get a key struct with a SPNEGO security blob, suitable for session setup */ 92/* get a key struct with a SPNEGO security blob, suitable for session setup */
92struct key * 93struct key *
93cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 94cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -102,10 +103,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
102 host=hostname sec=mechanism uid=0xFF user=username */ 103 host=hostname sec=mechanism uid=0xFF user=username */
103 desc_len = MAX_VER_STR_LEN + 104 desc_len = MAX_VER_STR_LEN +
104 HOST_KEY_LEN + strlen(hostname) + 105 HOST_KEY_LEN + strlen(hostname) +
105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN + 106 IP_KEY_LEN + INET6_ADDRSTRLEN +
106 MAX_MECH_STR_LEN + 107 MAX_MECH_STR_LEN +
107 UID_KEY_LEN + (sizeof(uid_t) * 2) + 108 UID_KEY_LEN + (sizeof(uid_t) * 2) +
108 USER_KEY_LEN + strlen(sesInfo->userName) + 1; 109 USER_KEY_LEN + strlen(sesInfo->userName) +
110 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
109 111
110 spnego_key = ERR_PTR(-ENOMEM); 112 spnego_key = ERR_PTR(-ENOMEM);
111 description = kzalloc(desc_len, GFP_KERNEL); 113 description = kzalloc(desc_len, GFP_KERNEL);
@@ -143,6 +145,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
143 dp = description + strlen(description); 145 dp = description + strlen(description);
144 sprintf(dp, ";user=%s", sesInfo->userName); 146 sprintf(dp, ";user=%s", sesInfo->userName);
145 147
148 dp = description + strlen(description);
149 sprintf(dp, ";pid=0x%x", current->pid);
150
146 cFYI(1, ("key description = %s", description)); 151 cFYI(1, ("key description = %s", description));
147 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 152 spnego_key = request_key(&cifs_spnego_key_type, description, "");
148 153
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int maxwords = maxbytes / 2; 44 int maxwords = maxbytes / 2;
45 char tmp[NLS_MAX_CHARSET_SIZE]; 45 char tmp[NLS_MAX_CHARSET_SIZE];
46 46
47 for (i = 0; from[i] && i < maxwords; i++) { 47 for (i = 0; i < maxwords && from[i]; i++) {
48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
49 NLS_MAX_CHARSET_SIZE); 49 NLS_MAX_CHARSET_SIZE);
50 if (charlen > 0) 50 if (charlen > 0)
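
The reordered loop condition in cifs_ucs2_bytes() closes an out-of-bounds read: && evaluates left to right and short-circuits, so the bounds check i < maxwords must come before the dereference from[i]. The corrected pattern in isolation:

#include <stdio.h>

/* count UCS-2 code units up to a NUL, never reading past maxwords
 * even when the buffer carries no terminator at all */
static size_t ucs2_len(const unsigned short *from, size_t maxwords)
{
        size_t i;

        for (i = 0; i < maxwords && from[i]; i++)
                ;
        return i;
}

int main(void)
{
        unsigned short buf[3] = { 'a', 'b', 'c' };  /* unterminated */

        printf("%zu\n", ucs2_len(buf, 3));          /* 3, and no overrun */
        return 0;
}
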
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc83c26f..6941c22398a6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -327,7 +327,7 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
327 327
328static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, 328static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
329 struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, 329 struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
330 struct inode *inode) 330 struct cifs_fattr *fattr)
331{ 331{
332 int i; 332 int i;
333 int num_aces = 0; 333 int num_aces = 0;
@@ -340,7 +340,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
340 if (!pdacl) { 340 if (!pdacl) {
341 /* no DACL in the security descriptor, set 341 /* no DACL in the security descriptor, set
342 all the permissions for user/group/other */ 342 all the permissions for user/group/other */
343 inode->i_mode |= S_IRWXUGO; 343 fattr->cf_mode |= S_IRWXUGO;
344 return; 344 return;
345 } 345 }
346 346
@@ -357,7 +357,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
357 /* reset rwx permissions for user/group/other. 357 /* reset rwx permissions for user/group/other.
358 Also, if num_aces is 0 i.e. DACL has no ACEs, 358 Also, if num_aces is 0 i.e. DACL has no ACEs,
359 user/group/other have no permissions */ 359 user/group/other have no permissions */
360 inode->i_mode &= ~(S_IRWXUGO); 360 fattr->cf_mode &= ~(S_IRWXUGO);
361 361
362 acl_base = (char *)pdacl; 362 acl_base = (char *)pdacl;
363 acl_size = sizeof(struct cifs_acl); 363 acl_size = sizeof(struct cifs_acl);
@@ -379,17 +379,17 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
379 if (compare_sids(&(ppace[i]->sid), pownersid)) 379 if (compare_sids(&(ppace[i]->sid), pownersid))
380 access_flags_to_mode(ppace[i]->access_req, 380 access_flags_to_mode(ppace[i]->access_req,
381 ppace[i]->type, 381 ppace[i]->type,
382 &(inode->i_mode), 382 &fattr->cf_mode,
383 &user_mask); 383 &user_mask);
384 if (compare_sids(&(ppace[i]->sid), pgrpsid)) 384 if (compare_sids(&(ppace[i]->sid), pgrpsid))
385 access_flags_to_mode(ppace[i]->access_req, 385 access_flags_to_mode(ppace[i]->access_req,
386 ppace[i]->type, 386 ppace[i]->type,
387 &(inode->i_mode), 387 &fattr->cf_mode,
388 &group_mask); 388 &group_mask);
389 if (compare_sids(&(ppace[i]->sid), &sid_everyone)) 389 if (compare_sids(&(ppace[i]->sid), &sid_everyone))
390 access_flags_to_mode(ppace[i]->access_req, 390 access_flags_to_mode(ppace[i]->access_req,
391 ppace[i]->type, 391 ppace[i]->type,
392 &(inode->i_mode), 392 &fattr->cf_mode,
393 &other_mask); 393 &other_mask);
394 394
395/* memcpy((void *)(&(cifscred->aces[i])), 395/* memcpy((void *)(&(cifscred->aces[i])),
@@ -464,7 +464,7 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
464 464
465/* Convert CIFS ACL to POSIX form */ 465/* Convert CIFS ACL to POSIX form */
466static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, 466static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
467 struct inode *inode) 467 struct cifs_fattr *fattr)
468{ 468{
469 int rc; 469 int rc;
470 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 470 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
@@ -472,7 +472,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
472 char *end_of_acl = ((char *)pntsd) + acl_len; 472 char *end_of_acl = ((char *)pntsd) + acl_len;
473 __u32 dacloffset; 473 __u32 dacloffset;
474 474
475 if ((inode == NULL) || (pntsd == NULL)) 475 if (pntsd == NULL)
476 return -EIO; 476 return -EIO;
477 477
478 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + 478 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
@@ -497,7 +497,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
497 497
498 if (dacloffset) 498 if (dacloffset)
499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
500 group_sid_ptr, inode); 500 group_sid_ptr, fattr);
501 else 501 else
502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */
503 503
@@ -508,7 +508,6 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
508 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, 508 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
509 sizeof(struct cifs_sid)); */ 509 sizeof(struct cifs_sid)); */
510 510
511
512 return 0; 511 return 0;
513} 512}
514 513
@@ -552,134 +551,143 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
552 return rc; 551 return rc;
553} 552}
554 553
555 554static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
556/* Retrieve an ACL from the server */ 555 __u16 fid, u32 *pacllen)
557static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
558 const char *path, const __u16 *pfid)
559{ 556{
560 struct cifsFileInfo *open_file = NULL;
561 bool unlock_file = false;
562 int xid;
563 int rc = -EIO;
564 __u16 fid;
565 struct super_block *sb;
566 struct cifs_sb_info *cifs_sb;
567 struct cifs_ntsd *pntsd = NULL; 557 struct cifs_ntsd *pntsd = NULL;
558 int xid, rc;
559
560 xid = GetXid();
561 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
562 FreeXid(xid);
563
568 564
569 cFYI(1, ("get mode from ACL for %s", path)); 565 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
566 return pntsd;
567}
570 568
571 if (inode == NULL) 569static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
572 return NULL; 570 const char *path, u32 *pacllen)
571{
572 struct cifs_ntsd *pntsd = NULL;
573 int oplock = 0;
574 int xid, rc;
575 __u16 fid;
573 576
574 xid = GetXid(); 577 xid = GetXid();
575 if (pfid == NULL)
576 open_file = find_readable_file(CIFS_I(inode));
577 else
578 fid = *pfid;
579 578
580 sb = inode->i_sb; 579 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 if (sb == NULL) { 580 &fid, &oplock, NULL, cifs_sb->local_nls,
582 FreeXid(xid); 581 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 return NULL; 582 if (rc) {
584 } 583 cERROR(1, ("Unable to open file to get ACL"));
585 cifs_sb = CIFS_SB(sb); 584 goto out;
586
587 if (open_file) {
588 unlock_file = true;
589 fid = open_file->netfid;
590 } else if (pfid == NULL) {
591 int oplock = 0;
592 /* open file */
593 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
594 READ_CONTROL, 0, &fid, &oplock, NULL,
595 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
596 CIFS_MOUNT_MAP_SPECIAL_CHR);
597 if (rc != 0) {
598 cERROR(1, ("Unable to open file to get ACL"));
599 FreeXid(xid);
600 return NULL;
601 }
602 } 585 }
603 586
604 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 587 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
605 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 588 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
606 if (unlock_file == true) /* find_readable_file increments ref count */
607 atomic_dec(&open_file->wrtPending);
608 else if (pfid == NULL) /* if opened above we have to close the handle */
609 CIFSSMBClose(xid, cifs_sb->tcon, fid);
610 /* else handle was passed in by caller */
611 589
590 CIFSSMBClose(xid, cifs_sb->tcon, fid);
591 out:
612 FreeXid(xid); 592 FreeXid(xid);
613 return pntsd; 593 return pntsd;
614} 594}
615 595
616/* Set an ACL on the server */ 596/* Retrieve an ACL from the server */
617static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, 597static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
618 struct inode *inode, const char *path) 598 struct inode *inode, const char *path,
599 u32 *pacllen)
619{ 600{
620 struct cifsFileInfo *open_file; 601 struct cifs_ntsd *pntsd = NULL;
621 bool unlock_file = false; 602 struct cifsFileInfo *open_file = NULL;
622 int xid;
623 int rc = -EIO;
624 __u16 fid;
625 struct super_block *sb;
626 struct cifs_sb_info *cifs_sb;
627 603
628 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 604 if (inode)
605 open_file = find_readable_file(CIFS_I(inode));
606 if (!open_file)
607 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
629 608
630 if (!inode) 609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
631 return rc; 610 atomic_dec(&open_file->wrtPending);
611 return pntsd;
612}
632 613
633 sb = inode->i_sb; 614static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
634 if (sb == NULL) 615 struct cifs_ntsd *pnntsd, u32 acllen)
635 return rc; 616{
617 int xid, rc;
636 618
637 cifs_sb = CIFS_SB(sb);
638 xid = GetXid(); 619 xid = GetXid();
620 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
621 FreeXid(xid);
639 622
640 open_file = find_readable_file(CIFS_I(inode)); 623 cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
641 if (open_file) { 624 return rc;
642 unlock_file = true; 625}
643 fid = open_file->netfid; 626
644 } else { 627static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
645 int oplock = 0; 628 struct cifs_ntsd *pnntsd, u32 acllen)
646 /* open file */ 629{
647 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, 630 int oplock = 0;
648 WRITE_DAC, 0, &fid, &oplock, NULL, 631 int xid, rc;
649 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 632 __u16 fid;
650 CIFS_MOUNT_MAP_SPECIAL_CHR); 633
651 if (rc != 0) { 634 xid = GetXid();
652 cERROR(1, ("Unable to open file to set ACL")); 635
653 FreeXid(xid); 636 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
654 return rc; 637 &fid, &oplock, NULL, cifs_sb->local_nls,
655 } 638 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
639 if (rc) {
640 cERROR(1, ("Unable to open file to set ACL"));
641 goto out;
656 } 642 }
657 643
658 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 644 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
659 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 645 cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
660 if (unlock_file)
661 atomic_dec(&open_file->wrtPending);
662 else
663 CIFSSMBClose(xid, cifs_sb->tcon, fid);
664 646
647 CIFSSMBClose(xid, cifs_sb->tcon, fid);
648 out:
665 FreeXid(xid); 649 FreeXid(xid);
650 return rc;
651}
652
653/* Set an ACL on the server */
654static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
655 struct inode *inode, const char *path)
656{
657 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
658 struct cifsFileInfo *open_file;
659 int rc;
660
661 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
666 662
663 open_file = find_readable_file(CIFS_I(inode));
664 if (!open_file)
665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
666
667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
668 atomic_dec(&open_file->wrtPending);
667 return rc; 669 return rc;
668} 670}
669 671
670/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 672void
671void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) 673void
674cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
675 struct inode *inode, const char *path, const __u16 *pfid)
672{ 676{
673 struct cifs_ntsd *pntsd = NULL; 677 struct cifs_ntsd *pntsd = NULL;
674 u32 acllen = 0; 678 u32 acllen = 0;
675 int rc = 0; 679 int rc = 0;
676 680
677 cFYI(DBG2, ("converting ACL to mode for %s", path)); 681 cFYI(DBG2, ("converting ACL to mode for %s", path));
678 pntsd = get_cifs_acl(&acllen, inode, path, pfid); 682
683 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
685 else
686 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
679 687
680 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 688 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
681 if (pntsd) 689 if (pntsd)
682 rc = parse_sec_desc(pntsd, acllen, inode); 690 rc = parse_sec_desc(pntsd, acllen, fattr);
683 if (rc) 691 if (rc)
684 cFYI(1, ("parse sec desc failed rc = %d", rc)); 692 cFYI(1, ("parse sec desc failed rc = %d", rc));
685 693
@@ -698,7 +706,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
698 cFYI(DBG2, ("set ACL from mode for %s", path)); 706 cFYI(DBG2, ("set ACL from mode for %s", path));
699 707
700 /* Get the security descriptor */ 708 /* Get the security descriptor */
701 pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); 709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
702 710
703 /* Add three ACEs for owner, group, everyone getting rid of 711 /* Add three ACEs for owner, group, everyone getting rid of
704 other ACEs as chmod disables ACEs and set the security descriptor */ 712 other ACEs as chmod disables ACEs and set the security descriptor */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d35804d73..84b75253b05a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
146#endif 146#endif
147 sb->s_blocksize = CIFS_MAX_MSGSIZE; 147 sb->s_blocksize = CIFS_MAX_MSGSIZE;
148 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 148 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
149 inode = cifs_iget(sb, ROOT_I); 149 inode = cifs_root_iget(sb, ROOT_I);
150 150
151 if (IS_ERR(inode)) { 151 if (IS_ERR(inode)) {
152 rc = PTR_ERR(inode); 152 rc = PTR_ERR(inode);
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb)
204 cFYI(1, ("Empty cifs superblock info passed to unmount")); 204 cFYI(1, ("Empty cifs superblock info passed to unmount"));
205 return; 205 return;
206 } 206 }
207
208 lock_kernel();
209
207 rc = cifs_umount(sb, cifs_sb); 210 rc = cifs_umount(sb, cifs_sb);
208 if (rc) 211 if (rc)
209 cERROR(1, ("cifs_umount failed with return code %d", rc)); 212 cERROR(1, ("cifs_umount failed with return code %d", rc));
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb)
216 219
217 unload_nls(cifs_sb->local_nls); 220 unload_nls(cifs_sb->local_nls);
218 kfree(cifs_sb); 221 kfree(cifs_sb);
219 return; 222
223 unlock_kernel();
220} 224}
221 225
222static int 226static int
@@ -304,7 +308,6 @@ cifs_alloc_inode(struct super_block *sb)
304 if (!cifs_inode) 308 if (!cifs_inode)
305 return NULL; 309 return NULL;
306 cifs_inode->cifsAttrs = 0x20; /* default */ 310 cifs_inode->cifsAttrs = 0x20; /* default */
307 atomic_set(&cifs_inode->inUse, 0);
308 cifs_inode->time = 0; 311 cifs_inode->time = 0;
309 cifs_inode->write_behind_rc = 0; 312 cifs_inode->write_behind_rc = 0;
310 /* Until the file is open and we have gotten oplock 313 /* Until the file is open and we have gotten oplock
@@ -329,6 +332,27 @@ cifs_destroy_inode(struct inode *inode)
329 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 332 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
330} 333}
331 334
335static void
336cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
337{
338 seq_printf(s, ",addr=");
339
340 switch (server->addr.sockAddr.sin_family) {
341 case AF_INET:
342 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr);
343 break;
344 case AF_INET6:
345 seq_printf(s, "%pI6",
346 &server->addr.sockAddr6.sin6_addr.s6_addr);
347 if (server->addr.sockAddr6.sin6_scope_id)
348 seq_printf(s, "%%%u",
349 server->addr.sockAddr6.sin6_scope_id);
350 break;
351 default:
352 seq_printf(s, "(unknown)");
353 }
354}
355
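
%pI4 and %pI6 are printk format extensions that render a binary IPv4/IPv6 address straight from a pointer, which is what keeps cifs_show_address() this compact. A rough userspace analogue built on inet_ntop() (hypothetical helper name):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

/* rough userspace analogue of the %pI4/%pI6 printk extensions */
static void show_addr(FILE *s, int family, const void *addr)
{
        char buf[INET6_ADDRSTRLEN];

        if (inet_ntop(family, addr, buf, sizeof(buf)))
                fprintf(s, ",addr=%s", buf);
        else
                fprintf(s, ",addr=(unknown)");
}

int main(void)
{
        struct in_addr a = { .s_addr = htonl(0x7f000001) };

        show_addr(stdout, AF_INET, &a);   /* ,addr=127.0.0.1 */
        putchar('\n');
        return 0;
}
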
332/* 356/*
333 * cifs_show_options() is for displaying mount options in /proc/mounts. 357 * cifs_show_options() is for displaying mount options in /proc/mounts.
334 * Not all settable options are displayed but most of the important 358 * Not all settable options are displayed but most of the important
@@ -339,83 +363,68 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
339{ 363{
340 struct cifs_sb_info *cifs_sb; 364 struct cifs_sb_info *cifs_sb;
341 struct cifsTconInfo *tcon; 365 struct cifsTconInfo *tcon;
342 struct TCP_Server_Info *server;
343 366
344 cifs_sb = CIFS_SB(m->mnt_sb); 367 cifs_sb = CIFS_SB(m->mnt_sb);
368 tcon = cifs_sb->tcon;
345 369
346 if (cifs_sb) { 370 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
347 tcon = cifs_sb->tcon; 371 if (tcon->ses->userName)
348 if (tcon) { 372 seq_printf(s, ",username=%s", tcon->ses->userName);
349 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); 373 if (tcon->ses->domainName)
350 if (tcon->ses) { 374 seq_printf(s, ",domain=%s", tcon->ses->domainName);
351 if (tcon->ses->userName) 375
352 seq_printf(s, ",username=%s", 376 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
353 tcon->ses->userName); 377 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
354 if (tcon->ses->domainName) 378 seq_printf(s, ",forceuid");
355 seq_printf(s, ",domain=%s", 379 else
356 tcon->ses->domainName); 380 seq_printf(s, ",noforceuid");
357 server = tcon->ses->server; 381
358 if (server) { 382 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
359 seq_printf(s, ",addr="); 383 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
360 switch (server->addr.sockAddr6. 384 seq_printf(s, ",forcegid");
361 sin6_family) { 385 else
362 case AF_INET6: 386 seq_printf(s, ",noforcegid");
363 seq_printf(s, "%pI6", 387
364 &server->addr.sockAddr6.sin6_addr); 388 cifs_show_address(s, tcon->ses->server);
365 break; 389
366 case AF_INET: 390 if (!tcon->unix_ext)
367 seq_printf(s, "%pI4", 391 seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
368 &server->addr.sockAddr.sin_addr.s_addr);
369 break;
370 }
371 }
372 }
373 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
374 !(tcon->unix_ext))
375 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
376 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
377 !(tcon->unix_ext))
378 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
379 if (!tcon->unix_ext) {
380 seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
381 cifs_sb->mnt_file_mode, 392 cifs_sb->mnt_file_mode,
382 cifs_sb->mnt_dir_mode); 393 cifs_sb->mnt_dir_mode);
383 } 394 if (tcon->seal)
384 if (tcon->seal) 395 seq_printf(s, ",seal");
385 seq_printf(s, ",seal"); 396 if (tcon->nocase)
386 if (tcon->nocase) 397 seq_printf(s, ",nocase");
387 seq_printf(s, ",nocase"); 398 if (tcon->retry)
388 if (tcon->retry) 399 seq_printf(s, ",hard");
389 seq_printf(s, ",hard"); 400 if (cifs_sb->prepath)
390 } 401 seq_printf(s, ",prepath=%s", cifs_sb->prepath);
391 if (cifs_sb->prepath) 402 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
392 seq_printf(s, ",prepath=%s", cifs_sb->prepath); 403 seq_printf(s, ",posixpaths");
393 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) 404 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
394 seq_printf(s, ",posixpaths"); 405 seq_printf(s, ",setuids");
395 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) 406 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
396 seq_printf(s, ",setuids"); 407 seq_printf(s, ",serverino");
397 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) 408 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
398 seq_printf(s, ",serverino"); 409 seq_printf(s, ",directio");
399 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
400 seq_printf(s, ",directio"); 411 seq_printf(s, ",nouser_xattr");
401 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
402 seq_printf(s, ",nouser_xattr"); 413 seq_printf(s, ",mapchars");
403 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
404 seq_printf(s, ",mapchars"); 415 seq_printf(s, ",sfu");
405 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 416 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
406 seq_printf(s, ",sfu"); 417 seq_printf(s, ",nobrl");
407 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 418 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
408 seq_printf(s, ",nobrl"); 419 seq_printf(s, ",cifsacl");
409 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 420 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
410 seq_printf(s, ",cifsacl"); 421 seq_printf(s, ",dynperm");
411 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) 422 if (m->mnt_sb->s_flags & MS_POSIXACL)
412 seq_printf(s, ",dynperm"); 423 seq_printf(s, ",acl");
413 if (m->mnt_sb->s_flags & MS_POSIXACL) 424
414 seq_printf(s, ",acl"); 425 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
415 426 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
416 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 427
417 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
418 }
419 return 0; 428 return 0;
420} 429}
421 430
@@ -531,9 +540,14 @@ static void cifs_umount_begin(struct super_block *sb)
531 if (tcon == NULL) 540 if (tcon == NULL)
532 return; 541 return;
533 542
534 lock_kernel();
535 read_lock(&cifs_tcp_ses_lock); 543 read_lock(&cifs_tcp_ses_lock);
536 if (tcon->tc_count == 1) 544 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
545 /* we have other mounts to same share or we have
546 already tried to force umount this and woken up
547 all waiting network requests, nothing to do */
548 read_unlock(&cifs_tcp_ses_lock);
549 return;
550 } else if (tcon->tc_count == 1)
537 tcon->tidStatus = CifsExiting; 551 tcon->tidStatus = CifsExiting;
538 read_unlock(&cifs_tcp_ses_lock); 552 read_unlock(&cifs_tcp_ses_lock);
539 553
@@ -548,9 +562,7 @@ static void cifs_umount_begin(struct super_block *sb)
548 wake_up_all(&tcon->ses->server->response_q); 562 wake_up_all(&tcon->ses->server->response_q);
549 msleep(1); 563 msleep(1);
550 } 564 }
551/* BB FIXME - finish add checks for tidStatus BB */
552 565
553 unlock_kernel();
554 return; 566 return;
555} 567}
556 568
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71cfdea9..6c170948300d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -24,6 +24,19 @@
24 24
25#define ROOT_I 2 25#define ROOT_I 2
26 26
27/*
28 * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
29 * so that it will fit.
30 */
31static inline ino_t
32cifs_uniqueid_to_ino_t(u64 fileid)
33{
34 ino_t ino = (ino_t) fileid;
35 if (sizeof(ino_t) < sizeof(u64))
36 ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
37 return ino;
38}
39
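
Rather than truncating, the helper folds the high half of the 64-bit server file id into the low half with XOR, so entropy in the upper 32 bits still perturbs the resulting ino_t. A standalone check of the folding, assuming a 32-bit ino_t:

#include <stdint.h>
#include <stdio.h>

/* same fold as cifs_uniqueid_to_ino_t when ino_t is 32 bits wide */
static uint32_t fold_fileid(uint64_t fileid)
{
        uint32_t ino = (uint32_t)fileid;

        ino ^= (uint32_t)(fileid >> 32);
        return ino;
}

int main(void)
{
        /* 0x9abcdef0 ^ 0x12345678 == 0x88888888 */
        printf("0x%x\n", fold_fileid(0x123456789abcdef0ULL));
        return 0;
}
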
27extern struct file_system_type cifs_fs_type; 40extern struct file_system_type cifs_fs_type;
28extern const struct address_space_operations cifs_addr_ops; 41extern const struct address_space_operations cifs_addr_ops;
29extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
@@ -36,7 +49,7 @@ extern void cifs_read_inode(struct inode *);
36 49
37/* Functions related to inodes */ 50/* Functions related to inodes */
38extern const struct inode_operations cifs_dir_inode_ops; 51extern const struct inode_operations cifs_dir_inode_ops;
39extern struct inode *cifs_iget(struct super_block *, unsigned long); 52extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
40extern int cifs_create(struct inode *, struct dentry *, int, 53extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 54 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 55extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -100,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 113extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 114#endif /* EXPERIMENTAL */
102 115
103#define CIFS_VERSION "1.58" 116#define CIFS_VERSION "1.60"
104#endif /* _CIFSFS_H */ 117#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a61ab772c6f6..6084d6379c03 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -83,7 +83,7 @@ enum securityEnum {
83 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 83 NTLM, /* Legacy NTLM012 auth with NTLM hash */
84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
86 NTLMSSP, /* NTLMSSP via SPNEGO, NTLMv2 hash */ 86/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
87 Kerberos, /* Kerberos via SPNEGO */ 87 Kerberos, /* Kerberos via SPNEGO */
88 MSKerberos, /* MS Kerberos via SPNEGO */ 88 MSKerberos, /* MS Kerberos via SPNEGO */
89}; 89};
@@ -260,6 +260,8 @@ struct cifsTconInfo {
260 atomic_t num_closes; 260 atomic_t num_closes;
261 atomic_t num_deletes; 261 atomic_t num_deletes;
262 atomic_t num_mkdirs; 262 atomic_t num_mkdirs;
263 atomic_t num_posixopens;
264 atomic_t num_posixmkdirs;
263 atomic_t num_rmdirs; 265 atomic_t num_rmdirs;
264 atomic_t num_renames; 266 atomic_t num_renames;
265 atomic_t num_t2renames; 267 atomic_t num_t2renames;
@@ -364,13 +366,13 @@ struct cifsInodeInfo {
364 struct list_head openFileList; 366 struct list_head openFileList;
365 int write_behind_rc; 367 int write_behind_rc;
366 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 368 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
367 atomic_t inUse; /* num concurrent users (local openers cifs) of file*/
368 unsigned long time; /* jiffies of last update/check of inode */ 369 unsigned long time; /* jiffies of last update/check of inode */
369 bool clientCanCacheRead:1; /* read oplock */ 370 bool clientCanCacheRead:1; /* read oplock */
370 bool clientCanCacheAll:1; /* read and writebehind oplock */ 371 bool clientCanCacheAll:1; /* read and writebehind oplock */
371 bool oplockPending:1; 372 bool oplockPending:1;
372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 373 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
373 u64 server_eof; /* current file size on server */ 374 u64 server_eof; /* current file size on server */
375 u64 uniqueid; /* server inode number */
374 struct inode vfs_inode; 376 struct inode vfs_inode;
375}; 377};
376 378
@@ -472,6 +474,32 @@ struct dfs_info3_param {
472 char *node_name; 474 char *node_name;
473}; 475};
474 476
477/*
478 * common struct for holding inode info when searching for or updating an
479 * inode with new info
480 */
481
482#define CIFS_FATTR_DFS_REFERRAL 0x1
483#define CIFS_FATTR_DELETE_PENDING 0x2
484#define CIFS_FATTR_NEED_REVAL 0x4
485
486struct cifs_fattr {
487 u32 cf_flags;
488 u32 cf_cifsattrs;
489 u64 cf_uniqueid;
490 u64 cf_eof;
491 u64 cf_bytes;
492 uid_t cf_uid;
493 gid_t cf_gid;
494 umode_t cf_mode;
495 dev_t cf_rdev;
496 unsigned int cf_nlink;
497 unsigned int cf_dtype;
498 struct timespec cf_atime;
499 struct timespec cf_mtime;
500 struct timespec cf_ctime;
501};
502
475static inline void free_dfs_info_param(struct dfs_info3_param *param) 503static inline void free_dfs_info_param(struct dfs_info3_param *param)
476{ 504{
477 if (param) { 505 if (param) {
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index a785f69dbc9f..2d07f890a842 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2328,19 +2328,7 @@ struct file_attrib_tag {
2328typedef struct { 2328typedef struct {
2329 __le32 NextEntryOffset; 2329 __le32 NextEntryOffset;
2330 __u32 ResumeKey; /* as with FileIndex - no need to convert */ 2330 __u32 ResumeKey; /* as with FileIndex - no need to convert */
2331 __le64 EndOfFile; 2331 FILE_UNIX_BASIC_INFO basic;
2332 __le64 NumOfBytes;
2333 __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */
2334 __le64 LastAccessTime;
2335 __le64 LastModificationTime;
2336 __le64 Uid;
2337 __le64 Gid;
2338 __le32 Type;
2339 __le64 DevMajor;
2340 __le64 DevMinor;
2341 __le64 UniqueId;
2342 __le64 Permissions;
2343 __le64 Nlinks;
2344 char FileName[1]; 2332 char FileName[1];
2345} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ 2333} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */
2346 2334
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae083930eee..da8fbf565991 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -74,7 +74,7 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr);
74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
75extern int decode_negTokenInit(unsigned char *security_blob, int length, 75extern int decode_negTokenInit(unsigned char *security_blob, int length,
76 enum securityEnum *secType); 76 enum securityEnum *secType);
77extern int cifs_inet_pton(const int, const char *source, void *dst); 77extern int cifs_convert_address(char *src, void *dst);
78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
79extern void header_assemble(struct smb_hdr *, char /* command */ , 79extern void header_assemble(struct smb_hdr *, char /* command */ ,
80 const struct cifsTconInfo *, int /* length of 80 const struct cifsTconInfo *, int /* length of
@@ -90,17 +90,21 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
 			struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
-extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
-extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
-extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+				      int offset);
 
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 		struct super_block *sb, int mode, int oflags,
 		int *poplock, __u16 *pnetfid, int xid);
-extern void posix_fill_in_inode(struct inode *tmp_inode,
-		FILE_UNIX_BASIC_INFO *pData, int isNewInode);
-extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
+extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
+				     FILE_UNIX_BASIC_INFO *info,
+				     struct cifs_sb_info *cifs_sb);
+extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern struct inode *cifs_iget(struct super_block *sb,
+			       struct cifs_fattr *fattr);
+
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
@@ -108,8 +112,9 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *path,
-			const __u16 *pfid);
+extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
+			      struct cifs_fattr *fattr, struct inode *inode,
+			      const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
@@ -215,7 +220,11 @@ struct cifs_unix_set_info_args {
 	dev_t	device;
 };
 
-extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon,
+extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+				  const struct cifs_unix_set_info_args *args,
+				  u16 fid, u32 pid_of_opener);
+
+extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon,
 			char *fileName,
 			const struct cifs_unix_set_info_args *args,
 			const struct nls_table *nls_codepage,
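One detail worth pausing on: cifs_NTtimeToUnix now takes the wire-format __le64 directly, moving the byte-order conversion into the helper so callers cannot forget it. The arithmetic itself is fixed by the formats: NT timestamps count 100 ns ticks from 1601-01-01, and the NT and Unix epochs differ by 11644473600 seconds. A standalone sketch of that conversion (host-order input assumed; not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

#define NT_UNIX_EPOCH_DELTA 11644473600ULL	/* seconds from 1601 to 1970 */

struct unix_ts { int64_t tv_sec; long tv_nsec; };

static struct unix_ts nt_time_to_unix(uint64_t nt /* 100 ns ticks */)
{
	struct unix_ts ts;

	ts.tv_sec = (int64_t)(nt / 10000000ULL) - NT_UNIX_EPOCH_DELTA;
	ts.tv_nsec = (long)(nt % 10000000ULL) * 100;
	return ts;
}

int main(void)
{
	/* NT timestamp for 2009-01-01 00:00:00 UTC */
	struct unix_ts ts = nt_time_to_unix(128752416000000000ULL);
	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);	/* 1230768000.000000000 */
	return 0;
}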
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d06260251c30..1866bc2927d4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		int val, seconds, remain, result;
 		struct timespec ts, utc;
 		utc = CURRENT_TIME;
-		ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
-				    le16_to_cpu(rsp->SrvTime.Time));
+		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+				    rsp->SrvTime.Time, 0);
 		cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
 			(int)ts.tv_sec, (int)utc.tv_sec,
 			(int)(utc.tv_sec - ts.tv_sec)));
@@ -594,7 +594,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	else if (secFlags & CIFSSEC_MAY_KRB5)
 		server->secType = Kerberos;
 	else if (secFlags & CIFSSEC_MAY_NTLMSSP)
-		server->secType = NTLMSSP;
+		server->secType = RawNTLMSSP;
 	else if (secFlags & CIFSSEC_MAY_LANMAN)
 		server->secType = LANMAN;
 /* #ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -729,7 +729,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	 * the tcon is no longer on the list, so no need to take lock before
 	 * checking this.
 	 */
-	if (tcon->need_reconnect)
+	if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
 		return 0;
 
 	rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
@@ -1113,7 +1113,10 @@ PsxCreat:
 psx_create_err:
 	cifs_buf_release(pSMB);
 
-	cifs_stats_inc(&tcon->num_mkdirs);
+	if (posix_flags & SMB_O_DIRECTORY)
+		cifs_stats_inc(&tcon->num_posixmkdirs);
+	else
+		cifs_stats_inc(&tcon->num_posixopens);
 
 	if (rc == -EAGAIN)
 		goto PsxCreat;
@@ -2427,8 +2430,7 @@ querySymLinkRetry:
 	params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max data count below from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
@@ -5075,10 +5077,114 @@ SetAttrLgcyRetry:
 }
 #endif /* temporarily unneeded SetAttr legacy function */
 
+static void
+cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
+			const struct cifs_unix_set_info_args *args)
+{
+	u64 mode = args->mode;
+
+	/*
+	 * Samba server ignores set of file size to zero due to bugs in some
+	 * older clients, but we should be precise - we use SetFileSize to
+	 * set file size and do not want to truncate file size to zero
+	 * accidently as happened on one Samba server beta by putting
+	 * zero instead of -1 here
+	 */
+	data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
+	data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
+	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
+	data_offset->LastAccessTime = cpu_to_le64(args->atime);
+	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
+	data_offset->Uid = cpu_to_le64(args->uid);
+	data_offset->Gid = cpu_to_le64(args->gid);
+	/* better to leave device as zero when it is */
+	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
+	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
+	data_offset->Permissions = cpu_to_le64(mode);
+
+	if (S_ISREG(mode))
+		data_offset->Type = cpu_to_le32(UNIX_FILE);
+	else if (S_ISDIR(mode))
+		data_offset->Type = cpu_to_le32(UNIX_DIR);
+	else if (S_ISLNK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
+	else if (S_ISCHR(mode))
+		data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
+	else if (S_ISBLK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
+	else if (S_ISFIFO(mode))
+		data_offset->Type = cpu_to_le32(UNIX_FIFO);
+	else if (S_ISSOCK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_SOCKET);
+}
+
 int
-CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
-		   const struct cifs_unix_set_info_args *args,
-		   const struct nls_table *nls_codepage, int remap)
+CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+		       const struct cifs_unix_set_info_args *args,
+		       u16 fid, u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB = NULL;
+	FILE_UNIX_BASIC_INFO *data_offset;
+	int rc = 0;
+	u16 params, param_offset, offset, byte_count, count;
+
+	cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (FILE_UNIX_BASIC_INFO *)
+				((char *)(&pSMB->hdr.Protocol) + offset);
+	count = sizeof(FILE_UNIX_BASIC_INFO);
+
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */ + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
+	pSMB->Reserved4 = 0;
+	pSMB->hdr.smb_buf_length += byte_count;
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	cifs_fill_unix_set_info(data_offset, args);
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	if (rc)
+		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+int
+CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+		       const struct cifs_unix_set_info_args *args,
+		       const struct nls_table *nls_codepage, int remap)
 {
 	TRANSACTION2_SPI_REQ *pSMB = NULL;
 	TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5087,7 +5193,6 @@ CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
 	int bytes_returned = 0;
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
-	__u64 mode = args->mode;
 
 	cFYI(1, ("In SetUID/GID/Mode"));
 setPermsRetry:
@@ -5138,38 +5243,8 @@ setPermsRetry:
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
 	pSMB->Reserved4 = 0;
 	pSMB->hdr.smb_buf_length += byte_count;
-	/* Samba server ignores set of file size to zero due to bugs in some
-	older clients, but we should be precise - we use SetFileSize to
-	set file size and do not want to truncate file size to zero
-	accidently as happened on one Samba server beta by putting
-	zero instead of -1 here */
-	data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
-	data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
-	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
-	data_offset->LastAccessTime = cpu_to_le64(args->atime);
-	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
-	data_offset->Uid = cpu_to_le64(args->uid);
-	data_offset->Gid = cpu_to_le64(args->gid);
-	/* better to leave device as zero when it is */
-	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
-	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
-	data_offset->Permissions = cpu_to_le64(mode);
-
-	if (S_ISREG(mode))
-		data_offset->Type = cpu_to_le32(UNIX_FILE);
-	else if (S_ISDIR(mode))
-		data_offset->Type = cpu_to_le32(UNIX_DIR);
-	else if (S_ISLNK(mode))
-		data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
-	else if (S_ISCHR(mode))
-		data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
-	else if (S_ISBLK(mode))
-		data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
-	else if (S_ISFIFO(mode))
-		data_offset->Type = cpu_to_le32(UNIX_FIFO);
-	else if (S_ISSOCK(mode))
-		data_offset->Type = cpu_to_le32(UNIX_SOCKET);
 
+	cifs_fill_unix_set_info(data_offset, args);
 
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
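Stepping back from this file's first hunk: cnvrtDosUnixTm now receives the raw little-endian SrvTime words plus a timezone offset in seconds. DOS date/time packs day/month/year and second/minute/hour into bit fields, so the decode is pure shifting and masking. A standalone sketch of that layout (timegm() is a glibc/BSD extension, assumed available; not the kernel implementation):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static time_t dos_time_to_unix(uint16_t date, uint16_t tm, int offset)
{
	struct tm t = {0};

	t.tm_mday = date & 0x1f;		/* bits 0-4: day of month */
	t.tm_mon  = ((date >> 5) & 0xf) - 1;	/* bits 5-8: month 1-12 */
	t.tm_year = ((date >> 9) & 0x7f) + 80;	/* bits 9-15: years since 1980 */
	t.tm_sec  = (tm & 0x1f) * 2;		/* bits 0-4: 2-second units */
	t.tm_min  = (tm >> 5) & 0x3f;		/* bits 5-10: minutes */
	t.tm_hour = (tm >> 11) & 0x1f;		/* bits 11-15: hours */

	return timegm(&t) + offset;	/* shift by server tz offset, seconds */
}

int main(void)
{
	/* 2009-07-01 12:30:10, no timezone correction */
	printf("%ld\n", (long)dos_time_to_unix(0x3AE1, 0x63C5, 0));
	return 0;
}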
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a507b74..1f3345d7fa79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <linux/inet.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -61,7 +62,6 @@ struct smb_vol {
 	char *domainname;
 	char *UNC;
 	char *UNCip;
-	char *in6_addr;   /* ipv6 address as human readable form of in6_addr */
 	char *iocharset;  /* local code page for mapping to and from Unicode */
 	char source_rfc1001_name[16]; /* netbios name of client */
 	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -70,7 +70,6 @@ struct smb_vol {
 	mode_t file_mode;
 	mode_t dir_mode;
 	unsigned secFlg;
-	bool rw:1;
 	bool retry:1;
 	bool intr:1;
 	bool setuids:1;
@@ -804,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 	char *data;
 	unsigned int temp_len, i, j;
 	char separator[2];
+	short int override_uid = -1;
+	short int override_gid = -1;
+	bool uid_specified = false;
+	bool gid_specified = false;
 
 	separator[0] = ',';
 	separator[1] = 0;
@@ -827,14 +830,15 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->target_rfc1001_name[0] = 0;
 	vol->linux_uid = current_uid();  /* use current_euid() instead? */
 	vol->linux_gid = current_gid();
-	vol->dir_mode = S_IRWXUGO;
-	/* 2767 perms indicate mandatory locking support */
-	vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
+
+	/* default to only allowing write access to owner of the mount */
+	vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
 
 	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
-	vol->rw = true;
 	/* default is always to request posix paths. */
 	vol->posix_paths = 1;
+	/* default to using server inode numbers where available */
+	vol->server_ino = 1;
 
 	if (!options)
 		return 1;
@@ -955,10 +959,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 				}
 				strcpy(vol->password, value);
 			}
-		} else if (strnicmp(data, "ip", 2) == 0) {
+		} else if (!strnicmp(data, "ip", 2) ||
+			   !strnicmp(data, "addr", 4)) {
 			if (!value || !*value) {
 				vol->UNCip = NULL;
-			} else if (strnlen(value, 35) < 35) {
+			} else if (strnlen(value, INET6_ADDRSTRLEN) <
+						INET6_ADDRSTRLEN) {
 				vol->UNCip = value;
 			} else {
 				printk(KERN_WARNING "CIFS: ip address "
@@ -1091,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1091 "too long.\n"); 1097 "too long.\n");
1092 return 1; 1098 return 1;
1093 } 1099 }
1094 } else if (strnicmp(data, "uid", 3) == 0) { 1100 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1095 if (value && *value) { 1101 vol->linux_uid = simple_strtoul(value, &value, 0);
1096 vol->linux_uid = 1102 uid_specified = true;
1097 simple_strtoul(value, &value, 0); 1103 } else if (!strnicmp(data, "forceuid", 8)) {
1098 vol->override_uid = 1; 1104 override_uid = 1;
1099 } 1105 } else if (!strnicmp(data, "noforceuid", 10)) {
1100 } else if (strnicmp(data, "gid", 3) == 0) { 1106 override_uid = 0;
1101 if (value && *value) { 1107 } else if (!strnicmp(data, "gid", 3) && value && *value) {
1102 vol->linux_gid = 1108 vol->linux_gid = simple_strtoul(value, &value, 0);
1103 simple_strtoul(value, &value, 0); 1109 gid_specified = true;
1104 vol->override_gid = 1; 1110 } else if (!strnicmp(data, "forcegid", 8)) {
1105 } 1111 override_gid = 1;
1112 } else if (!strnicmp(data, "noforcegid", 10)) {
1113 override_gid = 0;
1106 } else if (strnicmp(data, "file_mode", 4) == 0) { 1114 } else if (strnicmp(data, "file_mode", 4) == 0) {
1107 if (value && *value) { 1115 if (value && *value) {
1108 vol->file_mode = 1116 vol->file_mode =
@@ -1195,7 +1203,9 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "guest", 5) == 0) {
 			/* ignore */
 		} else if (strnicmp(data, "rw", 2) == 0) {
-			vol->rw = true;
+			/* ignore */
+		} else if (strnicmp(data, "ro", 2) == 0) {
+			/* ignore */
 		} else if (strnicmp(data, "noblocksend", 11) == 0) {
 			vol->noblocksnd = 1;
 		} else if (strnicmp(data, "noautotune", 10) == 0) {
@@ -1214,8 +1224,6 @@ cifs_parse_mount_options(char *options, const char *devname,
 			   parse these options again and set anything and it
 			   is ok to just ignore them */
 			continue;
-		} else if (strnicmp(data, "ro", 2) == 0) {
-			vol->rw = false;
 		} else if (strnicmp(data, "hard", 4) == 0) {
 			vol->retry = 1;
 		} else if (strnicmp(data, "soft", 4) == 0) {
@@ -1315,16 +1323,6 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
 			vol->direct_io = 1;
-		} else if (strnicmp(data, "in6_addr", 8) == 0) {
-			if (!value || !*value) {
-				vol->in6_addr = NULL;
-			} else if (strnlen(value, 49) == 48) {
-				vol->in6_addr = value;
-			} else {
-				printk(KERN_WARNING "CIFS: ip v6 address not "
-					"48 characters long\n");
-				return 1;
-			}
 		} else if (strnicmp(data, "noac", 4) == 0) {
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
@@ -1363,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname,
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
 
+	if (uid_specified)
+		vol->override_uid = override_uid;
+	else if (override_uid == 1)
+		printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+				   "specified with no uid= option.\n");
+
+	if (gid_specified)
+		vol->override_gid = override_gid;
+	else if (override_gid == 1)
+		printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+				   "specified with no gid= option.\n");
+
 	return 0;
 }
 
@@ -1392,8 +1402,10 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		    server->addr.sockAddr.sin_addr.s_addr))
 			continue;
 		else if (addr->ss_family == AF_INET6 &&
-		    !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-				    &addr6->sin6_addr))
+			 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
+					   &addr6->sin6_addr) ||
+			  server->addr.sockAddr6.sin6_scope_id !=
+			  addr6->sin6_scope_id))
 			continue;
 
 		++server->srv_count;
@@ -1439,28 +1451,15 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
-	if (volume_info->UNCip && volume_info->UNC) {
-		rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
-				    &sin_server->sin_addr.s_addr);
-
-		if (rc <= 0) {
-			/* not ipv4 address, try ipv6 */
-			rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
-					    &sin_server6->sin6_addr.in6_u);
-			if (rc > 0)
-				addr.ss_family = AF_INET6;
-		} else {
-			addr.ss_family = AF_INET;
-		}
+	cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
 
-		if (rc <= 0) {
+	if (volume_info->UNCip && volume_info->UNC) {
+		rc = cifs_convert_address(volume_info->UNCip, &addr);
+		if (!rc) {
 			/* we failed translating address */
 			rc = -EINVAL;
 			goto out_err;
 		}
-
-		cFYI(1, ("UNC: %s ip: %s", volume_info->UNC,
-			 volume_info->UNCip));
 	} else if (volume_info->UNCip) {
 		/* BB using ip addr as tcp_ses name to connect to the
 		   DFS root below */
@@ -1519,14 +1518,14 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		cFYI(1, ("attempting ipv6 connect"));
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
+		sin_server6->sin6_port = htons(volume_info->port);
 		memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
 			sizeof(struct sockaddr_in6));
-		sin_server6->sin6_port = htons(volume_info->port);
 		rc = ipv6_connect(tcp_ses);
 	} else {
+		sin_server->sin_port = htons(volume_info->port);
 		memcpy(&tcp_ses->addr.sockAddr, sin_server,
 			sizeof(struct sockaddr_in));
-		sin_server->sin_port = htons(volume_info->port);
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
@@ -2471,10 +2470,10 @@ try_mount_again:
 		tcon->local_lease = volume_info->local_lease;
 	}
 	if (pSesInfo) {
-		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
-			sb->s_maxbytes = (u64) 1 << 63;
-		} else
-			sb->s_maxbytes = (u64) 1 << 31;	/* 2 GB */
+		if (pSesInfo->capabilities & CAP_LARGE_FILES)
+			sb->s_maxbytes = MAX_LFS_FILESIZE;
+		else
+			sb->s_maxbytes = MAX_NON_LFS;
 	}
 
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
@@ -2563,11 +2562,20 @@ remote_path_check:
 
 	if (mount_data != mount_data_global)
 		kfree(mount_data);
+
 	mount_data = cifs_compose_mount_options(
 			cifs_sb->mountdata, full_path + 1,
 			referrals, &fake_devname);
-	kfree(fake_devname);
+
 	free_dfs_info_array(referrals, num_referrals);
+	kfree(fake_devname);
+	kfree(full_path);
+
+	if (IS_ERR(mount_data)) {
+		rc = PTR_ERR(mount_data);
+		mount_data = NULL;
+		goto mount_fail_check;
+	}
 
 	if (tcon)
 		cifs_put_tcon(tcon);
@@ -2575,8 +2583,6 @@ remote_path_check:
 		cifs_put_smb_ses(pSesInfo);
 
 	cleanup_volume_info(&volume_info);
-	FreeXid(xid);
-	kfree(full_path);
 	referral_walks_count++;
 	goto try_mount_again;
 }
@@ -2745,6 +2751,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
 
 	/* mostly informational -- no need to fail on error here */
+	kfree(tcon->nativeFileSystem);
 	tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
 						      bytes_left, is_unicode,
 						      nls_codepage);
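The option-parsing hunks in this file record forceuid/forcegid and uid=/gid= separately while scanning, then reconcile them only after the whole string has been parsed, so a bare forceuid with no uid= can be detected and ignored with a notice. A standalone sketch of that two-pass pattern (illustrative parser, not the kernel's):

#define _DEFAULT_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char opts[] = "forceuid,uid=1000,rw";	/* sample option string */
	char *save = opts, *tok;
	long uid = 0;
	bool uid_specified = false;
	int override_uid = -1;			/* -1: option not seen */

	/* pass 1: record what was seen, decide nothing yet */
	while ((tok = strsep(&save, ",")) != NULL) {
		if (strncmp(tok, "uid=", 4) == 0) {
			uid = strtol(tok + 4, NULL, 0);
			uid_specified = true;
		} else if (strcmp(tok, "noforceuid") == 0) {
			override_uid = 0;
		} else if (strcmp(tok, "forceuid") == 0) {
			override_uid = 1;
		}
	}

	/* pass 2: apply the override only if a uid was actually given */
	if (uid_specified)
		printf("uid=%ld, override=%d\n", uid, override_uid);
	else if (override_uid == 1)
		printf("ignoring forceuid: no uid= option\n");
	return 0;
}

Deferring the decision avoids order-dependence: "uid=1000,forceuid" and "forceuid,uid=1000" behave identically.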
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3758965d73d5..4326ffd90fa9 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -188,6 +188,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_fattr fattr;
 
 	cFYI(1, ("posix open %s", full_path));
 
@@ -236,22 +237,21 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (presp_data->Type == cpu_to_le32(-1))
 		goto posix_open_ret;	/* open ok, caller does qpathinfo */
 
-	/* get new inode and set it up */
 	if (!pinode)
 		goto posix_open_ret; /* caller does not need info */
 
+	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+
+	/* get new inode and set it up */
 	if (*pinode == NULL) {
-		__u64 unique_id = le64_to_cpu(presp_data->UniqueId);
-		*pinode = cifs_new_inode(sb, &unique_id);
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode) {
+			rc = -ENOMEM;
+			goto posix_open_ret;
+		}
+	} else {
+		cifs_fattr_to_inode(*pinode, &fattr);
 	}
-	/* else an inode was passed in. Update its info, don't create one */
-
-	/* We do not need to close the file if new_inode fails since
-	   the caller will retry qpathinfo as long as inode is null */
-	if (*pinode == NULL)
-		goto posix_open_ret;
-
-	posix_fill_in_inode(*pinode, presp_data, 1);
 
 	cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
 
@@ -307,8 +307,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
+		rc = -ENOMEM;
 		FreeXid(xid);
-		return -ENOMEM;
+		return rc;
 	}
 
 	if (oplockEnabled)
@@ -424,9 +425,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 			cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+			cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else {
 		/* BB implement mode setting via Windows security
 		   descriptors e.g. */
@@ -514,10 +516,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path,
-				&args, cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
+		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
+					    cifs_sb->local_nls,
+					    cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (!rc) {
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -540,8 +542,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 		if (buf == NULL) {
 			kfree(full_path);
+			rc = -ENOMEM;
 			FreeXid(xid);
-			return -ENOMEM;
+			return rc;
 		}
 
 		rc = CIFSSMBOpen(xid, pTcon, full_path,
@@ -641,6 +644,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		}
 	}
 
+	/*
+	 * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
+	 * the VFS handle the create.
+	 */
+	if (nd->flags & LOOKUP_EXCL) {
+		d_instantiate(direntry, NULL);
+		return 0;
+	}
+
 	/* can not grab the rename sem here since it would
 	   deadlock in the cases (beginning of sys_rename itself)
 	   in which we already have the sb rename sem */
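The LOOKUP_EXCL hunk above skips a server round trip because exclusive creates never use the lookup result: with O_CREAT|O_EXCL the open must fail if the name already exists, so whatever a prior lookup found is irrelevant. The userspace-visible contract being relied on, as a standalone illustration (hypothetical /tmp path, illustrative only):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/excl_demo";	/* hypothetical test file */
	int fd1 = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);
	int fd2 = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);

	/* the second open fails with EEXIST regardless of any lookup */
	printf("fd1=%d fd2=%d errno=%d\n", fd1, fd2, fd2 < 0 ? errno : 0);
	if (fd1 >= 0) {
		close(fd1);
		unlink(path);
	}
	return 0;
}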
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index df4a306f697e..87948147d7ec 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -35,26 +35,11 @@
  *		0 - name is not IP
  */
 static int
-is_ip(const char *name)
+is_ip(char *name)
 {
-	int rc;
-	struct sockaddr_in sin_server;
-	struct sockaddr_in6 sin_server6;
-
-	rc = cifs_inet_pton(AF_INET, name,
-			&sin_server.sin_addr.s_addr);
-
-	if (rc <= 0) {
-		/* not ipv4 address, try ipv6 */
-		rc = cifs_inet_pton(AF_INET6, name,
-				&sin_server6.sin6_addr.in6_u);
-		if (rc > 0)
-			return 1;
-	} else {
-		return 1;
-	}
-	/* we failed translating address */
-	return 0;
+	struct sockaddr_storage ss;
+
+	return cifs_convert_address(name, &ss);
 }
 
 static int
@@ -72,7 +57,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
 	ip[datalen] = '\0';
 
 	/* make sure this looks like an address */
-	if (!is_ip((const char *) ip)) {
+	if (!is_ip(ip)) {
 		kfree(ip);
 		return -EINVAL;
 	}
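is_ip is now a thin wrapper around cifs_convert_address, which from its call sites appears to try IPv4 and then IPv6 against a single sockaddr_storage and report success via its return value. A standalone approximation built on userspace inet_pton (the kernel uses its own in4/in6 helpers; the function name here is illustrative):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

/* returns nonzero if 'src' parses as an IPv4 or IPv6 address */
static int convert_address(const char *src, struct sockaddr_storage *dst)
{
	struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;

	if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
		dst->ss_family = AF_INET;
		return 1;
	}
	if (inet_pton(AF_INET6, src, &s6->sin6_addr) == 1) {
		dst->ss_family = AF_INET6;
		return 1;
	}
	return 0;	/* not an address */
}

int main(void)
{
	struct sockaddr_storage ss;

	printf("%d %d %d\n", convert_address("192.168.1.10", &ss),
	       convert_address("fe80::1", &ss),
	       convert_address("fileserver", &ss));	/* prints 1 1 0 */
	return 0;
}

Centralizing the two-family fallback is what lets both the mount path and this key-instantiation check shrink to one call.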
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15f02e6..c34b7f8a217b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	/* BB need same check in cifs_create too? */
 	/* if not oplocked, invalidate inode pages if mtime or file
 	   size changed */
-	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 	    (file->f_path.dentry->d_inode->i_size ==
 	     (loff_t)le64_to_cpu(buf->EndOfFile))) {
@@ -300,14 +300,16 @@ int cifs_open(struct inode *inode, struct file *file)
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 	pCifsFile = cifs_fill_filedata(file);
 	if (pCifsFile) {
+		rc = 0;
 		FreeXid(xid);
-		return 0;
+		return rc;
 	}
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
+		rc = -ENOMEM;
 		FreeXid(xid);
-		return -ENOMEM;
+		return rc;
 	}
 
 	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
@@ -446,9 +448,9 @@ int cifs_open(struct inode *inode, struct file *file)
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
 			};
-			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+			CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 					   cifs_sb->local_nls,
 					   cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		}
 	}
@@ -491,11 +493,12 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 		return -EBADF;
 
 	xid = GetXid();
-	mutex_unlock(&pCifsFile->fh_mutex);
+	mutex_lock(&pCifsFile->fh_mutex);
 	if (!pCifsFile->invalidHandle) {
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
+		rc = 0;
 		FreeXid(xid);
-		return 0;
+		return rc;
 	}
 
 	if (file->f_path.dentry == NULL) {
@@ -524,7 +527,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	if (full_path == NULL) {
 		rc = -ENOMEM;
 reopen_error_exit:
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
 		FreeXid(xid);
 		return rc;
 	}
@@ -566,14 +569,14 @@ reopen_error_exit:
 				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
 		cFYI(1, ("cifs_open returned 0x%x", rc));
 		cFYI(1, ("oplock: %d", oplock));
 	} else {
 reopen_success:
 		pCifsFile->netfid = netfid;
 		pCifsFile->invalidHandle = false;
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
 		pCifsInode = CIFS_I(inode);
 		if (pCifsInode) {
 			if (can_flush) {
@@ -845,8 +848,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	tcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
 
@@ -1805,8 +1809,9 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	open_file = (struct cifsFileInfo *)file->private_data;
 
@@ -1885,8 +1890,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	open_file = (struct cifsFileInfo *)file->private_data;
 
@@ -2019,8 +2025,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 
 	xid = GetXid();
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	open_file = (struct cifsFileInfo *)file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -2185,8 +2192,9 @@ static int cifs_readpage(struct file *file, struct page *page)
 	xid = GetXid();
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 
 	cFYI(1, ("readpage %p at offset %d 0x%x\n",
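The fh_mutex hunks in cifs_reopen_file above fix an inverted lock/unlock pair: the old code unlocked a mutex it did not yet hold on entry and then locked it on every exit path, leaving the function effectively unserialized. A standalone sketch of the corrected discipline (pthreads stand-in for the kernel mutex; illustrative names):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t fh_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool invalid_handle = true;

static int reopen_file(void)
{
	pthread_mutex_lock(&fh_mutex);		/* take the lock first */
	if (!invalid_handle) {
		pthread_mutex_unlock(&fh_mutex);	/* drop on early return */
		return 0;
	}

	/* ... re-establish the handle while holding fh_mutex ... */
	invalid_handle = false;

	pthread_mutex_unlock(&fh_mutex);	/* drop on the success path too */
	return 0;
}

int main(void)
{
	printf("%d %d\n", reopen_file(), reopen_file());
	return 0;
}

Every path out of the function releases exactly the lock it acquired, which is the invariant the original code violated.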
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6dcba1..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,239 +77,202 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
-static void cifs_unix_info_to_inode(struct inode *inode,
-		FILE_UNIX_BASIC_INFO *info, int force_uid_gid)
+/* populate an inode with info from a cifs_fattr struct */
+void
+cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 {
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
-	__u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
-	__u64 end_of_file = le64_to_cpu(info->EndOfFile);
+	unsigned long oldtime = cifs_i->time;
+
+	inode->i_atime = fattr->cf_atime;
+	inode->i_mtime = fattr->cf_mtime;
+	inode->i_ctime = fattr->cf_ctime;
+	inode->i_rdev = fattr->cf_rdev;
+	inode->i_nlink = fattr->cf_nlink;
+	inode->i_uid = fattr->cf_uid;
+	inode->i_gid = fattr->cf_gid;
+
+	/* if dynperm is set, don't clobber existing mode */
+	if (inode->i_state & I_NEW ||
+	    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM))
+		inode->i_mode = fattr->cf_mode;
+
+	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
+	cifs_i->uniqueid = fattr->cf_uniqueid;
+
+	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+		cifs_i->time = 0;
+	else
+		cifs_i->time = jiffies;
+
+	cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
+		 oldtime, cifs_i->time));
 
-	inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
-	inode->i_mtime =
-		cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
-	inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
-	inode->i_mode = le64_to_cpu(info->Permissions);
+	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
+
+	/*
+	 * Can't safely change the file size here if the client is writing to
+	 * it due to potential races.
+	 */
+	spin_lock(&inode->i_lock);
+	if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
+		i_size_write(inode, fattr->cf_eof);
+
+		/*
+		 * i_blocks is not related to (i_size / i_blksize),
+		 * but instead 512 byte (2**9) size is required for
+		 * calculating num blocks.
+		 */
+		inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
+	}
+	spin_unlock(&inode->i_lock);
+
+	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+}
+
+/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
+void
+cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
+			 struct cifs_sb_info *cifs_sb)
+{
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_uniqueid = le64_to_cpu(info->UniqueId);
+	fattr->cf_bytes = le64_to_cpu(info->NumOfBytes);
+	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+
+	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime);
+	fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
+	fattr->cf_mode = le64_to_cpu(info->Permissions);
 
 	/*
 	 * Since we set the inode type below we need to mask off
 	 * to avoid strange results if bits set above.
 	 */
-	inode->i_mode &= ~S_IFMT;
+	fattr->cf_mode &= ~S_IFMT;
 	switch (le32_to_cpu(info->Type)) {
 	case UNIX_FILE:
-		inode->i_mode |= S_IFREG;
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
 		break;
 	case UNIX_SYMLINK:
-		inode->i_mode |= S_IFLNK;
+		fattr->cf_mode |= S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
 		break;
 	case UNIX_DIR:
-		inode->i_mode |= S_IFDIR;
+		fattr->cf_mode |= S_IFDIR;
+		fattr->cf_dtype = DT_DIR;
 		break;
 	case UNIX_CHARDEV:
-		inode->i_mode |= S_IFCHR;
-		inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
-				      le64_to_cpu(info->DevMinor) & MINORMASK);
+		fattr->cf_mode |= S_IFCHR;
+		fattr->cf_dtype = DT_CHR;
+		fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+				       le64_to_cpu(info->DevMinor) & MINORMASK);
 		break;
 	case UNIX_BLOCKDEV:
-		inode->i_mode |= S_IFBLK;
-		inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
-				      le64_to_cpu(info->DevMinor) & MINORMASK);
+		fattr->cf_mode |= S_IFBLK;
+		fattr->cf_dtype = DT_BLK;
+		fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+				       le64_to_cpu(info->DevMinor) & MINORMASK);
 		break;
 	case UNIX_FIFO:
-		inode->i_mode |= S_IFIFO;
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
 		break;
 	case UNIX_SOCKET:
-		inode->i_mode |= S_IFSOCK;
+		fattr->cf_mode |= S_IFSOCK;
+		fattr->cf_dtype = DT_SOCK;
 		break;
 	default:
 		/* safest to call it a file if we do not know */
-		inode->i_mode |= S_IFREG;
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
 		cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
 		break;
 	}
 
-	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) &&
-	    !force_uid_gid)
-		inode->i_uid = cifs_sb->mnt_uid;
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		fattr->cf_uid = cifs_sb->mnt_uid;
 	else
-		inode->i_uid = le64_to_cpu(info->Uid);
+		fattr->cf_uid = le64_to_cpu(info->Uid);
 
-	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) &&
-	    !force_uid_gid)
-		inode->i_gid = cifs_sb->mnt_gid;
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		fattr->cf_gid = cifs_sb->mnt_gid;
 	else
-		inode->i_gid = le64_to_cpu(info->Gid);
-
-	inode->i_nlink = le64_to_cpu(info->Nlinks);
-
-	cifsInfo->server_eof = end_of_file;
-	spin_lock(&inode->i_lock);
-	if (is_size_safe_to_change(cifsInfo, end_of_file)) {
-		/*
-		 * We can not safely change the file size here if the client
-		 * is writing to it due to potential races.
-		 */
-		i_size_write(inode, end_of_file);
+		fattr->cf_gid = le64_to_cpu(info->Gid);
 
-		/*
-		 * i_blocks is not related to (i_size / i_blksize),
-		 * but instead 512 byte (2**9) size is required for
-		 * calculating num blocks.
-		 */
-		inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
-	}
-	spin_unlock(&inode->i_lock);
+	fattr->cf_nlink = le64_to_cpu(info->Nlinks);
 }
 
-
 /*
- * Needed to setup inode data for the directory which is the
- * junction to the new submount (ie to setup the fake directory
- * which represents a DFS referral)
- */
-static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
-			       struct super_block *sb)
-{
-	struct inode *pinode = NULL;
-
-	memset(pfnd_dat, 0, sizeof(FILE_UNIX_BASIC_INFO));
-
-/*	__le64 pfnd_dat->EndOfFile = cpu_to_le64(0);
-	__le64 pfnd_dat->NumOfBytes = cpu_to_le64(0);
-	__u64 UniqueId = 0;  */
-	pfnd_dat->LastStatusChange =
-		cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	pfnd_dat->LastAccessTime =
-		cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	pfnd_dat->LastModificationTime =
-		cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	pfnd_dat->Type = cpu_to_le32(UNIX_DIR);
-	pfnd_dat->Permissions = cpu_to_le64(S_IXUGO | S_IRWXU);
-	pfnd_dat->Nlinks = cpu_to_le64(2);
-	if (sb->s_root)
-		pinode = sb->s_root->d_inode;
-	if (pinode == NULL)
-		return;
-
-	/* fill in default values for the remaining based on root
-	   inode since we can not query the server for this inode info */
-	pfnd_dat->DevMajor = cpu_to_le64(MAJOR(pinode->i_rdev));
-	pfnd_dat->DevMinor = cpu_to_le64(MINOR(pinode->i_rdev));
-	pfnd_dat->Uid = cpu_to_le64(pinode->i_uid);
-	pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
-}
-
-/**
- * cifs_new inode - create new inode, initialize, and hash it
- * @sb - pointer to superblock
- * @inum - if valid pointer and serverino is enabled, replace i_ino with val
- *
- * Create a new inode, initialize it for CIFS and hash it. Returns the new
- * inode or NULL if one couldn't be allocated.
+ * Fill a cifs_fattr struct with fake inode info.
  *
- * If the share isn't mounted with "serverino" or inum is a NULL pointer then
- * we'll just use the inode number assigned by new_inode(). Note that this can
- * mean i_ino collisions since the i_ino assigned by new_inode is not
- * guaranteed to be unique.
+ * Needed to setup cifs_fattr data for the directory which is the
+ * junction to the new submount (ie to setup the fake directory
+ * which represents a DFS referral).
  */
-struct inode *
-cifs_new_inode(struct super_block *sb, __u64 *inum)
+static void
+cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
-	struct inode *inode;
-
-	inode = new_inode(sb);
-	if (inode == NULL)
-		return NULL;
-
-	/*
-	 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
-	 *	stop passing inum as ptr. Are there sanity checks we can use to
-	 *	ensure that the server is really filling in that field? Also,
-	 *	if serverino is disabled, perhaps we should be using iunique()?
-	 */
-	if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
-		inode->i_ino = (unsigned long) *inum;
-
-	/*
-	 * must set this here instead of cifs_alloc_inode since VFS will
-	 * clobber i_flags
-	 */
-	if (sb->s_flags & MS_NOATIME)
-		inode->i_flags |= S_NOATIME | S_NOCMTIME;
-
-	insert_inode_hash(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	return inode;
+	cFYI(1, ("creating fake fattr for DFS referral"));
+
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	fattr->cf_gid = cifs_sb->mnt_gid;
+	fattr->cf_atime = CURRENT_TIME;
+	fattr->cf_ctime = CURRENT_TIME;
+	fattr->cf_mtime = CURRENT_TIME;
+	fattr->cf_nlink = 2;
+	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
 int cifs_get_inode_info_unix(struct inode **pinode,
-	const unsigned char *full_path, struct super_block *sb, int xid)
+			     const unsigned char *full_path,
+			     struct super_block *sb, int xid)
 {
-	int rc = 0;
+	int rc;
 	FILE_UNIX_BASIC_INFO find_data;
-	struct cifsTconInfo *pTcon;
-	struct inode *inode;
+	struct cifs_fattr fattr;
+	struct cifsTconInfo *tcon;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	bool is_dfs_referral = false;
-	struct cifsInodeInfo *cifsInfo;
-	__u64 num_of_bytes;
-	__u64 end_of_file;
 
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 	cFYI(1, ("Getting info on %s", full_path));
 
 	/* could have done a find first instead but this returns more info */
-	rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
+	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				  CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc == -EREMOTE && !is_dfs_referral) {
-		is_dfs_referral = true;
-		cFYI(DBG2, ("DFS ref"));
-		/* for DFS, server does not give us real inode data */
-		fill_fake_finddataunix(&find_data, sb);
-		rc = 0;
-	} else if (rc)
-		goto cgiiu_exit;
 
-	num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
-	end_of_file = le64_to_cpu(find_data.EndOfFile);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, sb);
+		rc = 0;
+	} else {
+		return rc;
+	}
 
-	/* get new inode */
 	if (*pinode == NULL) {
-		__u64 unique_id = le64_to_cpu(find_data.UniqueId);
-		*pinode = cifs_new_inode(sb, &unique_id);
-		if (*pinode == NULL) {
+		/* get new inode */
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode)
 			rc = -ENOMEM;
-			goto cgiiu_exit;
-		}
+	} else {
+		/* we already have inode, update it */
+		cifs_fattr_to_inode(*pinode, &fattr);
 	}
 
-	inode = *pinode;
-	cifsInfo = CIFS_I(inode);
-
-	cFYI(1, ("Old time %ld", cifsInfo->time));
-	cifsInfo->time = jiffies;
-	cFYI(1, ("New time %ld", cifsInfo->time));
-	/* this is ok to set on every inode revalidate */
-	atomic_set(&cifsInfo->inUse, 1);
-
-	cifs_unix_info_to_inode(inode, &find_data, 0);
-
-	if (num_of_bytes < end_of_file)
-		cFYI(1, ("allocation size less than end of file"));
-	cFYI(1, ("Size %ld and blocks %llu",
-		 (unsigned long) inode->i_size,
-		 (unsigned long long)inode->i_blocks));
-
-	cifs_set_ops(inode, is_dfs_referral);
-cgiiu_exit:
 	return rc;
 }
 
-static int decode_sfu_inode(struct inode *inode, __u64 size,
-			    const unsigned char *path,
-			    struct cifs_sb_info *cifs_sb, int xid)
+static int
+cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
+	      struct cifs_sb_info *cifs_sb, int xid)
 {
 	int rc;
 	int oplock = 0;
@@ -321,10 +284,15 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 
 	pbuf = buf;
 
-	if (size == 0) {
-		inode->i_mode |= S_IFIFO;
+	fattr->cf_mode &= ~S_IFMT;
+
+	if (fattr->cf_eof == 0) {
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
 		return 0;
-	} else if (size < 8) {
+	} else if (fattr->cf_eof < 8) {
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
@@ -336,42 +304,46 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 		/* Read header */
-		rc = CIFSSMBRead(xid, pTcon,
-			 netfid,
+		rc = CIFSSMBRead(xid, pTcon, netfid,
 			 24 /* length */, 0 /* offset */,
 			 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
 			if (memcmp("IntxBLK", pbuf, 8) == 0) {
 				cFYI(1, ("Block device"));
-				inode->i_mode |= S_IFBLK;
+				fattr->cf_mode |= S_IFBLK;
+				fattr->cf_dtype = DT_BLK;
 				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
 					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					inode->i_rdev = MKDEV(mjr, mnr);
+					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
 				cFYI(1, ("Char device"));
-				inode->i_mode |= S_IFCHR;
+				fattr->cf_mode |= S_IFCHR;
+				fattr->cf_dtype = DT_CHR;
 				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
 					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					inode->i_rdev = MKDEV(mjr, mnr);
+					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
 				cFYI(1, ("Symlink"));
-				inode->i_mode |= S_IFLNK;
+				fattr->cf_mode |= S_IFLNK;
+				fattr->cf_dtype = DT_LNK;
 			} else {
-				inode->i_mode |= S_IFREG; /* file? */
+				fattr->cf_mode |= S_IFREG; /* file? */
+				fattr->cf_dtype = DT_REG;
 				rc = -EOPNOTSUPP;
 			}
 		} else {
-			inode->i_mode |= S_IFREG; /* then it is a file */
+			fattr->cf_mode |= S_IFREG; /* then it is a file */
+			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
 		CIFSSMBClose(xid, pTcon, netfid);
@@ -381,9 +353,13 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 
 #define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID)  /* SETFILEBITS valid bits */
 
-static int get_sfu_mode(struct inode *inode,
-			const unsigned char *path,
-			struct cifs_sb_info *cifs_sb, int xid)
+/*
+ * Fetch mode bits as provided by SFU.
+ *
+ * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ?
+ */
+static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
+			 struct cifs_sb_info *cifs_sb, int xid)
 {
 #ifdef CONFIG_CIFS_XATTR
 	ssize_t rc;
@@ -391,68 +367,80 @@ static int get_sfu_mode(struct inode *inode,
391 __u32 mode; 367 __u32 mode;
392 368
393 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", 369 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
394 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 370 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
395 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 371 cifs_sb->mnt_cifs_flags &
372 CIFS_MOUNT_MAP_SPECIAL_CHR);
396 if (rc < 0) 373 if (rc < 0)
397 return (int)rc; 374 return (int)rc;
398 else if (rc > 3) { 375 else if (rc > 3) {
399 mode = le32_to_cpu(*((__le32 *)ea_value)); 376 mode = le32_to_cpu(*((__le32 *)ea_value));
400 inode->i_mode &= ~SFBITS_MASK; 377 fattr->cf_mode &= ~SFBITS_MASK;
401 cFYI(1, ("special bits 0%o org mode 0%o", mode, inode->i_mode)); 378 cFYI(1, ("special bits 0%o org mode 0%o", mode,
402 inode->i_mode = (mode & SFBITS_MASK) | inode->i_mode; 379 fattr->cf_mode));
380 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
403 cFYI(1, ("special mode bits 0%o", mode)); 381 cFYI(1, ("special mode bits 0%o", mode));
404 return 0;
405 } else {
406 return 0;
407 } 382 }
383
384 return 0;
408#else 385#else
409 return -EOPNOTSUPP; 386 return -EOPNOTSUPP;
410#endif 387#endif
411} 388}
412 389
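The SETFILEBITS handling in cifs_sfu_mode() above only honors the SUID/SGID/sticky bits from the EA, folding them into a mode that was already derived from the DOS attributes. A minimal standalone sketch of that merge, reusing SFBITS_MASK from the patch (the surrounding program is illustrative, not part of the kernel code):

#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>

#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */

/* Fold the special bits carried by the "SETFILEBITS" EA into an existing
 * mode, the way cifs_sfu_mode() updates fattr->cf_mode. */
static uint32_t merge_sfu_bits(uint32_t cf_mode, uint32_t ea_mode)
{
        cf_mode &= ~SFBITS_MASK;                  /* drop stale special bits */
        return (ea_mode & SFBITS_MASK) | cf_mode; /* take SUID/SGID/VTX only */
}

int main(void)
{
        /* hypothetical values: 0644 regular file, EA asks for setuid */
        printf("0%o\n", merge_sfu_bits(S_IFREG | 0644, S_ISUID | 0777));
        return 0; /* prints 0104644 */
}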
413/* 390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
414 * Needed to setup inode data for the directory which is the 391static void
415 * junction to the new submount (ie to setup the fake directory 392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
416 * which represents a DFS referral) 393 struct cifs_sb_info *cifs_sb, bool adjust_tz)
417 */
418static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat,
419 struct super_block *sb)
420{ 394{
421 memset(pfnd_dat, 0, sizeof(FILE_ALL_INFO)); 395 memset(fattr, 0, sizeof(*fattr));
422 396 fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
423/* __le64 pfnd_dat->AllocationSize = cpu_to_le64(0); 397 if (info->DeletePending)
424 __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); 398 fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING;
425 __u8 pfnd_dat->DeletePending = 0; 399
426 __u8 pfnd_data->Directory = 0; 400 if (info->LastAccessTime)
427 __le32 pfnd_dat->EASize = 0; 401 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
428 __u64 pfnd_dat->IndexNumber = 0; 402 else
429 __u64 pfnd_dat->IndexNumber1 = 0; */ 403 fattr->cf_atime = CURRENT_TIME;
430 pfnd_dat->CreationTime = 404
431 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 405 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
432 pfnd_dat->LastAccessTime = 406 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
433 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 407
434 pfnd_dat->LastWriteTime = 408 if (adjust_tz) {
435 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 409 fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
436 pfnd_dat->ChangeTime = 410 fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
437 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 411 }
438 pfnd_dat->Attributes = cpu_to_le32(ATTR_DIRECTORY); 412
439 pfnd_dat->NumberOfLinks = cpu_to_le32(2); 413 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
414 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
415
416 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
417 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
418 fattr->cf_dtype = DT_DIR;
419 } else {
420 fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
421 fattr->cf_dtype = DT_REG;
422
423 /* clear write bits if ATTR_READONLY is set */
424 if (fattr->cf_cifsattrs & ATTR_READONLY)
425 fattr->cf_mode &= ~(S_IWUGO);
426 }
427
428 fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
429
430 fattr->cf_uid = cifs_sb->mnt_uid;
431 fattr->cf_gid = cifs_sb->mnt_gid;
440} 432}
441 433
442int cifs_get_inode_info(struct inode **pinode, 434int cifs_get_inode_info(struct inode **pinode,
443 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 435 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
444 struct super_block *sb, int xid, const __u16 *pfid) 436 struct super_block *sb, int xid, const __u16 *pfid)
445{ 437{
446 int rc = 0; 438 int rc = 0, tmprc;
447 __u32 attr;
448 struct cifsInodeInfo *cifsInfo;
449 struct cifsTconInfo *pTcon; 439 struct cifsTconInfo *pTcon;
450 struct inode *inode;
451 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 440 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
452 char *buf = NULL; 441 char *buf = NULL;
453 bool adjustTZ = false; 442 bool adjustTZ = false;
454 bool is_dfs_referral = false; 443 struct cifs_fattr fattr;
455 umode_t default_mode;
456 444
457 pTcon = cifs_sb->tcon; 445 pTcon = cifs_sb->tcon;
458 cFYI(1, ("Getting info on %s", full_path)); 446 cFYI(1, ("Getting info on %s", full_path));
@@ -487,166 +475,85 @@ int cifs_get_inode_info(struct inode **pinode,
487 adjustTZ = true; 475 adjustTZ = true;
488 } 476 }
489 } 477 }
490 /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */ 478
491 if (rc == -EREMOTE) { 479 if (!rc) {
492 is_dfs_referral = true; 480 cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData,
493 fill_fake_finddata(pfindData, sb); 481 cifs_sb, adjustTZ);
482 } else if (rc == -EREMOTE) {
483 cifs_create_dfs_fattr(&fattr, sb);
494 rc = 0; 484 rc = 0;
495 } else if (rc) 485 } else {
496 goto cgii_exit; 486 goto cgii_exit;
487 }
497 488
498 attr = le32_to_cpu(pfindData->Attributes); 489 /*
499 490 * If an inode wasn't passed in, then get the inode number
500 /* get new inode */ 491 *
492 * Is an i_ino of zero legal? Can we use that to check if the server
493 * supports returning inode numbers? Are there other sanity checks we
494 * can use to ensure that the server is really filling in that field?
495 *
496 * We can not use the IndexNumber field by default from Windows or
497 * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA
498 * CIFS spec claims that this value is unique within the scope of a
499 * share, and the windows docs hint that it's actually unique
500 * per-machine.
501 *
502 * There may be higher info levels that work but are there Windows
503 * server or network appliances for which IndexNumber field is not
504 * guaranteed unique?
505 */
501 if (*pinode == NULL) { 506 if (*pinode == NULL) {
502 __u64 inode_num;
503 __u64 *pinum = &inode_num;
504
505 /* Is an i_ino of zero legal? Can we use that to check
506 if the server supports returning inode numbers? Are
507 there other sanity checks we can use to ensure that
508 the server is really filling in that field? */
509
510 /* We can not use the IndexNumber field by default from
511 Windows or Samba (in ALL_INFO buf) but we can request
512 it explicitly. It may not be unique presumably if
513 the server has multiple devices mounted under one share */
514
515 /* There may be higher info levels that work but are
516 there Windows server or network appliances for which
517 IndexNumber field is not guaranteed unique? */
518
519 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 507 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
520 int rc1 = 0; 508 int rc1 = 0;
521 509
522 rc1 = CIFSGetSrvInodeNumber(xid, pTcon, 510 rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
523 full_path, pinum, 511 full_path, &fattr.cf_uniqueid,
524 cifs_sb->local_nls, 512 cifs_sb->local_nls,
525 cifs_sb->mnt_cifs_flags & 513 cifs_sb->mnt_cifs_flags &
526 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
527 if (rc1) { 515 if (rc1) {
528 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 516 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
529 pinum = NULL; 517 fattr.cf_uniqueid = iunique(sb, ROOT_I);
530 /* BB EOPNOSUPP disable SERVER_INUM? */ 518 /* disable serverino if call not supported */
519 if (rc1 == -EINVAL)
520 cifs_sb->mnt_cifs_flags &=
521 ~CIFS_MOUNT_SERVER_INUM;
531 } 522 }
532 } else { 523 } else {
533 pinum = NULL; 524 fattr.cf_uniqueid = iunique(sb, ROOT_I);
534 }
535
536 *pinode = cifs_new_inode(sb, pinum);
537 if (*pinode == NULL) {
538 rc = -ENOMEM;
539 goto cgii_exit;
540 } 525 }
541 }
542 inode = *pinode;
543 cifsInfo = CIFS_I(inode);
544 cifsInfo->cifsAttrs = attr;
545 cifsInfo->delete_pending = pfindData->DeletePending ? true : false;
546 cFYI(1, ("Old time %ld", cifsInfo->time));
547 cifsInfo->time = jiffies;
548 cFYI(1, ("New time %ld", cifsInfo->time));
549
550 /* blksize needs to be multiple of two. So safer to default to
551 blksize and blkbits set in superblock so 2**blkbits and blksize
552 will match rather than setting to:
553 (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
554
555 /* Linux can not store file creation time so ignore it */
556 if (pfindData->LastAccessTime)
557 inode->i_atime = cifs_NTtimeToUnix
558 (le64_to_cpu(pfindData->LastAccessTime));
559 else /* do not need to use current_fs_time - time not stored */
560 inode->i_atime = CURRENT_TIME;
561 inode->i_mtime =
562 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
563 inode->i_ctime =
564 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
565 cFYI(DBG2, ("Attributes came in as 0x%x", attr));
566 if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
567 inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
568 inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
569 }
570
571 /* get default inode mode */
572 if (attr & ATTR_DIRECTORY)
573 default_mode = cifs_sb->mnt_dir_mode;
574 else
575 default_mode = cifs_sb->mnt_file_mode;
576
577 /* set permission bits */
578 if (atomic_read(&cifsInfo->inUse) == 0 ||
579 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
580 inode->i_mode = default_mode;
581 else {
582 /* just reenable write bits if !ATTR_READONLY */
583 if ((inode->i_mode & S_IWUGO) == 0 &&
584 (attr & ATTR_READONLY) == 0)
585 inode->i_mode |= (S_IWUGO & default_mode);
586
587 inode->i_mode &= ~S_IFMT;
588 }
589 /* clear write bits if ATTR_READONLY is set */
590 if (attr & ATTR_READONLY)
591 inode->i_mode &= ~S_IWUGO;
592
593 /* set inode type */
594 if ((attr & ATTR_SYSTEM) &&
595 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
596 /* no need to fix endianness on 0 */
597 if (pfindData->EndOfFile == 0)
598 inode->i_mode |= S_IFIFO;
599 else if (decode_sfu_inode(inode,
600 le64_to_cpu(pfindData->EndOfFile),
601 full_path, cifs_sb, xid))
602 cFYI(1, ("unknown SFU file type\n"));
603 } else { 526 } else {
604 if (attr & ATTR_DIRECTORY) 527 fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid;
605 inode->i_mode |= S_IFDIR;
606 else
607 inode->i_mode |= S_IFREG;
608 } 528 }
609 529
610 cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile); 530 /* query for SFU type info if supported and needed */
611 spin_lock(&inode->i_lock); 531 if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
612 if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) { 532 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
613 /* can not safely shrink the file size here if the 533 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
614 client is writing to it due to potential races */ 534 if (tmprc)
615 i_size_write(inode, cifsInfo->server_eof); 535 cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
616
617 /* 512 bytes (2**9) is the fake blocksize that must be
618 used for this calculation */
619 inode->i_blocks = (512 - 1 + le64_to_cpu(
620 pfindData->AllocationSize)) >> 9;
621 } 536 }
622 spin_unlock(&inode->i_lock);
623 537
624 inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks);
625
626 /* BB fill in uid and gid here? with help from winbind?
627 or retrieve from NTFS stream extended attribute */
628#ifdef CONFIG_CIFS_EXPERIMENTAL 538#ifdef CONFIG_CIFS_EXPERIMENTAL
629 /* fill in 0777 bits from ACL */ 539 /* fill in 0777 bits from ACL */
630 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 540 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
631 cFYI(1, ("Getting mode bits from ACL")); 541 cFYI(1, ("Getting mode bits from ACL"));
632 acl_to_uid_mode(inode, full_path, pfid); 542 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
633 } 543 }
634#endif 544#endif
635 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
636 /* fill in remaining high mode bits e.g. SUID, VTX */
637 get_sfu_mode(inode, full_path, cifs_sb, xid);
638 } else if (atomic_read(&cifsInfo->inUse) == 0) {
639 inode->i_uid = cifs_sb->mnt_uid;
640 inode->i_gid = cifs_sb->mnt_gid;
641 /* set so we do not keep refreshing these fields with
642 bad data after user has changed them in memory */
643 atomic_set(&cifsInfo->inUse, 1);
644 }
645
646 cifs_set_ops(inode, is_dfs_referral);
647
648 545
546 /* fill in remaining high mode bits e.g. SUID, VTX */
547 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
548 cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
649 549
550 if (!*pinode) {
551 *pinode = cifs_iget(sb, &fattr);
552 if (!*pinode)
553 rc = -ENOMEM;
554 } else {
555 cifs_fattr_to_inode(*pinode, &fattr);
556 }
650 557
651cgii_exit: 558cgii_exit:
652 kfree(buf); 559 kfree(buf);
@@ -698,33 +605,78 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
698 return full_path; 605 return full_path;
699} 606}
700 607
608static int
609cifs_find_inode(struct inode *inode, void *opaque)
610{
611 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
612
613 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
614 return 0;
615
616 return 1;
617}
618
619static int
620cifs_init_inode(struct inode *inode, void *opaque)
621{
622 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
623
624 CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
625 return 0;
626}
627
628/* Given fattrs, get a corresponding inode */
629struct inode *
630cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
631{
632 unsigned long hash;
633 struct inode *inode;
634
635 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
636
637 /* hash down to 32-bits on 32-bit arch */
638 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
639
640 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
641
642 /* we have fattrs in hand, update the inode */
643 if (inode) {
644 cifs_fattr_to_inode(inode, fattr);
645 if (sb->s_flags & MS_NOATIME)
646 inode->i_flags |= S_NOATIME | S_NOCMTIME;
647 if (inode->i_state & I_NEW) {
648 inode->i_ino = hash;
649 unlock_new_inode(inode);
650 }
651 }
652
653 return inode;
654}
655
701/* gets root inode */ 656/* gets root inode */
702struct inode *cifs_iget(struct super_block *sb, unsigned long ino) 657struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
703{ 658{
704 int xid; 659 int xid;
705 struct cifs_sb_info *cifs_sb; 660 struct cifs_sb_info *cifs_sb;
706 struct inode *inode; 661 struct inode *inode = NULL;
707 long rc; 662 long rc;
708 char *full_path; 663 char *full_path;
709 664
710 inode = iget_locked(sb, ino); 665 cifs_sb = CIFS_SB(sb);
711 if (!inode)
712 return ERR_PTR(-ENOMEM);
713 if (!(inode->i_state & I_NEW))
714 return inode;
715
716 cifs_sb = CIFS_SB(inode->i_sb);
717 full_path = cifs_build_path_to_root(cifs_sb); 666 full_path = cifs_build_path_to_root(cifs_sb);
718 if (full_path == NULL) 667 if (full_path == NULL)
719 return ERR_PTR(-ENOMEM); 668 return ERR_PTR(-ENOMEM);
720 669
721 xid = GetXid(); 670 xid = GetXid();
722 if (cifs_sb->tcon->unix_ext) 671 if (cifs_sb->tcon->unix_ext)
723 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, 672 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
724 xid);
725 else 673 else
726 rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb, 674 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
727 xid, NULL); 675 xid, NULL);
676
677 if (!inode)
678 return ERR_PTR(-ENOMEM);
679
728 if (rc && cifs_sb->tcon->ipc) { 680 if (rc && cifs_sb->tcon->ipc) {
729 cFYI(1, ("ipc connection - fake read inode")); 681 cFYI(1, ("ipc connection - fake read inode"));
730 inode->i_mode |= S_IFDIR; 682 inode->i_mode |= S_IFDIR;
@@ -740,7 +692,6 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
740 return ERR_PTR(rc); 692 return ERR_PTR(rc);
741 } 693 }
742 694
743 unlock_new_inode(inode);
744 695
745 kfree(full_path); 696 kfree(full_path);
746 /* can not call macro FreeXid here since in a void func 697 /* can not call macro FreeXid here since in a void func
@@ -991,8 +942,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
991 * sb->s_vfs_rename_mutex here */ 942 * sb->s_vfs_rename_mutex here */
992 full_path = build_path_from_dentry(dentry); 943 full_path = build_path_from_dentry(dentry);
993 if (full_path == NULL) { 944 if (full_path == NULL) {
945 rc = -ENOMEM;
994 FreeXid(xid); 946 FreeXid(xid);
995 return -ENOMEM; 947 return rc;
996 } 948 }
997 949
998 if ((tcon->ses->capabilities & CAP_UNIX) && 950 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1065,44 +1017,6 @@ out_reval:
1065 return rc; 1017 return rc;
1066} 1018}
1067 1019
1068void posix_fill_in_inode(struct inode *tmp_inode,
1069 FILE_UNIX_BASIC_INFO *pData, int isNewInode)
1070{
1071 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
1072 loff_t local_size;
1073 struct timespec local_mtime;
1074
1075 cifsInfo->time = jiffies;
1076 atomic_inc(&cifsInfo->inUse);
1077
1078 /* save mtime and size */
1079 local_mtime = tmp_inode->i_mtime;
1080 local_size = tmp_inode->i_size;
1081
1082 cifs_unix_info_to_inode(tmp_inode, pData, 1);
1083 cifs_set_ops(tmp_inode, false);
1084
1085 if (!S_ISREG(tmp_inode->i_mode))
1086 return;
1087
1088 /*
1089 * No sense invalidating pages for new inode
1090 * since we have not started caching
1091 * readahead file data yet.
1092 */
1093 if (isNewInode)
1094 return;
1095
1096 if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
1097 (local_size == tmp_inode->i_size)) {
1098 cFYI(1, ("inode exists but unchanged"));
1099 } else {
1100 /* file may have changed on server */
1101 cFYI(1, ("invalidate inode, readdir detected change"));
1102 invalidate_remote_inode(tmp_inode);
1103 }
1104}
1105
1106int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) 1020int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1107{ 1021{
1108 int rc = 0, tmprc; 1022 int rc = 0, tmprc;
@@ -1111,6 +1025,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1111 struct cifsTconInfo *pTcon; 1025 struct cifsTconInfo *pTcon;
1112 char *full_path = NULL; 1026 char *full_path = NULL;
1113 struct inode *newinode = NULL; 1027 struct inode *newinode = NULL;
1028 struct cifs_fattr fattr;
1114 1029
1115 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); 1030 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
1116 1031
@@ -1121,8 +1036,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1121 1036
1122 full_path = build_path_from_dentry(direntry); 1037 full_path = build_path_from_dentry(direntry);
1123 if (full_path == NULL) { 1038 if (full_path == NULL) {
1039 rc = -ENOMEM;
1124 FreeXid(xid); 1040 FreeXid(xid);
1125 return -ENOMEM; 1041 return rc;
1126 } 1042 }
1127 1043
1128 if ((pTcon->ses->capabilities & CAP_UNIX) && 1044 if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1149,7 +1065,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1149 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1065 cFYI(1, ("posix mkdir returned 0x%x", rc));
1150 d_drop(direntry); 1066 d_drop(direntry);
1151 } else { 1067 } else {
1152 __u64 unique_id;
1153 if (pInfo->Type == cpu_to_le32(-1)) { 1068 if (pInfo->Type == cpu_to_le32(-1)) {
1154 /* no return info, go query for it */ 1069 /* no return info, go query for it */
1155 kfree(pInfo); 1070 kfree(pInfo);
@@ -1163,20 +1078,15 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1163 else 1078 else
1164 direntry->d_op = &cifs_dentry_ops; 1079 direntry->d_op = &cifs_dentry_ops;
1165 1080
1166 unique_id = le64_to_cpu(pInfo->UniqueId); 1081 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1167 newinode = cifs_new_inode(inode->i_sb, &unique_id); 1082 newinode = cifs_iget(inode->i_sb, &fattr);
1168 if (newinode == NULL) { 1083 if (!newinode) {
1169 kfree(pInfo); 1084 kfree(pInfo);
1170 goto mkdir_get_info; 1085 goto mkdir_get_info;
1171 } 1086 }
1172 1087
1173 newinode->i_nlink = 2;
1174 d_instantiate(direntry, newinode); 1088 d_instantiate(direntry, newinode);
1175 1089
1176 /* we already checked in POSIXCreate whether
1177 frame was long enough */
1178 posix_fill_in_inode(direntry->d_inode,
1179 pInfo, 1 /* NewInode */);
1180#ifdef CONFIG_CIFS_DEBUG2 1090#ifdef CONFIG_CIFS_DEBUG2
1181 cFYI(1, ("instantiated dentry %p %s to inode %p", 1091 cFYI(1, ("instantiated dentry %p %s to inode %p",
1182 direntry, direntry->d_name.name, newinode)); 1092 direntry, direntry->d_name.name, newinode));
@@ -1239,10 +1149,10 @@ mkdir_get_info:
1239 args.uid = NO_CHANGE_64; 1149 args.uid = NO_CHANGE_64;
1240 args.gid = NO_CHANGE_64; 1150 args.gid = NO_CHANGE_64;
1241 } 1151 }
1242 CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, 1152 CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
1243 cifs_sb->local_nls, 1153 cifs_sb->local_nls,
1244 cifs_sb->mnt_cifs_flags & 1154 cifs_sb->mnt_cifs_flags &
1245 CIFS_MOUNT_MAP_SPECIAL_CHR); 1155 CIFS_MOUNT_MAP_SPECIAL_CHR);
1246 } else { 1156 } else {
1247 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && 1157 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1248 (mode & S_IWUGO) == 0) { 1158 (mode & S_IWUGO) == 0) {
@@ -1306,8 +1216,9 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1306 1216
1307 full_path = build_path_from_dentry(direntry); 1217 full_path = build_path_from_dentry(direntry);
1308 if (full_path == NULL) { 1218 if (full_path == NULL) {
1219 rc = -ENOMEM;
1309 FreeXid(xid); 1220 FreeXid(xid);
1310 return -ENOMEM; 1221 return rc;
1311 } 1222 }
1312 1223
1313 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, 1224 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
@@ -1511,8 +1422,9 @@ int cifs_revalidate(struct dentry *direntry)
1511 since that would deadlock */ 1422 since that would deadlock */
1512 full_path = build_path_from_dentry(direntry); 1423 full_path = build_path_from_dentry(direntry);
1513 if (full_path == NULL) { 1424 if (full_path == NULL) {
1425 rc = -ENOMEM;
1514 FreeXid(xid); 1426 FreeXid(xid);
1515 return -ENOMEM; 1427 return rc;
1516 } 1428 }
1517 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " 1429 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1518 "jiffies %ld", full_path, direntry->d_inode, 1430 "jiffies %ld", full_path, direntry->d_inode,
@@ -1621,6 +1533,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1621 if (!err) { 1533 if (!err) {
1622 generic_fillattr(dentry->d_inode, stat); 1534 generic_fillattr(dentry->d_inode, stat);
1623 stat->blksize = CIFS_MAX_MSGSIZE; 1535 stat->blksize = CIFS_MAX_MSGSIZE;
1536 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1624 } 1537 }
1625 return err; 1538 return err;
1626} 1539}
@@ -1785,6 +1698,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1785 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1698 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1786 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1699 struct cifsTconInfo *pTcon = cifs_sb->tcon;
1787 struct cifs_unix_set_info_args *args = NULL; 1700 struct cifs_unix_set_info_args *args = NULL;
1701 struct cifsFileInfo *open_file;
1788 1702
1789 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1703 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
1790 direntry->d_name.name, attrs->ia_valid)); 1704 direntry->d_name.name, attrs->ia_valid));
@@ -1871,10 +1785,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1871 args->ctime = NO_CHANGE_64; 1785 args->ctime = NO_CHANGE_64;
1872 1786
1873 args->device = 0; 1787 args->device = 0;
1874 rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args, 1788 open_file = find_writable_file(cifsInode);
1875 cifs_sb->local_nls, 1789 if (open_file) {
1876 cifs_sb->mnt_cifs_flags & 1790 u16 nfid = open_file->netfid;
1877 CIFS_MOUNT_MAP_SPECIAL_CHR); 1791 u32 npid = open_file->pid;
1792 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1793 atomic_dec(&open_file->wrtPending);
1794 } else {
1795 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1796 cifs_sb->local_nls,
1797 cifs_sb->mnt_cifs_flags &
1798 CIFS_MOUNT_MAP_SPECIAL_CHR);
1799 }
1878 1800
1879 if (!rc) 1801 if (!rc)
1880 rc = inode_setattr(inode, attrs); 1802 rc = inode_setattr(inode, attrs);
@@ -1914,8 +1836,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1914 1836
1915 full_path = build_path_from_dentry(direntry); 1837 full_path = build_path_from_dentry(direntry);
1916 if (full_path == NULL) { 1838 if (full_path == NULL) {
1839 rc = -ENOMEM;
1917 FreeXid(xid); 1840 FreeXid(xid);
1918 return -ENOMEM; 1841 return rc;
1919 } 1842 }
1920 1843
1921 /* 1844 /*
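The new cifs_iget() above keys the inode cache on the 64-bit server uniqueid: iget5_locked() hashes on cifs_uniqueid_to_ino_t(fattr->cf_uniqueid) but matches via cifs_find_inode(), so a hash collision on a 32-bit ino_t still resolves to the right inode. The fold helper itself is defined outside this hunk; the sketch below is a plausible standalone version of the idea, and the XOR fold is an assumption rather than a quote from the patch:

#include <stdint.h>
#include <stdio.h>

/* Fold a 64-bit uniqueid down to a 32-bit inode-number hash by XORing
 * the high word into the low word. Illustrative only; the kernel helper
 * is cifs_uniqueid_to_ino_t(), whose body is not shown in this hunk. */
static uint32_t uniqueid_to_ino32(uint64_t uniqueid)
{
        return (uint32_t)uniqueid ^ (uint32_t)(uniqueid >> 32);
}

int main(void)
{
        printf("ino=%#x\n", uniqueid_to_ino32(0x9abcdef012345678ULL));
        return 0; /* prints ino=0x88888888 */
}

Because two uniqueids can fold to the same hash, the cifs_find_inode() callback comparing CIFS_I(inode)->uniqueid is what keeps collisions from aliasing distinct inodes.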
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cd83c53fcbb5..fc1e0487eaee 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -172,8 +172,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
172 full_path = build_path_from_dentry(direntry); 172 full_path = build_path_from_dentry(direntry);
173 173
174 if (full_path == NULL) { 174 if (full_path == NULL) {
175 rc = -ENOMEM;
175 FreeXid(xid); 176 FreeXid(xid);
176 return -ENOMEM; 177 return rc;
177 } 178 }
178 179
179 cFYI(1, ("Full path: %s", full_path)); 180 cFYI(1, ("Full path: %s", full_path));
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e2fe998989a3..bd6d6895730d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -133,10 +133,12 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
133 {0, 0} 133 {0, 0}
134}; 134};
135 135
136/* Convert string containing dotted ip address to binary form */ 136/*
137/* returns 0 if invalid address */ 137 * Convert a string containing text IPv4 or IPv6 address to binary form.
138 138 *
139int 139 * Returns 0 on failure.
140 */
141static int
140cifs_inet_pton(const int address_family, const char *cp, void *dst) 142cifs_inet_pton(const int address_family, const char *cp, void *dst)
141{ 143{
142 int ret = 0; 144 int ret = 0;
@@ -153,6 +155,52 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
153 return ret; 155 return ret;
154} 156}
155 157
158/*
159 * Try to convert a string to an IPv4 address and then attempt to convert
160 * it to an IPv6 address if that fails. Set the family field if either
161 * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to
162 * treat the part following it as a numeric sin6_scope_id.
163 *
164 * Returns 0 on failure.
165 */
166int
167cifs_convert_address(char *src, void *dst)
168{
169 int rc;
170 char *pct, *endp;
171 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
172 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
173
174 /* IPv4 address */
175 if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) {
176 s4->sin_family = AF_INET;
177 return 1;
178 }
179
180 /* temporarily terminate string */
181 pct = strchr(src, '%');
182 if (pct)
183 *pct = '\0';
184
185 rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
186
187 /* repair temp termination (if any) and make pct point to scopeid */
188 if (pct)
189 *pct++ = '%';
190
191 if (!rc)
192 return rc;
193
194 s6->sin6_family = AF_INET6;
195 if (pct) {
196 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
197 if (!*pct || *endp)
198 return 0;
199 }
200
201 return rc;
202}
203
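A userspace analog of the cifs_convert_address() flow above: try IPv4 first, then IPv6, temporarily NUL-terminating at '%' and treating the remainder as a numeric sin6_scope_id. The function name and the use of libc inet_pton()/strtoul() are illustrative stand-ins for the kernel helpers:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int convert_address(char *src, struct sockaddr_storage *dst)
{
        struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
        struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;
        char *pct, *endp;

        if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
                s4->sin_family = AF_INET;
                return 1;
        }

        pct = strchr(src, '%');         /* temporarily terminate string */
        if (pct)
                *pct = '\0';

        if (inet_pton(AF_INET6, src, &s6->sin6_addr) != 1) {
                if (pct)
                        *pct = '%';     /* repair temp termination */
                return 0;
        }
        if (pct)
                *pct++ = '%';           /* repair; pct now points at scope */

        s6->sin6_family = AF_INET6;
        if (pct) {
                s6->sin6_scope_id = (uint32_t)strtoul(pct, &endp, 0);
                if (!*pct || *endp)
                        return 0;       /* empty or non-numeric scope id */
        }
        return 1;
}

int main(void)
{
        struct sockaddr_storage ss;
        char addr[] = "fe80::1%2";
        printf("%d\n", convert_address(addr, &ss)); /* prints 1 */
        return 0;
}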
156/***************************************************************************** 204/*****************************************************************************
157convert a NT status code to a dos class/code 205convert a NT status code to a dos class/code
158 *****************************************************************************/ 206 *****************************************************************************/
@@ -853,12 +901,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
853 901
854#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) 902#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
855 903
856 /* 904/*
857 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) 905 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
858 * into Unix UTC (based 1970-01-01, in seconds). 906 * into Unix UTC (based 1970-01-01, in seconds).
859 */ 907 */
860struct timespec 908struct timespec
861cifs_NTtimeToUnix(u64 ntutc) 909cifs_NTtimeToUnix(__le64 ntutc)
862{ 910{
863 struct timespec ts; 911 struct timespec ts;
864 /* BB what about the timezone? BB */ 912 /* BB what about the timezone? BB */
@@ -866,7 +914,7 @@ cifs_NTtimeToUnix(u64 ntutc)
866 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 914 /* Subtract the NTFS time offset, then convert to 1s intervals. */
867 u64 t; 915 u64 t;
868 916
869 t = ntutc - NTFS_TIME_OFFSET; 917 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
870 ts.tv_nsec = do_div(t, 10000000) * 100; 918 ts.tv_nsec = do_div(t, 10000000) * 100;
871 ts.tv_sec = t; 919 ts.tv_sec = t;
872 return ts; 920 return ts;
@@ -883,16 +931,12 @@ cifs_UnixTimeToNT(struct timespec t)
883static int total_days_of_prev_months[] = 931static int total_days_of_prev_months[] =
884{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; 932{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
885 933
886 934struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
887__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
888{
889 return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
890}
891
892struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
893{ 935{
894 struct timespec ts; 936 struct timespec ts;
895 int sec, min, days, month, year; 937 int sec, min, days, month, year;
938 u16 date = le16_to_cpu(le_date);
939 u16 time = le16_to_cpu(le_time);
896 SMB_TIME *st = (SMB_TIME *)&time; 940 SMB_TIME *st = (SMB_TIME *)&time;
897 SMB_DATE *sd = (SMB_DATE *)&date; 941 SMB_DATE *sd = (SMB_DATE *)&date;
898 942
@@ -933,7 +977,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
933 days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); 977 days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
934 sec += 24 * 60 * 60 * days; 978 sec += 24 * 60 * 60 * days;
935 979
936 ts.tv_sec = sec; 980 ts.tv_sec = sec + offset;
937 981
938 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
939 983
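For reference, NTFS_TIME_OFFSET above encodes the 1601-to-1970 epoch gap in 100 ns ticks: 369 years plus 89 leap days is 134,774 days, i.e. 11,644,473,600 seconds. A standalone sketch of the same arithmetic that cifs_NTtimeToUnix() performs, with the le64 handling and do_div() replaced by plain C:

#include <stdint.h>
#include <stdio.h>

#define NTFS_TIME_OFFSET ((uint64_t)(369 * 365 + 89) * 24 * 3600 * 10000000)

/* NT UTC (100 ns ticks since 1601-01-01) -> Unix UTC (since 1970-01-01) */
static void nt_to_unix(uint64_t ntutc, uint64_t *sec, uint32_t *nsec)
{
        uint64_t t = ntutc - NTFS_TIME_OFFSET;

        *nsec = (uint32_t)(t % 10000000) * 100;
        *sec = t / 10000000;
}

int main(void)
{
        uint64_t sec;
        uint32_t nsec;

        /* 2009-01-01T00:00:00Z expressed as NT ticks */
        nt_to_unix(128752416000000000ULL, &sec, &nsec);
        printf("%llu.%09u\n", (unsigned long long)sec, nsec);
        return 0; /* prints 1230768000.000000000 */
}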
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 964e097c8203..f823a4a208a7 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -63,386 +63,123 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
63} 63}
64#endif /* DEBUG2 */ 64#endif /* DEBUG2 */
65 65
66/* Returns 1 if new inode created, 2 if both dentry and inode were */ 66/*
67/* Might check in the future if inode number changed so we can rehash inode */ 67 * Find the dentry that matches "name". If there isn't one, create one. If it's
68static int 68 * a negative dentry or the uniqueid changed, then drop it and recreate it.
69construct_dentry(struct qstr *qstring, struct file *file, 69 */
70 struct inode **ptmp_inode, struct dentry **pnew_dentry, 70static struct dentry *
71 __u64 *inum) 71cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
72 struct cifs_fattr *fattr)
72{ 73{
73 struct dentry *tmp_dentry = NULL; 74 struct dentry *dentry, *alias;
74 struct super_block *sb = file->f_path.dentry->d_sb; 75 struct inode *inode;
75 int rc = 0; 76 struct super_block *sb = parent->d_inode->i_sb;
76 77
77 cFYI(1, ("For %s", qstring->name)); 78 cFYI(1, ("For %s", name->name));
78 79
79 qstring->hash = full_name_hash(qstring->name, qstring->len); 80 dentry = d_lookup(parent, name);
80 tmp_dentry = d_lookup(file->f_path.dentry, qstring); 81 if (dentry) {
81 if (tmp_dentry) { 82 /* FIXME: check for inode number changes? */
82 /* BB: overwrite old name? i.e. tmp_dentry->d_name and 83 if (dentry->d_inode != NULL)
83 * tmp_dentry->d_name.len?? 84 return dentry;
84 */ 85 d_drop(dentry);
85 cFYI(0, ("existing dentry with inode 0x%p", 86 dput(dentry);
86 tmp_dentry->d_inode));
87 *ptmp_inode = tmp_dentry->d_inode;
88 if (*ptmp_inode == NULL) {
89 *ptmp_inode = cifs_new_inode(sb, inum);
90 if (*ptmp_inode == NULL)
91 return rc;
92 rc = 1;
93 }
94 } else {
95 tmp_dentry = d_alloc(file->f_path.dentry, qstring);
96 if (tmp_dentry == NULL) {
97 cERROR(1, ("Failed allocating dentry"));
98 *ptmp_inode = NULL;
99 return rc;
100 }
101
102 if (CIFS_SB(sb)->tcon->nocase)
103 tmp_dentry->d_op = &cifs_ci_dentry_ops;
104 else
105 tmp_dentry->d_op = &cifs_dentry_ops;
106
107 *ptmp_inode = cifs_new_inode(sb, inum);
108 if (*ptmp_inode == NULL)
109 return rc;
110 rc = 2;
111 } 87 }
112 88
113 tmp_dentry->d_time = jiffies; 89 dentry = d_alloc(parent, name);
114 *pnew_dentry = tmp_dentry; 90 if (dentry == NULL)
115 return rc; 91 return NULL;
116}
117 92
118static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode) 93 inode = cifs_iget(sb, fattr);
119{ 94 if (!inode) {
120 if ((tcon) && (tcon->ses) && (tcon->ses->server)) { 95 dput(dentry);
121 inode->i_ctime.tv_sec += tcon->ses->server->timeAdj; 96 return NULL;
122 inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
123 inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
124 } 97 }
125 return;
126}
127
128 98
129static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, 99 if (CIFS_SB(sb)->tcon->nocase)
130 char *buf, unsigned int *pobject_type, int isNewInode) 100 dentry->d_op = &cifs_ci_dentry_ops;
131{ 101 else
132 loff_t local_size; 102 dentry->d_op = &cifs_dentry_ops;
133 struct timespec local_mtime; 103
134 104 alias = d_materialise_unique(dentry, inode);
135 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); 105 if (alias != NULL) {
136 struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb); 106 dput(dentry);
137 __u32 attr; 107 if (IS_ERR(alias))
138 __u64 allocation_size; 108 return NULL;
139 __u64 end_of_file; 109 dentry = alias;
140 umode_t default_mode;
141
142 /* save mtime and size */
143 local_mtime = tmp_inode->i_mtime;
144 local_size = tmp_inode->i_size;
145
146 if (new_buf_type) {
147 FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
148
149 attr = le32_to_cpu(pfindData->ExtFileAttributes);
150 allocation_size = le64_to_cpu(pfindData->AllocationSize);
151 end_of_file = le64_to_cpu(pfindData->EndOfFile);
152 tmp_inode->i_atime =
153 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
154 tmp_inode->i_mtime =
155 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
156 tmp_inode->i_ctime =
157 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
158 } else { /* legacy, OS2 and DOS style */
159/* struct timespec ts;*/
160 FIND_FILE_STANDARD_INFO *pfindData =
161 (FIND_FILE_STANDARD_INFO *)buf;
162
163 tmp_inode->i_mtime = cnvrtDosUnixTm(
164 le16_to_cpu(pfindData->LastWriteDate),
165 le16_to_cpu(pfindData->LastWriteTime));
166 tmp_inode->i_atime = cnvrtDosUnixTm(
167 le16_to_cpu(pfindData->LastAccessDate),
168 le16_to_cpu(pfindData->LastAccessTime));
169 tmp_inode->i_ctime = cnvrtDosUnixTm(
170 le16_to_cpu(pfindData->LastWriteDate),
171 le16_to_cpu(pfindData->LastWriteTime));
172 AdjustForTZ(cifs_sb->tcon, tmp_inode);
173 attr = le16_to_cpu(pfindData->Attributes);
174 allocation_size = le32_to_cpu(pfindData->AllocationSize);
175 end_of_file = le32_to_cpu(pfindData->DataSize);
176 } 110 }
177 111
178 /* Linux can not store file creation time unfortunately so ignore it */ 112 return dentry;
113}
179 114
180 cifsInfo->cifsAttrs = attr; 115static void
181#ifdef CONFIG_CIFS_EXPERIMENTAL 116cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
182 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 117{
183 /* get more accurate mode via ACL - so force inode refresh */ 118 fattr->cf_uid = cifs_sb->mnt_uid;
184 cifsInfo->time = 0; 119 fattr->cf_gid = cifs_sb->mnt_gid;
185 } else
186#endif /* CONFIG_CIFS_EXPERIMENTAL */
187 cifsInfo->time = jiffies;
188
189 /* treat dos attribute of read-only as read-only mode bit e.g. 555? */
190 /* 2767 perms - indicate mandatory locking */
191 /* BB fill in uid and gid here? with help from winbind?
192 or retrieve from NTFS stream extended attribute */
193 if (atomic_read(&cifsInfo->inUse) == 0) {
194 tmp_inode->i_uid = cifs_sb->mnt_uid;
195 tmp_inode->i_gid = cifs_sb->mnt_gid;
196 }
197 120
198 if (attr & ATTR_DIRECTORY) 121 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
199 default_mode = cifs_sb->mnt_dir_mode; 122 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
200 else 123 fattr->cf_dtype = DT_DIR;
201 default_mode = cifs_sb->mnt_file_mode; 124 } else {
202 125 fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
203 /* set initial permissions */ 126 fattr->cf_dtype = DT_REG;
204 if ((atomic_read(&cifsInfo->inUse) == 0) ||
205 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
206 tmp_inode->i_mode = default_mode;
207 else {
208 /* just reenable write bits if !ATTR_READONLY */
209 if ((tmp_inode->i_mode & S_IWUGO) == 0 &&
210 (attr & ATTR_READONLY) == 0)
211 tmp_inode->i_mode |= (S_IWUGO & default_mode);
212
213 tmp_inode->i_mode &= ~S_IFMT;
214 } 127 }
215 128
216 /* clear write bits if ATTR_READONLY is set */ 129 if (fattr->cf_cifsattrs & ATTR_READONLY)
217 if (attr & ATTR_READONLY) 130 fattr->cf_mode &= ~S_IWUGO;
218 tmp_inode->i_mode &= ~S_IWUGO;
219 131
220 /* set inode type */ 132 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
221 if ((attr & ATTR_SYSTEM) && 133 fattr->cf_cifsattrs & ATTR_SYSTEM) {
222 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { 134 if (fattr->cf_eof == 0) {
223 if (end_of_file == 0) { 135 fattr->cf_mode &= ~S_IFMT;
224 tmp_inode->i_mode |= S_IFIFO; 136 fattr->cf_mode |= S_IFIFO;
225 *pobject_type = DT_FIFO; 137 fattr->cf_dtype = DT_FIFO;
226 } else { 138 } else {
227 /* 139 /*
228 * trying to get the type can be slow, so just call 140 * trying to get the type and mode via SFU can be slow,
229 * this a regular file for now, and mark for reval 141 * so just call those regular files for now, and mark
142 * for reval
230 */ 143 */
231 tmp_inode->i_mode |= S_IFREG; 144 fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
232 *pobject_type = DT_REG;
233 cifsInfo->time = 0;
234 }
235 } else {
236 if (attr & ATTR_DIRECTORY) {
237 tmp_inode->i_mode |= S_IFDIR;
238 *pobject_type = DT_DIR;
239 } else {
240 tmp_inode->i_mode |= S_IFREG;
241 *pobject_type = DT_REG;
242 } 145 }
243 } 146 }
147}
244 148
245 /* can not fill in nlink here as in qpathinfo version and Unx search */ 149void
246 if (atomic_read(&cifsInfo->inUse) == 0) 150cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
247 atomic_set(&cifsInfo->inUse, 1); 151 struct cifs_sb_info *cifs_sb)
248 152{
249 cifsInfo->server_eof = end_of_file; 153 memset(fattr, 0, sizeof(*fattr));
250 spin_lock(&tmp_inode->i_lock); 154 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
251 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 155 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
252 /* can not safely change the file size here if the 156 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
253 client is writing to it due to potential races */ 157 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
254 i_size_write(tmp_inode, end_of_file); 158 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
255 159 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
256 /* 512 bytes (2**9) is the fake blocksize that must be used */ 160
257 /* for this calculation, even though the reported blocksize is larger */ 161 cifs_fill_common_info(fattr, cifs_sb);
258 tmp_inode->i_blocks = (512 - 1 + allocation_size) >> 9;
259 }
260 spin_unlock(&tmp_inode->i_lock);
261
262 if (allocation_size < end_of_file)
263 cFYI(1, ("May be sparse file, allocation less than file size"));
264 cFYI(1, ("File Size %ld and blocks %llu",
265 (unsigned long)tmp_inode->i_size,
266 (unsigned long long)tmp_inode->i_blocks));
267 if (S_ISREG(tmp_inode->i_mode)) {
268 cFYI(1, ("File inode"));
269 tmp_inode->i_op = &cifs_file_inode_ops;
270 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
271 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
272 tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
273 else
274 tmp_inode->i_fop = &cifs_file_direct_ops;
275 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
276 tmp_inode->i_fop = &cifs_file_nobrl_ops;
277 else
278 tmp_inode->i_fop = &cifs_file_ops;
279
280 if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
281 (cifs_sb->tcon->ses->server->maxBuf <
282 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
283 tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
284 else
285 tmp_inode->i_data.a_ops = &cifs_addr_ops;
286
287 if (isNewInode)
288 return; /* No sense invalidating pages for new inode
 289 since we have not started caching readahead file
290 data yet */
291
292 if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
293 (local_size == tmp_inode->i_size)) {
294 cFYI(1, ("inode exists but unchanged"));
295 } else {
296 /* file may have changed on server */
297 cFYI(1, ("invalidate inode, readdir detected change"));
298 invalidate_remote_inode(tmp_inode);
299 }
300 } else if (S_ISDIR(tmp_inode->i_mode)) {
301 cFYI(1, ("Directory inode"));
302 tmp_inode->i_op = &cifs_dir_inode_ops;
303 tmp_inode->i_fop = &cifs_dir_ops;
304 } else if (S_ISLNK(tmp_inode->i_mode)) {
305 cFYI(1, ("Symbolic Link inode"));
306 tmp_inode->i_op = &cifs_symlink_inode_ops;
307 } else {
308 cFYI(1, ("Init special inode"));
309 init_special_inode(tmp_inode, tmp_inode->i_mode,
310 tmp_inode->i_rdev);
311 }
312} 162}
313 163
314static void unix_fill_in_inode(struct inode *tmp_inode, 164void
315 FILE_UNIX_INFO *pfindData, unsigned int *pobject_type, int isNewInode) 165cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
166 struct cifs_sb_info *cifs_sb)
316{ 167{
317 loff_t local_size; 168 int offset = cifs_sb->tcon->ses->server->timeAdj;
318 struct timespec local_mtime;
319
320 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
321 struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
322
323 __u32 type = le32_to_cpu(pfindData->Type);
324 __u64 num_of_bytes = le64_to_cpu(pfindData->NumOfBytes);
325 __u64 end_of_file = le64_to_cpu(pfindData->EndOfFile);
326 cifsInfo->time = jiffies;
327 atomic_inc(&cifsInfo->inUse);
328
329 /* save mtime and size */
330 local_mtime = tmp_inode->i_mtime;
331 local_size = tmp_inode->i_size;
332
333 tmp_inode->i_atime =
334 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
335 tmp_inode->i_mtime =
336 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime));
337 tmp_inode->i_ctime =
338 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
339
340 tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
341 /* since we set the inode type below we need to mask off type
342 to avoid strange results if bits above were corrupt */
343 tmp_inode->i_mode &= ~S_IFMT;
344 if (type == UNIX_FILE) {
345 *pobject_type = DT_REG;
346 tmp_inode->i_mode |= S_IFREG;
347 } else if (type == UNIX_SYMLINK) {
348 *pobject_type = DT_LNK;
349 tmp_inode->i_mode |= S_IFLNK;
350 } else if (type == UNIX_DIR) {
351 *pobject_type = DT_DIR;
352 tmp_inode->i_mode |= S_IFDIR;
353 } else if (type == UNIX_CHARDEV) {
354 *pobject_type = DT_CHR;
355 tmp_inode->i_mode |= S_IFCHR;
356 tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor),
357 le64_to_cpu(pfindData->DevMinor) & MINORMASK);
358 } else if (type == UNIX_BLOCKDEV) {
359 *pobject_type = DT_BLK;
360 tmp_inode->i_mode |= S_IFBLK;
361 tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor),
362 le64_to_cpu(pfindData->DevMinor) & MINORMASK);
363 } else if (type == UNIX_FIFO) {
364 *pobject_type = DT_FIFO;
365 tmp_inode->i_mode |= S_IFIFO;
366 } else if (type == UNIX_SOCKET) {
367 *pobject_type = DT_SOCK;
368 tmp_inode->i_mode |= S_IFSOCK;
369 } else {
370 /* safest to just call it a file */
371 *pobject_type = DT_REG;
372 tmp_inode->i_mode |= S_IFREG;
373 cFYI(1, ("unknown inode type %d", type));
374 }
375 169
376 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 170 memset(fattr, 0, sizeof(*fattr));
377 tmp_inode->i_uid = cifs_sb->mnt_uid; 171 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
378 else 172 info->LastAccessTime, offset);
379 tmp_inode->i_uid = le64_to_cpu(pfindData->Uid); 173 fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate,
380 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) 174 info->LastWriteTime, offset);
381 tmp_inode->i_gid = cifs_sb->mnt_gid; 175 fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate,
382 else 176 info->LastWriteTime, offset);
383 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
384 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
385
386 cifsInfo->server_eof = end_of_file;
387 spin_lock(&tmp_inode->i_lock);
388 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
389 /* can not safely change the file size here if the
390 client is writing to it due to potential races */
391 i_size_write(tmp_inode, end_of_file);
392
393 /* 512 bytes (2**9) is the fake blocksize that must be used */
394 /* for this calculation, not the real blocksize */
395 tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
396 }
397 spin_unlock(&tmp_inode->i_lock);
398 177
399 if (S_ISREG(tmp_inode->i_mode)) { 178 fattr->cf_cifsattrs = le16_to_cpu(info->Attributes);
400 cFYI(1, ("File inode")); 179 fattr->cf_bytes = le32_to_cpu(info->AllocationSize);
401 tmp_inode->i_op = &cifs_file_inode_ops; 180 fattr->cf_eof = le32_to_cpu(info->DataSize);
402 181
403 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { 182 cifs_fill_common_info(fattr, cifs_sb);
404 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
405 tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
406 else
407 tmp_inode->i_fop = &cifs_file_direct_ops;
408 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
409 tmp_inode->i_fop = &cifs_file_nobrl_ops;
410 else
411 tmp_inode->i_fop = &cifs_file_ops;
412
413 if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
414 (cifs_sb->tcon->ses->server->maxBuf <
415 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
416 tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
417 else
418 tmp_inode->i_data.a_ops = &cifs_addr_ops;
419
420 if (isNewInode)
421 return; /* No sense invalidating pages for new inode
422 since we have not started caching readahead
423 file data for it yet */
424
425 if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
426 (local_size == tmp_inode->i_size)) {
427 cFYI(1, ("inode exists but unchanged"));
428 } else {
429 /* file may have changed on server */
430 cFYI(1, ("invalidate inode, readdir detected change"));
431 invalidate_remote_inode(tmp_inode);
432 }
433 } else if (S_ISDIR(tmp_inode->i_mode)) {
434 cFYI(1, ("Directory inode"));
435 tmp_inode->i_op = &cifs_dir_inode_ops;
436 tmp_inode->i_fop = &cifs_dir_ops;
437 } else if (S_ISLNK(tmp_inode->i_mode)) {
438 cFYI(1, ("Symbolic Link inode"));
439 tmp_inode->i_op = &cifs_symlink_inode_ops;
440/* tmp_inode->i_fop = *//* do not need to set to anything */
441 } else {
442 cFYI(1, ("Special inode"));
443 init_special_inode(tmp_inode, tmp_inode->i_mode,
444 tmp_inode->i_rdev);
445 }
446} 183}
447 184
448/* BB eventually need to add the following helper function to 185/* BB eventually need to add the following helper function to
@@ -884,7 +621,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
884 len = strnlen(filename, PATH_MAX); 621 len = strnlen(filename, PATH_MAX);
885 } 622 }
886 623
887 *pinum = le64_to_cpu(pFindData->UniqueId); 624 *pinum = le64_to_cpu(pFindData->basic.UniqueId);
888 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 625 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
889 FILE_DIRECTORY_INFO *pFindData = 626 FILE_DIRECTORY_INFO *pFindData =
890 (FILE_DIRECTORY_INFO *)current_entry; 627 (FILE_DIRECTORY_INFO *)current_entry;
@@ -944,11 +681,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
944 int rc = 0; 681 int rc = 0;
945 struct qstr qstring; 682 struct qstr qstring;
946 struct cifsFileInfo *pCifsF; 683 struct cifsFileInfo *pCifsF;
947 unsigned int obj_type; 684 u64 inum;
948 __u64 inum; 685 ino_t ino;
686 struct super_block *sb;
949 struct cifs_sb_info *cifs_sb; 687 struct cifs_sb_info *cifs_sb;
950 struct inode *tmp_inode;
951 struct dentry *tmp_dentry; 688 struct dentry *tmp_dentry;
689 struct cifs_fattr fattr;
952 690
953 /* get filename and len into qstring */ 691 /* get filename and len into qstring */
954 /* get dentry */ 692 /* get dentry */
@@ -966,60 +704,53 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
966 if (rc != 0) 704 if (rc != 0)
967 return 0; 705 return 0;
968 706
969 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 707 sb = file->f_path.dentry->d_sb;
708 cifs_sb = CIFS_SB(sb);
970 709
971 qstring.name = scratch_buf; 710 qstring.name = scratch_buf;
972 rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, 711 rc = cifs_get_name_from_search_buf(&qstring, pfindEntry,
973 pCifsF->srch_inf.info_level, 712 pCifsF->srch_inf.info_level,
974 pCifsF->srch_inf.unicode, cifs_sb, 713 pCifsF->srch_inf.unicode, cifs_sb,
975 max_len, 714 max_len, &inum /* returned */);
976 &inum /* returned */);
977 715
978 if (rc) 716 if (rc)
979 return rc; 717 return rc;
980 718
981 /* only these two infolevels return valid inode numbers */
982 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
983 pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
984 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
985 &inum);
986 else
987 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
988 NULL);
989
990 if ((tmp_inode == NULL) || (tmp_dentry == NULL))
991 return -ENOMEM;
992
993 /* we pass in rc below, indicating whether it is a new inode,
994 so we can figure out whether to invalidate the inode cached
995 data if the file has changed */
996 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) 719 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX)
997 unix_fill_in_inode(tmp_inode, 720 cifs_unix_basic_to_fattr(&fattr,
998 (FILE_UNIX_INFO *)pfindEntry, 721 &((FILE_UNIX_INFO *) pfindEntry)->basic,
999 &obj_type, rc); 722 cifs_sb);
1000 else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) 723 else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD)
1001 fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */, 724 cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *)
1002 pfindEntry, &obj_type, rc); 725 pfindEntry, cifs_sb);
1003 else 726 else
1004 fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc); 727 cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
728 pfindEntry, cifs_sb);
1005 729
1006 if (rc) /* new inode - needs to be tied to dentry */ { 730 /* FIXME: make _to_fattr functions fill this out */
1007 d_instantiate(tmp_dentry, tmp_inode); 731 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
1008 if (rc == 2) 732 fattr.cf_uniqueid = inum;
1009 d_rehash(tmp_dentry); 733 else
1010 } 734 fattr.cf_uniqueid = iunique(sb, ROOT_I);
1011 735
736 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
737 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
1012 738
1013 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 739 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
1014 tmp_inode->i_ino, obj_type); 740 ino, fattr.cf_dtype);
741
742 /*
743 * we can not return filldir errors to the caller since they are
744 * "normal" when the stat blocksize is too small - we return remapped
745 * error instead
746 *
747 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
748 * case already. Why should we be clobbering other errors from it?
749 */
1015 if (rc) { 750 if (rc) {
1016 cFYI(1, ("filldir rc = %d", rc)); 751 cFYI(1, ("filldir rc = %d", rc));
1017 /* we can not return filldir errors to the caller
1018 since they are "normal" when the stat blocksize
1019 is too small - we return remapped error instead */
1020 rc = -EOVERFLOW; 752 rc = -EOVERFLOW;
1021 } 753 }
1022
1023 dput(tmp_dentry); 754 dput(tmp_dentry);
1024 return rc; 755 return rc;
1025} 756}
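The mode and d_type selection in cifs_fill_common_info() earlier in this file reduces to a small pure function of the DOS attributes, so it is easy to check in isolation. A sketch under stated assumptions: the ATTR_* values are the standard SMB attribute bits, and the 0755/0644 modes stand in for the mount's mnt_dir_mode/mnt_file_mode:

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <dirent.h>

#define ATTR_READONLY  0x0001
#define ATTR_SYSTEM    0x0004
#define ATTR_DIRECTORY 0x0010

/* Mirror of the mode/dtype decision in cifs_fill_common_info(): directories
 * become DT_DIR, read-only files lose write bits, and with SFU emulation a
 * zero-length ATTR_SYSTEM file is treated as a FIFO. Non-empty system files
 * would be left DT_REG and marked for revalidation. */
static unsigned char dos_attrs_to_dtype(uint32_t attrs, uint64_t eof,
                                        int sfu_emul, mode_t *mode)
{
        if (attrs & ATTR_DIRECTORY) {
                *mode = S_IFDIR | 0755;
                return DT_DIR;
        }
        *mode = S_IFREG | 0644;
        if (attrs & ATTR_READONLY)
                *mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
        if (sfu_emul && (attrs & ATTR_SYSTEM) && eof == 0) {
                *mode = (*mode & ~S_IFMT) | S_IFIFO;
                return DT_FIFO;
        }
        return DT_REG;
}

int main(void)
{
        mode_t m;

        printf("%d\n", dos_attrs_to_dtype(ATTR_SYSTEM, 0, 1, &m));
        return 0; /* prints 1, i.e. DT_FIFO */
}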
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 897a052270f9..7085a6275c4c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -802,7 +802,7 @@ ssetup_ntlmssp_authenticate:
802#endif /* CONFIG_CIFS_UPCALL */ 802#endif /* CONFIG_CIFS_UPCALL */
803 } else { 803 } else {
804#ifdef CONFIG_CIFS_EXPERIMENTAL 804#ifdef CONFIG_CIFS_EXPERIMENTAL
805 if ((experimEnabled > 1) && (type == RawNTLMSSP)) { 805 if (type == RawNTLMSSP) {
806 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 806 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
807 cERROR(1, ("NTLMSSP requires Unicode support")); 807 cERROR(1, ("NTLMSSP requires Unicode support"));
808 rc = -ENOSYS; 808 rc = -ENOSYS;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index e9527eedc639..a75afa3dd9e1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -64,8 +64,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
64 64
65 full_path = build_path_from_dentry(direntry); 65 full_path = build_path_from_dentry(direntry);
66 if (full_path == NULL) { 66 if (full_path == NULL) {
67 rc = -ENOMEM;
67 FreeXid(xid); 68 FreeXid(xid);
68 return -ENOMEM; 69 return rc;
69 } 70 }
70 if (ea_name == NULL) { 71 if (ea_name == NULL) {
71 cFYI(1, ("Null xattr names not supported")); 72 cFYI(1, ("Null xattr names not supported"));
@@ -118,8 +119,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
118 119
119 full_path = build_path_from_dentry(direntry); 120 full_path = build_path_from_dentry(direntry);
120 if (full_path == NULL) { 121 if (full_path == NULL) {
122 rc = -ENOMEM;
121 FreeXid(xid); 123 FreeXid(xid);
122 return -ENOMEM; 124 return rc;
123 } 125 }
124 /* return dos attributes as pseudo xattr */ 126 /* return dos attributes as pseudo xattr */
125 /* return alt name if available as pseudo attr */ 127 /* return alt name if available as pseudo attr */
@@ -225,8 +227,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
225 227
226 full_path = build_path_from_dentry(direntry); 228 full_path = build_path_from_dentry(direntry);
227 if (full_path == NULL) { 229 if (full_path == NULL) {
230 rc = -ENOMEM;
228 FreeXid(xid); 231 FreeXid(xid);
229 return -ENOMEM; 232 return rc;
230 } 233 }
231 /* return dos attributes as pseudo xattr */ 234 /* return dos attributes as pseudo xattr */
232 /* return alt name if available as pseudo attr */ 235 /* return alt name if available as pseudo attr */
@@ -351,8 +354,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
351 354
352 full_path = build_path_from_dentry(direntry); 355 full_path = build_path_from_dentry(direntry);
353 if (full_path == NULL) { 356 if (full_path == NULL) {
357 rc = -ENOMEM;
354 FreeXid(xid); 358 FreeXid(xid);
355 return -ENOMEM; 359 return rc;
356 } 360 }
357 /* return dos attributes as pseudo xattr */ 361 /* return dos attributes as pseudo xattr */
358 /* return alt name if available as pseudo attr */ 362 /* return alt name if available as pseudo attr */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6a347fbc998a..ffd42815fda1 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
47 struct pipe_inode_info *pipe, size_t count, 47 struct pipe_inode_info *pipe, size_t count,
48 unsigned int flags) 48 unsigned int flags)
49{ 49{
50 ssize_t (*splice_read)(struct file *, loff_t *,
51 struct pipe_inode_info *, size_t, unsigned int);
50 struct coda_file_info *cfi; 52 struct coda_file_info *cfi;
51 struct file *host_file; 53 struct file *host_file;
52 54
@@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
54 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 56 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
55 host_file = cfi->cfi_container; 57 host_file = cfi->cfi_container;
56 58
57 if (!host_file->f_op || !host_file->f_op->splice_read) 59 splice_read = host_file->f_op->splice_read;
58 return -EINVAL; 60 if (!splice_read)
61 splice_read = default_file_splice_read;
59 62
60 return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); 63 return splice_read(host_file, ppos, pipe, count, flags);
61} 64}
62 65
63static ssize_t 66static ssize_t
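The coda change above swaps a hard -EINVAL for a fallback: when the host file's ->splice_read is unset, the generic default_file_splice_read is used instead of failing. The same shape in miniature, with all names illustrative rather than taken from the kernel:

#include <stdio.h>

typedef int (*splice_fn)(const char *what);

static int default_splice_read(const char *what)
{
        printf("generic splice of %s\n", what);
        return 0;
}

struct file_ops {
        splice_fn splice_read;
};

static int do_splice(const struct file_ops *ops, const char *what)
{
        splice_fn fn = ops->splice_read;

        if (!fn)
                fn = default_splice_read; /* fall back instead of -EINVAL */
        return fn(what);
}

int main(void)
{
        struct file_ops none = { 0 };

        return do_splice(&none, "host_file");
}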
diff --git a/fs/compat.c b/fs/compat.c
index 681ed81e6be0..94502dab972a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -32,7 +32,6 @@
32#include <linux/smb_mount.h> 32#include <linux/smb_mount.h>
33#include <linux/ncp_mount.h> 33#include <linux/ncp_mount.h>
34#include <linux/nfs4_mount.h> 34#include <linux/nfs4_mount.h>
35#include <linux/smp_lock.h>
36#include <linux/syscalls.h> 35#include <linux/syscalls.h>
37#include <linux/ctype.h> 36#include <linux/ctype.h>
38#include <linux/module.h> 37#include <linux/module.h>
@@ -471,7 +470,7 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
471 ret = sys_fcntl(fd, cmd, (unsigned long)&f); 470 ret = sys_fcntl(fd, cmd, (unsigned long)&f);
472 set_fs(old_fs); 471 set_fs(old_fs);
473 if (cmd == F_GETLK && ret == 0) { 472 if (cmd == F_GETLK && ret == 0) {
474 /* GETLK was successfule and we need to return the data... 473 /* GETLK was successful and we need to return the data...
475 * but it needs to fit in the compat structure. 474 * but it needs to fit in the compat structure.
476 * l_start shouldn't be too big, unless the original 475 * l_start shouldn't be too big, unless the original
477 * start + end is greater than COMPAT_OFF_T_MAX, in which 476 * start + end is greater than COMPAT_OFF_T_MAX, in which
@@ -812,10 +811,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
812 } 811 }
813 } 812 }
814 813
815 lock_kernel();
816 retval = do_mount((char*)dev_page, dir_page, (char*)type_page, 814 retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
817 flags, (void*)data_page); 815 flags, (void*)data_page);
818 unlock_kernel();
819 816
820 out4: 817 out4:
821 free_page(data_page); 818 free_page(data_page);
@@ -1488,8 +1485,8 @@ int compat_do_execve(char * filename,
1488 if (!bprm) 1485 if (!bprm)
1489 goto out_files; 1486 goto out_files;
1490 1487
1491 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1488 retval = -ERESTARTNOINTR;
1492 if (retval < 0) 1489 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1493 goto out_free; 1490 goto out_free;
1494 current->in_execve = 1; 1491 current->in_execve = 1;
1495 1492
@@ -1550,7 +1547,7 @@ int compat_do_execve(char * filename,
1550 /* execve succeeded */ 1547 /* execve succeeded */
1551 current->fs->in_exec = 0; 1548 current->fs->in_exec = 0;
1552 current->in_execve = 0; 1549 current->in_execve = 0;
1553 mutex_unlock(&current->cred_exec_mutex); 1550 mutex_unlock(&current->cred_guard_mutex);
1554 acct_update_integrals(current); 1551 acct_update_integrals(current);
1555 free_bprm(bprm); 1552 free_bprm(bprm);
1556 if (displaced) 1553 if (displaced)
@@ -1573,7 +1570,7 @@ out_unmark:
1573 1570
1574out_unlock: 1571out_unlock:
1575 current->in_execve = 0; 1572 current->in_execve = 0;
1576 mutex_unlock(&current->cred_exec_mutex); 1573 mutex_unlock(&current->cred_guard_mutex);
1577 1574
1578out_free: 1575out_free:
1579 free_bprm(bprm); 1576 free_bprm(bprm);
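
Two independent changes in fs/compat.c above: compat_sys_mount() no
longer wraps do_mount() in the big kernel lock, and compat_do_execve()
follows the cred_exec_mutex -> cred_guard_mutex rename. The error
handling changes with it: mutex_lock_interruptible() would yield
-EINTR, but execve wants an interrupted attempt restarted transparently
once the signal handler returns, hence the preset -ERESTARTNOINTR.
The idiom in isolation (lock_or_restart() is a hypothetical helper):

static long lock_or_restart(struct mutex *m)
{
	/* -ERESTARTNOINTR makes signal delivery restart the syscall
	 * instead of surfacing an error to userspace */
	if (mutex_lock_interruptible(m))
		return -ERESTARTNOINTR;
	return 0;
}
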
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index b83f6bcfa51a..f91fd51b32e3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,6 +19,7 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
22#include <linux/ioctl.h> 23#include <linux/ioctl.h>
23#include <linux/if.h> 24#include <linux/if.h>
24#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
@@ -31,6 +32,7 @@
31#include <linux/skbuff.h> 32#include <linux/skbuff.h>
32#include <linux/netlink.h> 33#include <linux/netlink.h>
33#include <linux/vt.h> 34#include <linux/vt.h>
35#include <linux/falloc.h>
34#include <linux/fs.h> 36#include <linux/fs.h>
35#include <linux/file.h> 37#include <linux/file.h>
36#include <linux/ppp_defs.h> 38#include <linux/ppp_defs.h>
@@ -94,7 +96,6 @@
94#include <linux/atm_tcp.h> 96#include <linux/atm_tcp.h>
95#include <linux/sonet.h> 97#include <linux/sonet.h>
96#include <linux/atm_suni.h> 98#include <linux/atm_suni.h>
97#include <linux/mtd/mtd.h>
98 99
99#include <linux/usb.h> 100#include <linux/usb.h>
100#include <linux/usbdevice_fs.h> 101#include <linux/usbdevice_fs.h>
@@ -788,12 +789,6 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
788 if (put_user(compat_ptr(data), &sgio->usr_ptr)) 789 if (put_user(compat_ptr(data), &sgio->usr_ptr))
789 return -EFAULT; 790 return -EFAULT;
790 791
791 if (copy_in_user(&sgio->status, &sgio32->status,
792 (4 * sizeof(unsigned char)) +
793 (2 * sizeof(unsigned short)) +
794 (3 * sizeof(int))))
795 return -EFAULT;
796
797 err = sys_ioctl(fd, cmd, (unsigned long) sgio); 792 err = sys_ioctl(fd, cmd, (unsigned long) sgio);
798 793
799 if (err >= 0) { 794 if (err >= 0) {
@@ -1411,46 +1406,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
1411#define HIDPGETCONNLIST _IOR('H', 210, int) 1406#define HIDPGETCONNLIST _IOR('H', 210, int)
1412#define HIDPGETCONNINFO _IOR('H', 211, int) 1407#define HIDPGETCONNINFO _IOR('H', 211, int)
1413 1408
1414struct mtd_oob_buf32 {
1415 u_int32_t start;
1416 u_int32_t length;
1417 compat_caddr_t ptr; /* unsigned char* */
1418};
1419
1420#define MEMWRITEOOB32 _IOWR('M',3,struct mtd_oob_buf32)
1421#define MEMREADOOB32 _IOWR('M',4,struct mtd_oob_buf32)
1422
1423static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
1424{
1425 struct mtd_oob_buf __user *buf = compat_alloc_user_space(sizeof(*buf));
1426 struct mtd_oob_buf32 __user *buf32 = compat_ptr(arg);
1427 u32 data;
1428 char __user *datap;
1429 unsigned int real_cmd;
1430 int err;
1431
1432 real_cmd = (cmd == MEMREADOOB32) ?
1433 MEMREADOOB : MEMWRITEOOB;
1434
1435 if (copy_in_user(&buf->start, &buf32->start,
1436 2 * sizeof(u32)) ||
1437 get_user(data, &buf32->ptr))
1438 return -EFAULT;
1439 datap = compat_ptr(data);
1440 if (put_user(datap, &buf->ptr))
1441 return -EFAULT;
1442
1443 err = sys_ioctl(fd, real_cmd, (unsigned long) buf);
1444
1445 if (!err) {
1446 if (copy_in_user(&buf32->start, &buf->start,
1447 2 * sizeof(u32)))
1448 err = -EFAULT;
1449 }
1450
1451 return err;
1452}
1453
1454#ifdef CONFIG_BLOCK 1409#ifdef CONFIG_BLOCK
1455struct raw32_config_request 1410struct raw32_config_request
1456{ 1411{
@@ -1765,7 +1720,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
1765 1720
1766/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE 1721/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
1767 * for some operations; this forces use of the newer bridge-utils that 1722 * for some operations; this forces use of the newer bridge-utils that
1768 * use compatiable ioctls 1723 * use compatible ioctls
1769 */ 1724 */
1770static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 1725static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
1771{ 1726{
@@ -1826,6 +1781,41 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1826 return sys_ioctl(fd, cmd, (unsigned long)tn); 1781 return sys_ioctl(fd, cmd, (unsigned long)tn);
1827} 1782}
1828 1783
1784/* on ia32 l_start is on a 32-bit boundary */
1785#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
1786struct space_resv_32 {
1787 __s16 l_type;
1788 __s16 l_whence;
1789 __s64 l_start __attribute__((packed));
1790 /* len == 0 means until end of file */
1791 __s64 l_len __attribute__((packed));
1792 __s32 l_sysid;
1793 __u32 l_pid;
1794 __s32 l_pad[4]; /* reserve area */
1795};
1796
1797#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32)
1798#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32)
1799
1800/* just account for different alignment */
1801static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
1802{
1803 struct space_resv_32 __user *p32 = (void __user *)arg;
1804 struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
1805
1806 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
1807 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) ||
1808 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) ||
1809 copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) ||
1810 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) ||
1811 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) ||
1812 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
1813 return -EFAULT;
1814
1815 return ioctl_preallocate(file, p);
1816}
1817#endif
1818
1829 1819
1830typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, 1820typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int,
1831 unsigned long, struct file *); 1821 unsigned long, struct file *);
@@ -1915,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
1915COMPATIBLE_IOCTL(FIOASYNC) 1905COMPATIBLE_IOCTL(FIOASYNC)
1916COMPATIBLE_IOCTL(FIONBIO) 1906COMPATIBLE_IOCTL(FIONBIO)
1917COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ 1907COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
1908COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
1918/* 0x00 */ 1909/* 0x00 */
1919COMPATIBLE_IOCTL(FIBMAP) 1910COMPATIBLE_IOCTL(FIBMAP)
1920COMPATIBLE_IOCTL(FIGETBSZ) 1911COMPATIBLE_IOCTL(FIGETBSZ)
@@ -2432,15 +2423,6 @@ COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
2432COMPATIBLE_IOCTL(USBDEVFS_REAPURB32) 2423COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
2433COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32) 2424COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
2434COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT) 2425COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
2435/* MTD */
2436COMPATIBLE_IOCTL(MEMGETINFO)
2437COMPATIBLE_IOCTL(MEMERASE)
2438COMPATIBLE_IOCTL(MEMLOCK)
2439COMPATIBLE_IOCTL(MEMUNLOCK)
2440COMPATIBLE_IOCTL(MEMGETREGIONCOUNT)
2441COMPATIBLE_IOCTL(MEMGETREGIONINFO)
2442COMPATIBLE_IOCTL(MEMGETBADBLOCK)
2443COMPATIBLE_IOCTL(MEMSETBADBLOCK)
2444/* NBD */ 2426/* NBD */
2445ULONG_IOCTL(NBD_SET_SOCK) 2427ULONG_IOCTL(NBD_SET_SOCK)
2446ULONG_IOCTL(NBD_SET_BLKSIZE) 2428ULONG_IOCTL(NBD_SET_BLKSIZE)
@@ -2550,8 +2532,6 @@ COMPATIBLE_IOCTL(JSIOCGBUTTONS)
2550COMPATIBLE_IOCTL(JSIOCGNAME(0)) 2532COMPATIBLE_IOCTL(JSIOCGNAME(0))
2551 2533
2552/* now things that need handlers */ 2534/* now things that need handlers */
2553HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
2554HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
2555#ifdef CONFIG_NET 2535#ifdef CONFIG_NET
2556HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32) 2536HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
2557HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf) 2537HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
@@ -2814,6 +2794,18 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2814 case FIOQSIZE: 2794 case FIOQSIZE:
2815 break; 2795 break;
2816 2796
2797#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
2798 case FS_IOC_RESVSP_32:
2799 case FS_IOC_RESVSP64_32:
2800 error = compat_ioctl_preallocate(filp, arg);
2801 goto out_fput;
2802#else
2803 case FS_IOC_RESVSP:
2804 case FS_IOC_RESVSP64:
2805 error = ioctl_preallocate(filp, (void __user *)arg);
2806 goto out_fput;
2807#endif
2808
2817 case FIBMAP: 2809 case FIBMAP:
2818 case FIGETBSZ: 2810 case FIGETBSZ:
2819 case FIONREAD: 2811 case FIONREAD:
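
The main addition to fs/compat_ioctl.c handles preallocation ioctls
for 32-bit callers. On ia32 a __s64 struct member is aligned to 4
bytes, so l_start in the compat layout sits at offset 4, while the
native x86-64/ia64 layout aligns it to offset 8; hence the packed
attributes in space_resv_32, the distinct FS_IOC_RESVSP*_32 numbers,
and the field-by-field copy in compat_ioctl_preallocate(). (The MTD
entries removed above presumably migrate into the MTD driver itself;
the hunk does not show their new home.) The alignment claim can be
checked with plain userspace C:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct resv_native {		/* 64-bit ABI: 8-byte alignment */
	int16_t l_type;
	int16_t l_whence;
	int64_t l_start;
};

struct resv_ia32 {		/* ia32 ABI: 4-byte alignment */
	int16_t l_type;
	int16_t l_whence;
	int64_t l_start __attribute__((packed));
};

int main(void)
{
	/* prints 8 vs 4 on x86-64, which is why the compat handler
	 * must copy the structure one field at a time */
	printf("native l_start at %zu, ia32 l_start at %zu\n",
	       offsetof(struct resv_native, l_start),
	       offsetof(struct resv_ia32, l_start));
	return 0;
}
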
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 762d287123ca..da6061a6df40 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -39,6 +39,9 @@ struct configfs_dirent {
39 umode_t s_mode; 39 umode_t s_mode;
40 struct dentry * s_dentry; 40 struct dentry * s_dentry;
41 struct iattr * s_iattr; 41 struct iattr * s_iattr;
42#ifdef CONFIG_LOCKDEP
43 int s_depth;
44#endif
42}; 45};
43 46
44#define CONFIGFS_ROOT 0x0001 47#define CONFIGFS_ROOT 0x0001
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 05373db21a4e..8e48b52205aa 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -78,11 +78,97 @@ static const struct dentry_operations configfs_dentry_ops = {
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
79}; 79};
80 80
81#ifdef CONFIG_LOCKDEP
82
83/*
84 * Helpers to make lockdep happy with our recursive locking of default groups'
85 * inodes (see configfs_attach_group() and configfs_detach_group()).
86 * We put default groups i_mutexes in separate classes according to their depth
87 * from the youngest non-default group ancestor.
88 *
89 * For a non-default group A having default groups A/B, A/C, and A/C/D, default
90 * groups A/B and A/C will have their inode's mutex in class
91 * default_group_class[0], and default group A/C/D will be in
92 * default_group_class[1].
93 *
94 * The lock classes are declared and assigned in inode.c, according to the
95 * s_depth value.
96 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
97 * default groups, and reset to -1 when all default groups are attached. During
98 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
99 * inode's mutex is set to default_group_class[s_depth - 1].
100 */
101
102static void configfs_init_dirent_depth(struct configfs_dirent *sd)
103{
104 sd->s_depth = -1;
105}
106
107static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
108 struct configfs_dirent *sd)
109{
110 int parent_depth = parent_sd->s_depth;
111
112 if (parent_depth >= 0)
113 sd->s_depth = parent_depth + 1;
114}
115
116static void
117configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
118{
119 /*
120 * item's i_mutex class is already setup, so s_depth is now only
121 * used to set new sub-directories s_depth, which is always done
122 * with item's i_mutex locked.
123 */
124 /*
125 * sd->s_depth == -1 iff we are a non default group.
126 * else (we are a default group) sd->s_depth > 0 (see
127 * create_dir()).
128 */
129 if (sd->s_depth == -1)
130 /*
131 * We are a non default group and we are going to create
132 * default groups.
133 */
134 sd->s_depth = 0;
135}
136
137static void
138configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
139{
140 /* We will not create default groups anymore. */
141 sd->s_depth = -1;
142}
143
144#else /* CONFIG_LOCKDEP */
145
146static void configfs_init_dirent_depth(struct configfs_dirent *sd)
147{
148}
149
150static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
151 struct configfs_dirent *sd)
152{
153}
154
155static void
156configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
157{
158}
159
160static void
161configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
162{
163}
164
165#endif /* CONFIG_LOCKDEP */
166
81/* 167/*
82 * Allocates a new configfs_dirent and links it to the parent configfs_dirent 168 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
83 */ 169 */
84static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd, 170static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
85 void * element) 171 void *element, int type)
86{ 172{
87 struct configfs_dirent * sd; 173 struct configfs_dirent * sd;
88 174
@@ -94,6 +180,8 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
94 INIT_LIST_HEAD(&sd->s_links); 180 INIT_LIST_HEAD(&sd->s_links);
95 INIT_LIST_HEAD(&sd->s_children); 181 INIT_LIST_HEAD(&sd->s_children);
96 sd->s_element = element; 182 sd->s_element = element;
183 sd->s_type = type;
184 configfs_init_dirent_depth(sd);
97 spin_lock(&configfs_dirent_lock); 185 spin_lock(&configfs_dirent_lock);
98 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { 186 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
99 spin_unlock(&configfs_dirent_lock); 187 spin_unlock(&configfs_dirent_lock);
@@ -138,12 +226,11 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
138{ 226{
139 struct configfs_dirent * sd; 227 struct configfs_dirent * sd;
140 228
141 sd = configfs_new_dirent(parent_sd, element); 229 sd = configfs_new_dirent(parent_sd, element, type);
142 if (IS_ERR(sd)) 230 if (IS_ERR(sd))
143 return PTR_ERR(sd); 231 return PTR_ERR(sd);
144 232
145 sd->s_mode = mode; 233 sd->s_mode = mode;
146 sd->s_type = type;
147 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
148 if (dentry) { 235 if (dentry) {
149 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
@@ -187,6 +274,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
187 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 274 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
188 CONFIGFS_DIR | CONFIGFS_USET_CREATING); 275 CONFIGFS_DIR | CONFIGFS_USET_CREATING);
189 if (!error) { 276 if (!error) {
277 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
190 error = configfs_create(d, mode, init_dir); 278 error = configfs_create(d, mode, init_dir);
191 if (!error) { 279 if (!error) {
192 inc_nlink(p->d_inode); 280 inc_nlink(p->d_inode);
@@ -789,11 +877,13 @@ static int configfs_attach_group(struct config_item *parent_item,
789 * error, as rmdir() would. 877 * error, as rmdir() would.
790 */ 878 */
791 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 879 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
880 configfs_adjust_dir_dirent_depth_before_populate(sd);
792 ret = populate_groups(to_config_group(item)); 881 ret = populate_groups(to_config_group(item));
793 if (ret) { 882 if (ret) {
794 configfs_detach_item(item); 883 configfs_detach_item(item);
795 dentry->d_inode->i_flags |= S_DEAD; 884 dentry->d_inode->i_flags |= S_DEAD;
796 } 885 }
886 configfs_adjust_dir_dirent_depth_after_populate(sd);
797 mutex_unlock(&dentry->d_inode->i_mutex); 887 mutex_unlock(&dentry->d_inode->i_mutex);
798 if (ret) 888 if (ret)
799 d_delete(dentry); 889 d_delete(dentry);
@@ -916,11 +1006,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
916 * Note, btw, that this can be called at *any* time, even when a configfs 1006 * Note, btw, that this can be called at *any* time, even when a configfs
917 * subsystem isn't registered, or when configfs is loading or unloading. 1007 * subsystem isn't registered, or when configfs is loading or unloading.
918 * Just like configfs_register_subsystem(). So we take the same 1008 * Just like configfs_register_subsystem(). So we take the same
919 * precautions. We pin the filesystem. We lock each i_mutex _in_order_ 1009 * precautions. We pin the filesystem. We lock configfs_dirent_lock.
920 * on our way down the tree. If we can find the target item in the 1010 * If we can find the target item in the
921 * configfs tree, it must be part of the subsystem tree as well, so we 1011 * configfs tree, it must be part of the subsystem tree as well, so we
922 * do not need the subsystem semaphore. Holding the i_mutex chain locks 1012 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps
923 * out mkdir() and rmdir(), who might be racing us. 1013 * locking out mkdir() and rmdir(), who might be racing us.
924 */ 1014 */
925 1015
926/* 1016/*
@@ -933,17 +1023,21 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
933 * do that so we can unlock it if we find nothing. 1023 * do that so we can unlock it if we find nothing.
934 * 1024 *
935 * Here we do a depth-first search of the dentry hierarchy looking for 1025 * Here we do a depth-first search of the dentry hierarchy looking for
936 * our object. We take i_mutex on each step of the way down. IT IS 1026 * our object.
937 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch, 1027 * We deliberately ignore items tagged as dropping since they are virtually
938 * we'll drop the i_mutex. 1028 * dead, as well as items in the middle of attachment since they virtually
1029 * do not exist yet. This completes the locking out of racing mkdir() and
1030 * rmdir().
1031 * Note: subdirectories in the middle of attachment start with s_type =
1032 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When
1033 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of
1034 * s_type is in configfs_new_dirent(), which has configfs_dirent_lock.
939 * 1035 *
940 * If the target is not found, -ENOENT is bubbled up and we have released 1036 * If the target is not found, -ENOENT is bubbled up.
941 * all locks. If the target was found, the locks will be cleared by
942 * configfs_depend_rollback().
943 * 1037 *
944 * This adds a requirement that all config_items be unique! 1038 * This adds a requirement that all config_items be unique!
945 * 1039 *
946 * This is recursive because the locking traversal is tricky. There isn't 1040 * This is recursive. There isn't
947 * much on the stack, though, so folks that need this function - be careful 1041 * much on the stack, though, so folks that need this function - be careful
948 * about your stack! Patches will be accepted to make it iterative. 1042 * about your stack! Patches will be accepted to make it iterative.
949 */ 1043 */
@@ -955,13 +1049,13 @@ static int configfs_depend_prep(struct dentry *origin,
955 1049
956 BUG_ON(!origin || !sd); 1050 BUG_ON(!origin || !sd);
957 1051
958 /* Lock this guy on the way down */
959 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
960 if (sd->s_element == target) /* Boo-yah */ 1052 if (sd->s_element == target) /* Boo-yah */
961 goto out; 1053 goto out;
962 1054
963 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 1055 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
964 if (child_sd->s_type & CONFIGFS_DIR) { 1056 if ((child_sd->s_type & CONFIGFS_DIR) &&
1057 !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
1058 !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
965 ret = configfs_depend_prep(child_sd->s_dentry, 1059 ret = configfs_depend_prep(child_sd->s_dentry,
966 target); 1060 target);
967 if (!ret) 1061 if (!ret)
@@ -970,33 +1064,12 @@ static int configfs_depend_prep(struct dentry *origin,
970 } 1064 }
971 1065
972 /* We looped all our children and didn't find target */ 1066 /* We looped all our children and didn't find target */
973 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
974 ret = -ENOENT; 1067 ret = -ENOENT;
975 1068
976out: 1069out:
977 return ret; 1070 return ret;
978} 1071}
979 1072
980/*
981 * This is ONLY called if configfs_depend_prep() did its job. So we can
982 * trust the entire path from item back up to origin.
983 *
984 * We walk backwards from item, unlocking each i_mutex. We finish by
985 * unlocking origin.
986 */
987static void configfs_depend_rollback(struct dentry *origin,
988 struct config_item *item)
989{
990 struct dentry *dentry = item->ci_dentry;
991
992 while (dentry != origin) {
993 mutex_unlock(&dentry->d_inode->i_mutex);
994 dentry = dentry->d_parent;
995 }
996
997 mutex_unlock(&origin->d_inode->i_mutex);
998}
999
1000int configfs_depend_item(struct configfs_subsystem *subsys, 1073int configfs_depend_item(struct configfs_subsystem *subsys,
1001 struct config_item *target) 1074 struct config_item *target)
1002{ 1075{
@@ -1037,17 +1110,21 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
1037 1110
1038 /* Ok, now we can trust subsys/s_item */ 1111 /* Ok, now we can trust subsys/s_item */
1039 1112
1040 /* Scan the tree, locking i_mutex recursively, return 0 if found */ 1113 spin_lock(&configfs_dirent_lock);
1114 /* Scan the tree, return 0 if found */
1041 ret = configfs_depend_prep(subsys_sd->s_dentry, target); 1115 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
1042 if (ret) 1116 if (ret)
1043 goto out_unlock_fs; 1117 goto out_unlock_dirent_lock;
1044 1118
1045 /* We hold all i_mutexes from the subsystem down to the target */ 1119 /*
1120 * We are sure that the item is not about to be removed by rmdir(), and
1121 * not in the middle of attachment by mkdir().
1122 */
1046 p = target->ci_dentry->d_fsdata; 1123 p = target->ci_dentry->d_fsdata;
1047 p->s_dependent_count += 1; 1124 p->s_dependent_count += 1;
1048 1125
1049 configfs_depend_rollback(subsys_sd->s_dentry, target); 1126out_unlock_dirent_lock:
1050 1127 spin_unlock(&configfs_dirent_lock);
1051out_unlock_fs: 1128out_unlock_fs:
1052 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); 1129 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
1053 1130
@@ -1072,10 +1149,10 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
1072 struct configfs_dirent *sd; 1149 struct configfs_dirent *sd;
1073 1150
1074 /* 1151 /*
1075 * Since we can trust everything is pinned, we just need i_mutex 1152 * Since we can trust everything is pinned, we just need
1076 * on the item. 1153 * configfs_dirent_lock.
1077 */ 1154 */
1078 mutex_lock(&target->ci_dentry->d_inode->i_mutex); 1155 spin_lock(&configfs_dirent_lock);
1079 1156
1080 sd = target->ci_dentry->d_fsdata; 1157 sd = target->ci_dentry->d_fsdata;
1081 BUG_ON(sd->s_dependent_count < 1); 1158 BUG_ON(sd->s_dependent_count < 1);
@@ -1086,7 +1163,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
1086 * After this unlock, we cannot trust the item to stay alive! 1163 * After this unlock, we cannot trust the item to stay alive!
1087 * DO NOT REFERENCE item after this unlock. 1164 * DO NOT REFERENCE item after this unlock.
1088 */ 1165 */
1089 mutex_unlock(&target->ci_dentry->d_inode->i_mutex); 1166 spin_unlock(&configfs_dirent_lock);
1090} 1167}
1091EXPORT_SYMBOL(configfs_undepend_item); 1168EXPORT_SYMBOL(configfs_undepend_item);
1092 1169
@@ -1286,13 +1363,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1286 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1363 if (sd->s_type & CONFIGFS_USET_DEFAULT)
1287 return -EPERM; 1364 return -EPERM;
1288 1365
1289 /*
1290 * Here's where we check for dependents. We're protected by
1291 * i_mutex.
1292 */
1293 if (sd->s_dependent_count)
1294 return -EBUSY;
1295
1296 /* Get a working ref until we have the child */ 1366 /* Get a working ref until we have the child */
1297 parent_item = configfs_get_config_item(dentry->d_parent); 1367 parent_item = configfs_get_config_item(dentry->d_parent);
1298 subsys = to_config_group(parent_item)->cg_subsys; 1368 subsys = to_config_group(parent_item)->cg_subsys;
@@ -1316,9 +1386,17 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1316 1386
1317 mutex_lock(&configfs_symlink_mutex); 1387 mutex_lock(&configfs_symlink_mutex);
1318 spin_lock(&configfs_dirent_lock); 1388 spin_lock(&configfs_dirent_lock);
1319 ret = configfs_detach_prep(dentry, &wait_mutex); 1389 /*
1320 if (ret) 1390 * Here's where we check for dependents. We're protected by
1321 configfs_detach_rollback(dentry); 1391 * configfs_dirent_lock.
1392 * If no dependent, atomically tag the item as dropping.
1393 */
1394 ret = sd->s_dependent_count ? -EBUSY : 0;
1395 if (!ret) {
1396 ret = configfs_detach_prep(dentry, &wait_mutex);
1397 if (ret)
1398 configfs_detach_rollback(dentry);
1399 }
1322 spin_unlock(&configfs_dirent_lock); 1400 spin_unlock(&configfs_dirent_lock);
1323 mutex_unlock(&configfs_symlink_mutex); 1401 mutex_unlock(&configfs_symlink_mutex);
1324 1402
@@ -1429,7 +1507,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
1429 */ 1507 */
1430 err = -ENOENT; 1508 err = -ENOENT;
1431 if (configfs_dirent_is_ready(parent_sd)) { 1509 if (configfs_dirent_is_ready(parent_sd)) {
1432 file->private_data = configfs_new_dirent(parent_sd, NULL); 1510 file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
1433 if (IS_ERR(file->private_data)) 1511 if (IS_ERR(file->private_data))
1434 err = PTR_ERR(file->private_data); 1512 err = PTR_ERR(file->private_data);
1435 else 1513 else
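
The thread running through the fs/configfs/dir.c changes:
configfs_depend_item() used to take i_mutex on every directory on the
way down the tree, with configfs_depend_rollback() unwinding the
chain, which both bounded the usable depth and required strict lock
ordering. The rewrite scans the configfs_dirent tree under the single
configfs_dirent_lock spinlock and skips entries tagged
CONFIGFS_USET_CREATING or CONFIGFS_USET_DROPPING, which are not yet
(or no longer) fully alive; that is what locks out racing mkdir() and
rmdir() without per-level mutexes. A simplified sketch of the new
search, not the exact kernel code:

/* depth-first search of the dirent tree; caller holds
 * configfs_dirent_lock */
static int find_dep(struct configfs_dirent *sd, void *target)
{
	struct configfs_dirent *child;

	if (sd->s_element == target)
		return 0;

	list_for_each_entry(child, &sd->s_children, s_sibling) {
		if ((child->s_type & CONFIGFS_DIR) &&
		    !(child->s_type & (CONFIGFS_USET_CREATING |
				       CONFIGFS_USET_DROPPING)) &&
		    !find_dep(child, target))
			return 0;	/* found in this subtree */
	}
	return -ENOENT;
}
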
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5d349d38e056..4921e7426d95 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -33,10 +33,15 @@
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h>
36 37
37#include <linux/configfs.h> 38#include <linux/configfs.h>
38#include "configfs_internal.h" 39#include "configfs_internal.h"
39 40
41#ifdef CONFIG_LOCKDEP
42static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
43#endif
44
40extern struct super_block * configfs_sb; 45extern struct super_block * configfs_sb;
41 46
42static const struct address_space_operations configfs_aops = { 47static const struct address_space_operations configfs_aops = {
@@ -150,6 +155,38 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
150 return inode; 155 return inode;
151} 156}
152 157
158#ifdef CONFIG_LOCKDEP
159
160static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
161 struct inode *inode)
162{
163 int depth = sd->s_depth;
164
165 if (depth > 0) {
166 if (depth <= ARRAY_SIZE(default_group_class)) {
167 lockdep_set_class(&inode->i_mutex,
168 &default_group_class[depth - 1]);
169 } else {
170 /*
171 * In practice the maximum level of locking depth is
172 * already reached. Just inform about possible reasons.
173 */
174 printk(KERN_INFO "configfs: Too many levels of inodes"
175 " for the locking correctness validator.\n");
176 printk(KERN_INFO "Spurious warnings may appear.\n");
177 }
178 }
179}
180
181#else /* CONFIG_LOCKDEP */
182
183static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
184 struct inode *inode)
185{
186}
187
188#endif /* CONFIG_LOCKDEP */
189
153int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) 190int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
154{ 191{
155 int error = 0; 192 int error = 0;
@@ -162,6 +199,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
162 struct inode *p_inode = dentry->d_parent->d_inode; 199 struct inode *p_inode = dentry->d_parent->d_inode;
163 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 200 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
164 } 201 }
202 configfs_set_inode_lock_class(sd, inode);
165 goto Proceed; 203 goto Proceed;
166 } 204 }
167 else 205 else
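
Why the lockdep plumbing above is needed: lockdep identifies locks by
class, not by instance, and every configfs i_mutex shares one class,
so the legitimate nested locking of default groups (parent first, then
child) looks to the validator like recursive self-locking. Giving each
nesting depth its own lock_class_key, driven by the s_depth counter
maintained in dir.c, tells lockdep that inodes at different depths are
distinct locks. Reduced to its core (a sketch of the hunk above):

static struct lock_class_key group_class[MAX_LOCK_DEPTH];

static void set_group_lock_class(struct inode *inode, int depth)
{
	/* depth <= 0 is a non-default group; it keeps the common class */
	if (depth > 0 && depth <= MAX_LOCK_DEPTH)
		lockdep_set_class(&inode->i_mutex,
				  &group_class[depth - 1]);
}
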
diff --git a/fs/dcache.c b/fs/dcache.c
index 75659a6fd1f8..9e5cd3c3a6ba 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root,
1910 1910
1911 spin_lock(&vfsmount_lock); 1911 spin_lock(&vfsmount_lock);
1912 prepend(&end, &buflen, "\0", 1); 1912 prepend(&end, &buflen, "\0", 1);
1913 if (!IS_ROOT(dentry) && d_unhashed(dentry) && 1913 if (d_unlinked(dentry) &&
1914 (prepend(&end, &buflen, " (deleted)", 10) != 0)) 1914 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1915 goto Elong; 1915 goto Elong;
1916 1916
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2035 2035
2036 spin_lock(&dcache_lock); 2036 spin_lock(&dcache_lock);
2037 prepend(&end, &buflen, "\0", 1); 2037 prepend(&end, &buflen, "\0", 1);
2038 if (!IS_ROOT(dentry) && d_unhashed(dentry) && 2038 if (d_unlinked(dentry) &&
2039 (prepend(&end, &buflen, "//deleted", 9) != 0)) 2039 (prepend(&end, &buflen, "//deleted", 9) != 0))
2040 goto Elong; 2040 goto Elong;
2041 if (buflen < 1) 2041 if (buflen < 1)
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2097 read_unlock(&current->fs->lock); 2097 read_unlock(&current->fs->lock);
2098 2098
2099 error = -ENOENT; 2099 error = -ENOENT;
2100 /* Has the current directory has been unlinked? */
2101 spin_lock(&dcache_lock); 2100 spin_lock(&dcache_lock);
2102 if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { 2101 if (!d_unlinked(pwd.dentry)) {
2103 unsigned long len; 2102 unsigned long len;
2104 struct path tmp = root; 2103 struct path tmp = root;
2105 char * cwd; 2104 char * cwd;
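
The dcache hunks substitute a single helper for the open-coded
!IS_ROOT(dentry) && d_unhashed(dentry) test; the getcwd() case is the
same substitution after De Morgan, since IS_ROOT(x) || !d_unhashed(x)
is exactly !d_unlinked(x). Presumably the helper introduced by this
series reads, give or take:

/* presumed shape of the new helper (include/linux/dcache.h) */
static inline int d_unlinked(struct dentry *dentry)
{
	return d_unhashed(dentry) && !IS_ROOT(dentry);
}
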
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 33a90120f6ad..4d74fc72c195 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -67,6 +67,8 @@ static int debugfs_u8_get(void *data, u64 *val)
67 return 0; 67 return 0;
68} 68}
69DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); 69DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
70DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
71DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
70 72
71/** 73/**
72 * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value 74 * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
@@ -95,6 +97,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
95struct dentry *debugfs_create_u8(const char *name, mode_t mode, 97struct dentry *debugfs_create_u8(const char *name, mode_t mode,
96 struct dentry *parent, u8 *value) 98 struct dentry *parent, u8 *value)
97{ 99{
100 /* if there are no write bits set, make read only */
101 if (!(mode & S_IWUGO))
102 return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
103 /* if there are no read bits set, make write only */
104 if (!(mode & S_IRUGO))
105 return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
106
98 return debugfs_create_file(name, mode, parent, value, &fops_u8); 107 return debugfs_create_file(name, mode, parent, value, &fops_u8);
99} 108}
100EXPORT_SYMBOL_GPL(debugfs_create_u8); 109EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -110,6 +119,8 @@ static int debugfs_u16_get(void *data, u64 *val)
110 return 0; 119 return 0;
111} 120}
112DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); 121DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
122DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
123DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
113 124
114/** 125/**
115 * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value 126 * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
@@ -138,6 +149,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
138struct dentry *debugfs_create_u16(const char *name, mode_t mode, 149struct dentry *debugfs_create_u16(const char *name, mode_t mode,
139 struct dentry *parent, u16 *value) 150 struct dentry *parent, u16 *value)
140{ 151{
152 /* if there are no write bits set, make read only */
153 if (!(mode & S_IWUGO))
154 return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
155 /* if there are no read bits set, make write only */
156 if (!(mode & S_IRUGO))
157 return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
158
141 return debugfs_create_file(name, mode, parent, value, &fops_u16); 159 return debugfs_create_file(name, mode, parent, value, &fops_u16);
142} 160}
143EXPORT_SYMBOL_GPL(debugfs_create_u16); 161EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -153,6 +171,8 @@ static int debugfs_u32_get(void *data, u64 *val)
153 return 0; 171 return 0;
154} 172}
155DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); 173DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
174DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
175DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
156 176
157/** 177/**
158 * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value 178 * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
@@ -181,6 +201,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
181struct dentry *debugfs_create_u32(const char *name, mode_t mode, 201struct dentry *debugfs_create_u32(const char *name, mode_t mode,
182 struct dentry *parent, u32 *value) 202 struct dentry *parent, u32 *value)
183{ 203{
204 /* if there are no write bits set, make read only */
205 if (!(mode & S_IWUGO))
206 return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
207 /* if there are no read bits set, make write only */
208 if (!(mode & S_IRUGO))
209 return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
210
184 return debugfs_create_file(name, mode, parent, value, &fops_u32); 211 return debugfs_create_file(name, mode, parent, value, &fops_u32);
185} 212}
186EXPORT_SYMBOL_GPL(debugfs_create_u32); 213EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -197,6 +224,8 @@ static int debugfs_u64_get(void *data, u64 *val)
197 return 0; 224 return 0;
198} 225}
199DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); 226DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
227DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
228DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
200 229
201/** 230/**
202 * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value 231 * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
@@ -225,15 +254,28 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
225struct dentry *debugfs_create_u64(const char *name, mode_t mode, 254struct dentry *debugfs_create_u64(const char *name, mode_t mode,
226 struct dentry *parent, u64 *value) 255 struct dentry *parent, u64 *value)
227{ 256{
257 /* if there are no write bits set, make read only */
258 if (!(mode & S_IWUGO))
259 return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
260 /* if there are no read bits set, make write only */
261 if (!(mode & S_IRUGO))
262 return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
263
228 return debugfs_create_file(name, mode, parent, value, &fops_u64); 264 return debugfs_create_file(name, mode, parent, value, &fops_u64);
229} 265}
230EXPORT_SYMBOL_GPL(debugfs_create_u64); 266EXPORT_SYMBOL_GPL(debugfs_create_u64);
231 267
232DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); 268DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
269DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
270DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
233 271
234DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); 272DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
273DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
274DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
235 275
236DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); 276DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
237 279
238/* 280/*
239 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
@@ -256,6 +298,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
256struct dentry *debugfs_create_x8(const char *name, mode_t mode, 298struct dentry *debugfs_create_x8(const char *name, mode_t mode,
257 struct dentry *parent, u8 *value) 299 struct dentry *parent, u8 *value)
258{ 300{
301 /* if there are no write bits set, make read only */
302 if (!(mode & S_IWUGO))
303 return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
304 /* if there are no read bits set, make write only */
305 if (!(mode & S_IRUGO))
306 return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
307
259 return debugfs_create_file(name, mode, parent, value, &fops_x8); 308 return debugfs_create_file(name, mode, parent, value, &fops_x8);
260} 309}
261EXPORT_SYMBOL_GPL(debugfs_create_x8); 310EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -273,6 +322,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
273struct dentry *debugfs_create_x16(const char *name, mode_t mode, 322struct dentry *debugfs_create_x16(const char *name, mode_t mode,
274 struct dentry *parent, u16 *value) 323 struct dentry *parent, u16 *value)
275{ 324{
325 /* if there are no write bits set, make read only */
326 if (!(mode & S_IWUGO))
327 return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
328 /* if there are no read bits set, make write only */
329 if (!(mode & S_IRUGO))
330 return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
331
276 return debugfs_create_file(name, mode, parent, value, &fops_x16); 332 return debugfs_create_file(name, mode, parent, value, &fops_x16);
277} 333}
278EXPORT_SYMBOL_GPL(debugfs_create_x16); 334EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -290,6 +346,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
290struct dentry *debugfs_create_x32(const char *name, mode_t mode, 346struct dentry *debugfs_create_x32(const char *name, mode_t mode,
291 struct dentry *parent, u32 *value) 347 struct dentry *parent, u32 *value)
292{ 348{
349 /* if there are no write bits set, make read only */
350 if (!(mode & S_IWUGO))
351 return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
352 /* if there are no read bits set, make write only */
353 if (!(mode & S_IRUGO))
354 return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
355
293 return debugfs_create_file(name, mode, parent, value, &fops_x32); 356 return debugfs_create_file(name, mode, parent, value, &fops_x32);
294} 357}
295EXPORT_SYMBOL_GPL(debugfs_create_x32); 358EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -419,7 +482,7 @@ static const struct file_operations fops_blob = {
419}; 482};
420 483
421/** 484/**
422 * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob 485 * debugfs_create_blob - create a debugfs file that is used to read a binary blob
423 * @name: a pointer to a string containing the name of the file to create. 486 * @name: a pointer to a string containing the name of the file to create.
424 * @mode: the permission that the file should have 487 * @mode: the permission that the file should have
425 * @parent: a pointer to the parent dentry for this file. This should be a 488 * @parent: a pointer to the parent dentry for this file. This should be a
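
The debugfs change makes a file's behaviour agree with its mode bits:
DEFINE_SIMPLE_ATTRIBUTE with a NULL get or set callback produces fops
whose missing direction is rejected by the simple_attr helpers, so a
0444 file now refuses writes (even from root) instead of quietly
keeping writable fops behind a read-only mode. Callers need no
changes; an illustrative use, with hypothetical names:

static u32 my_val;

static void my_debugfs_init(struct dentry *dir)
{
	/* 0444: no write bits set, so the new _ro fops are selected
	 * and write() fails rather than updating my_val */
	debugfs_create_u32("my_val", 0444, dir, &my_val);
}
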
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 0662ba6de85a..d22438ef7674 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -403,6 +403,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
403 } 403 }
404 child = list_entry(parent->d_subdirs.next, struct dentry, 404 child = list_entry(parent->d_subdirs.next, struct dentry,
405 d_u.d_child); 405 d_u.d_child);
406 next_sibling:
406 407
407 /* 408 /*
408 * If "child" isn't empty, walk down the tree and 409 * If "child" isn't empty, walk down the tree and
@@ -417,6 +418,16 @@ void debugfs_remove_recursive(struct dentry *dentry)
417 __debugfs_remove(child, parent); 418 __debugfs_remove(child, parent);
418 if (parent->d_subdirs.next == &child->d_u.d_child) { 419 if (parent->d_subdirs.next == &child->d_u.d_child) {
419 /* 420 /*
421 * Try the next sibling.
422 */
423 if (child->d_u.d_child.next != &parent->d_subdirs) {
424 child = list_entry(child->d_u.d_child.next,
425 struct dentry,
426 d_u.d_child);
427 goto next_sibling;
428 }
429
430 /*
420 * Avoid infinite loop if we fail to remove 431 * Avoid infinite loop if we fail to remove
421 * one dentry. 432 * one dentry.
422 */ 433 */
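
Control-flow fix in debugfs_remove_recursive(): when __debugfs_remove()
fails to remove the current child (the parent's d_subdirs head still
points at it), the loop used to give up on the whole subtree to avoid
spinning. The next_sibling label instead advances to the following
entry, so one stuck dentry no longer aborts the rest of the removal.
A toy model of the same idea, not kernel code:

struct node {
	struct node *next;
	int removable;
};

/* walk a sibling list, unlink what we can, and skip -- rather than
 * spin on or bail at -- entries that refuse to go away */
static struct node *remove_all(struct node *head)
{
	struct node **link = &head;

	while (*link) {
		if ((*link)->removable)
			*link = (*link)->next;	/* unlink, stay put */
		else
			link = &(*link)->next;	/* stuck: next sibling */
	}
	return head;
}
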
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c68edb969441..75efb028974b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -423,7 +423,6 @@ static void devpts_kill_sb(struct super_block *sb)
423} 423}
424 424
425static struct file_system_type devpts_fs_type = { 425static struct file_system_type devpts_fs_type = {
426 .owner = THIS_MODULE,
427 .name = "devpts", 426 .name = "devpts",
428 .get_sb = devpts_get_sb, 427 .get_sb = devpts_get_sb,
429 .kill_sb = devpts_kill_sb, 428 .kill_sb = devpts_kill_sb,
@@ -557,18 +556,11 @@ static int __init init_devpts_fs(void)
557 int err = register_filesystem(&devpts_fs_type); 556 int err = register_filesystem(&devpts_fs_type);
558 if (!err) { 557 if (!err) {
559 devpts_mnt = kern_mount(&devpts_fs_type); 558 devpts_mnt = kern_mount(&devpts_fs_type);
560 if (IS_ERR(devpts_mnt)) 559 if (IS_ERR(devpts_mnt)) {
561 err = PTR_ERR(devpts_mnt); 560 err = PTR_ERR(devpts_mnt);
561 unregister_filesystem(&devpts_fs_type);
562 }
562 } 563 }
563 return err; 564 return err;
564} 565}
565
566static void __exit exit_devpts_fs(void)
567{
568 unregister_filesystem(&devpts_fs_type);
569 mntput(devpts_mnt);
570}
571
572module_init(init_devpts_fs) 566module_init(init_devpts_fs)
573module_exit(exit_devpts_fs)
574MODULE_LICENSE("GPL");
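
Two related devpts changes: init_devpts_fs() gains the missing unwind,
unregistering the filesystem again when kern_mount() fails, and the
module-exit path, .owner and MODULE_LICENSE go away, consistent with
devpts no longer being buildable as a module, which makes init time
the only chance to clean up. The register-then-mount idiom with its
unwind, as a sketch (the myfs names are hypothetical):

static int __init init_myfs(void)
{
	int err = register_filesystem(&myfs_type);

	if (err)
		return err;
	myfs_mnt = kern_mount(&myfs_type);
	if (IS_ERR(myfs_mnt)) {
		err = PTR_ERR(myfs_mnt);
		/* undo the registration: no exit path will do it */
		unregister_filesystem(&myfs_type);
	}
	return err;
}
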
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bbc2050..8b10b87dc01a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1127 rw = WRITE_ODIRECT; 1127 rw = WRITE_ODIRECT;
1128 1128
1129 if (bdev) 1129 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
1131 1131
1132 if (offset & blocksize_mask) { 1132 if (offset & blocksize_mask) {
1133 if (bdev) 1133 if (bdev)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 858fba14aaa6..c4dfa1dcc86f 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,8 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); 52 de = kzalloc(sizeof(struct dlm_direntry) + len,
53 ls->ls_allocation);
53 return de; 54 return de;
54} 55}
55 56
@@ -211,7 +212,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
211 212
212 dlm_dir_clear(ls); 213 dlm_dir_clear(ls);
213 214
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); 215 last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
215 if (!last_name) 216 if (!last_name)
216 goto out; 217 goto out;
217 218
@@ -322,7 +323,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
322 if (namelen > DLM_RESNAME_MAXLEN) 323 if (namelen > DLM_RESNAME_MAXLEN)
323 return -EINVAL; 324 return -EINVAL;
324 325
325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); 326 de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
326 if (!de) 327 if (!de)
327 return -ENOMEM; 328 return -ENOMEM;
328 329
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 205ec95b347e..eb507c453c5f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -435,7 +435,7 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
435static int find_rsb(struct dlm_ls *ls, char *name, int namelen, 435static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
436 unsigned int flags, struct dlm_rsb **r_ret) 436 unsigned int flags, struct dlm_rsb **r_ret)
437{ 437{
438 struct dlm_rsb *r, *tmp; 438 struct dlm_rsb *r = NULL, *tmp;
439 uint32_t hash, bucket; 439 uint32_t hash, bucket;
440 int error = -EINVAL; 440 int error = -EINVAL;
441 441
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index cd8e2df3c295..d489fcc86713 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -384,7 +384,7 @@ static void threads_stop(void)
384 dlm_astd_stop(); 384 dlm_astd_stop();
385} 385}
386 386
387static int new_lockspace(char *name, int namelen, void **lockspace, 387static int new_lockspace(const char *name, int namelen, void **lockspace,
388 uint32_t flags, int lvblen) 388 uint32_t flags, int lvblen)
389{ 389{
390 struct dlm_ls *ls; 390 struct dlm_ls *ls;
@@ -419,16 +419,14 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
419 break; 419 break;
420 } 420 }
421 ls->ls_create_count++; 421 ls->ls_create_count++;
422 module_put(THIS_MODULE); 422 *lockspace = ls;
423 error = 1; /* not an error, return 0 */ 423 error = 1;
424 break; 424 break;
425 } 425 }
426 spin_unlock(&lslist_lock); 426 spin_unlock(&lslist_lock);
427 427
428 if (error < 0)
429 goto out;
430 if (error) 428 if (error)
431 goto ret_zero; 429 goto out;
432 430
433 error = -ENOMEM; 431 error = -ENOMEM;
434 432
@@ -583,7 +581,6 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
583 dlm_create_debug_file(ls); 581 dlm_create_debug_file(ls);
584 582
585 log_debug(ls, "join complete"); 583 log_debug(ls, "join complete");
586 ret_zero:
587 *lockspace = ls; 584 *lockspace = ls;
588 return 0; 585 return 0;
589 586
@@ -614,7 +611,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
614 return error; 611 return error;
615} 612}
616 613
617int dlm_new_lockspace(char *name, int namelen, void **lockspace, 614int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
618 uint32_t flags, int lvblen) 615 uint32_t flags, int lvblen)
619{ 616{
620 int error = 0; 617 int error = 0;
@@ -628,7 +625,9 @@ int dlm_new_lockspace(char *name, int namelen, void **lockspace,
628 error = new_lockspace(name, namelen, lockspace, flags, lvblen); 625 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
629 if (!error) 626 if (!error)
630 ls_count++; 627 ls_count++;
631 else if (!ls_count) 628 if (error > 0)
629 error = 0;
630 if (!ls_count)
632 threads_stop(); 631 threads_stop();
633 out: 632 out:
634 mutex_unlock(&ls_lock); 633 mutex_unlock(&ls_lock);
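
new_lockspace() keeps a three-way return: a negative errno, 0 for a
freshly created lockspace, and 1 when the lockspace already existed,
in which case *lockspace is now set before lslist_lock is dropped
instead of via the removed ret_zero detour. The exported wrapper folds
the positive case into success, and the threads_stop() check now
stands alone rather than hanging off an else, so a failed first
lockspace still stops the helper threads. The convention, annotated:

error = new_lockspace(name, namelen, lockspace, flags, lvblen);
if (!error)
	ls_count++;		/* only newly created lockspaces count */
if (error > 0)
	error = 0;		/* "already exists" is success to callers */
if (!ls_count)
	threads_stop();		/* nothing created and none remain */
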
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 609108a83267..618a60f03886 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -309,6 +309,20 @@ static void lowcomms_state_change(struct sock *sk)
309 lowcomms_write_space(sk); 309 lowcomms_write_space(sk);
310} 310}
311 311
312int dlm_lowcomms_connect_node(int nodeid)
313{
314 struct connection *con;
315
316 if (nodeid == dlm_our_nodeid())
317 return 0;
318
319 con = nodeid2con(nodeid, GFP_NOFS);
320 if (!con)
321 return -ENOMEM;
322 lowcomms_connect_sock(con);
323 return 0;
324}
325
312/* Make a socket active */ 326/* Make a socket active */
313static int add_sock(struct socket *sock, struct connection *con) 327static int add_sock(struct socket *sock, struct connection *con)
314{ 328{
@@ -486,7 +500,7 @@ static void process_sctp_notification(struct connection *con,
486 return; 500 return;
487 } 501 }
488 502
489 new_con = nodeid2con(nodeid, GFP_KERNEL); 503 new_con = nodeid2con(nodeid, GFP_NOFS);
490 if (!new_con) 504 if (!new_con)
491 return; 505 return;
492 506
@@ -722,7 +736,7 @@ static int tcp_accept_from_sock(struct connection *con)
722 * the same time and the connections cross on the wire. 736 * the same time and the connections cross on the wire.
723 * In this case we store the incoming one in "othercon" 737 * In this case we store the incoming one in "othercon"
724 */ 738 */
725 newcon = nodeid2con(nodeid, GFP_KERNEL); 739 newcon = nodeid2con(nodeid, GFP_NOFS);
726 if (!newcon) { 740 if (!newcon) {
727 result = -ENOMEM; 741 result = -ENOMEM;
728 goto accept_err; 742 goto accept_err;
@@ -732,7 +746,7 @@ static int tcp_accept_from_sock(struct connection *con)
732 struct connection *othercon = newcon->othercon; 746 struct connection *othercon = newcon->othercon;
733 747
734 if (!othercon) { 748 if (!othercon) {
735 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); 749 othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
736 if (!othercon) { 750 if (!othercon) {
737 log_print("failed to allocate incoming socket"); 751 log_print("failed to allocate incoming socket");
738 mutex_unlock(&newcon->sock_mutex); 752 mutex_unlock(&newcon->sock_mutex);
@@ -888,7 +902,7 @@ static void tcp_connect_to_sock(struct connection *con)
888 int result = -EHOSTUNREACH; 902 int result = -EHOSTUNREACH;
889 struct sockaddr_storage saddr, src_addr; 903 struct sockaddr_storage saddr, src_addr;
890 int addr_len; 904 int addr_len;
891 struct socket *sock; 905 struct socket *sock = NULL;
892 906
893 if (con->nodeid == 0) { 907 if (con->nodeid == 0) {
894 log_print("attempt to connect sock 0 foiled"); 908 log_print("attempt to connect sock 0 foiled");
@@ -948,6 +962,8 @@ out_err:
948 if (con->sock) { 962 if (con->sock) {
949 sock_release(con->sock); 963 sock_release(con->sock);
950 con->sock = NULL; 964 con->sock = NULL;
965 } else if (sock) {
966 sock_release(sock);
951 } 967 }
952 /* 968 /*
953 * Some errors are fatal and this list might need adjusting. For other 969 * Some errors are fatal and this list might need adjusting. For other
@@ -1421,7 +1437,7 @@ static int work_start(void)
1421static void stop_conn(struct connection *con) 1437static void stop_conn(struct connection *con)
1422{ 1438{
1423 con->flags |= 0x0F; 1439 con->flags |= 0x0F;
1424 if (con->sock) 1440 if (con->sock && con->sock->sk)
1425 con->sock->sk->sk_user_data = NULL; 1441 con->sock->sk->sk_user_data = NULL;
1426} 1442}
1427 1443
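
Several fixes in fs/dlm/lowcomms.c. The GFP_KERNEL -> GFP_NOFS
switches matter because dlm sits underneath cluster filesystems: a
GFP_KERNEL allocation may enter direct reclaim and attempt writeback
of pages belonging to a filesystem that is itself blocked on this very
DLM traffic, a reclaim deadlock that GFP_NOFS avoids. The new
dlm_lowcomms_connect_node() lets member.c open connections eagerly
when a node joins rather than on first message. tcp_connect_to_sock()
additionally stops leaking a freshly created socket on error paths
where it was never attached to con->sock, and stop_conn() tolerates a
NULL con->sock->sk. The ownership rule behind the leak fix, as a toy
model rather than kernel code:

struct conn { void *sock; };

static void free_sock(void *s);	/* hypothetical release function */

/* free the socket through whichever pointer owns it -- exactly one */
static void unwind(struct conn *con, void *local_sock)
{
	if (con->sock) {
		free_sock(con->sock);	/* ownership was transferred */
		con->sock = NULL;
	} else if (local_sock) {
		free_sock(local_sock);	/* never attached: free locally */
	}
}
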
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index a9a9618c0d3f..1311e6426287 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_lowcomms_stop(void);
19int dlm_lowcomms_close(int nodeid); 19int dlm_lowcomms_close(int nodeid);
20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); 20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
21void dlm_lowcomms_commit_buffer(void *mh); 21void dlm_lowcomms_commit_buffer(void *mh);
22int dlm_lowcomms_connect_node(int nodeid);
22 23
23#endif /* __LOWCOMMS_DOT_H__ */ 24#endif /* __LOWCOMMS_DOT_H__ */
24 25
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 26133f05ae3a..b128775913b2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include "recover.h" 17#include "recover.h"
18#include "rcom.h" 18#include "rcom.h"
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h"
20 21
21static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 22static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
22{ 23{
@@ -45,9 +46,9 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
45static int dlm_add_member(struct dlm_ls *ls, int nodeid) 46static int dlm_add_member(struct dlm_ls *ls, int nodeid)
46{ 47{
47 struct dlm_member *memb; 48 struct dlm_member *memb;
48 int w; 49 int w, error;
49 50
50 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); 51 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
51 if (!memb) 52 if (!memb)
52 return -ENOMEM; 53 return -ENOMEM;
53 54
@@ -57,6 +58,12 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
57 return w; 58 return w;
58 } 59 }
59 60
61 error = dlm_lowcomms_connect_node(nodeid);
62 if (error < 0) {
63 kfree(memb);
64 return error;
65 }
66
60 memb->nodeid = nodeid; 67 memb->nodeid = nodeid;
61 memb->weight = w; 68 memb->weight = w;
62 add_ordered_member(ls, memb); 69 add_ordered_member(ls, memb);
@@ -136,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
136 143
137 ls->ls_total_weight = total; 144 ls->ls_total_weight = total;
138 145
139 array = kmalloc(sizeof(int) * total, GFP_KERNEL); 146 array = kmalloc(sizeof(int) * total, ls->ls_allocation);
140 if (!array) 147 if (!array)
141 return; 148 return;
142 149
@@ -219,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
219 continue; 226 continue;
220 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); 227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
221 228
222 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); 229 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
223 if (!memb) 230 if (!memb)
224 return -ENOMEM; 231 return -ENOMEM;
225 memb->nodeid = rv->new[i]; 232 memb->nodeid = rv->new[i];
@@ -334,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
334 int *ids = NULL, *new = NULL; 341 int *ids = NULL, *new = NULL;
335 int error, ids_count = 0, new_count = 0; 342 int error, ids_count = 0, new_count = 0;
336 343
337 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL); 344 rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
338 if (!rv) 345 if (!rv)
339 return -ENOMEM; 346 return -ENOMEM;
340 347
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 894a32d438d5..16f682e26c07 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -353,7 +353,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 {
 	struct dlm_plock_info info;
 	struct plock_op *op;
-	int found = 0;
+	int found = 0, do_callback = 0;
 
 	if (count != sizeof(info))
 		return -EINVAL;
@@ -366,21 +366,24 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 
 	spin_lock(&ops_lock);
 	list_for_each_entry(op, &recv_list, list) {
-		if (op->info.fsid == info.fsid && op->info.number == info.number &&
+		if (op->info.fsid == info.fsid &&
+		    op->info.number == info.number &&
 		    op->info.owner == info.owner) {
+			struct plock_xop *xop = (struct plock_xop *)op;
 			list_del_init(&op->list);
-			found = 1;
-			op->done = 1;
 			memcpy(&op->info, &info, sizeof(info));
+			if (xop->callback)
+				do_callback = 1;
+			else
+				op->done = 1;
+			found = 1;
 			break;
 		}
 	}
 	spin_unlock(&ops_lock);
 
 	if (found) {
-		struct plock_xop *xop;
-		xop = (struct plock_xop *)op;
-		if (xop->callback)
+		if (do_callback)
 			dlm_plock_callback(op);
 		else
 			wake_up(&recv_wq);
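The dev_write() rework above is a use-after-free fix: the old code set op->done and dropped ops_lock before looking at xop->callback, so a synchronous waiter could wake, free the op, and leave dev_write() dereferencing freed memory. The decision is now taken while ops_lock is still held and only a flag escapes the critical section. The shape of the fix, with hypothetical names:

#include <linux/spinlock.h>

struct op_sketch {
	void (*callback)(struct op_sketch *op);
	int done;
};

/* Decide under the lock; never touch *op again once done is visible. */
static int op_complete(spinlock_t *lock, struct op_sketch *op)
{
	int do_callback = 0;

	spin_lock(lock);
	if (op->callback)
		do_callback = 1;	/* async op: caller runs the callback */
	else
		op->done = 1;		/* sync waiter may now free op */
	spin_unlock(lock);

	return do_callback;
}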
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index daa4183fbb84..7a2307c08911 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
 	struct rq_entry *e;
 	int length = ms->m_header.h_length - sizeof(struct dlm_message);
 
-	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+	e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory len %d", length);
 		return;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b6a719a909f8..a2edb7913447 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb)
 			continue;
 		__iget(inode);
 		spin_unlock(&inode_lock);
-		__invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
+		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
 		spin_lock(&inode_lock);
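drop_pagecache_sb() moves to the plain exported helper: invalidate_mapping_pages(mapping, start, end) drops clean, unlocked pagecache pages in the given range and quietly skips dirty, locked, or mapped ones, so calling it with (0, -1) as above sweeps the whole file on a best-effort basis. A hedged usage sketch:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Best-effort: only clean, unpinned pages are released. */
static void drop_clean_pages(struct inode *inode)
{
	invalidate_mapping_pages(inode->i_mapping, 0, -1);
}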
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
 	}
 	(*new_auth_tok)->session_key.encrypted_key_size =
 		(body_size - (ECRYPTFS_SALT_SIZE + 5));
+	if ((*new_auth_tok)->session_key.encrypted_key_size
+	    > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+		printk(KERN_WARNING "Tag 3 packet contains key larger "
+		       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
 	if (unlikely(data[(*packet_size)++] != 0x04)) {
 		printk(KERN_WARNING "Unknown version number [%d]\n",
 		       data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
 		rc = -EINVAL;
 		goto out;
 	}
+	if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+		printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+		       "expected size\n");
+		rc = -EINVAL;
+		goto out;
+	}
 	if (data[(*packet_size)++] != 0x62) {
 		printk(KERN_WARNING "Unrecognizable packet\n");
 		rc = -EINVAL;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fa4c7e7d15d9..12d649602d3a 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/key.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb)
 {
 	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
 
+	lock_kernel();
+
 	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
 	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 	ecryptfs_set_superblock_private(sb, NULL);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 49308a29798a..7ee6f7e3a608 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -5,12 +5,12 @@
  */
 
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 
 static int efs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations efs_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= efs_readdir,
 };
@@ -33,8 +33,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
-	lock_kernel();
-
 	/* work out where this entry can be found */
 	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
 
@@ -107,7 +105,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 
 	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
 out:
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index c3fb5f9c4a44..1511bf9e5f80 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -8,7 +8,6 @@
 
 #include <linux/buffer_head.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/exportfs.h>
 #include "efs.h"
 
@@ -63,16 +62,12 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
 	efs_ino_t inodenum;
 	struct inode * inode = NULL;
 
-	lock_kernel();
 	inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
 	if (inodenum) {
 		inode = efs_iget(dir->i_sb, inodenum);
-		if (IS_ERR(inode)) {
-			unlock_kernel();
+		if (IS_ERR(inode))
 			return ERR_CAST(inode);
-		}
 	}
-	unlock_kernel();
 
 	return d_splice_alias(inode, dentry);
 }
@@ -115,11 +110,9 @@ struct dentry *efs_get_parent(struct dentry *child)
 	struct dentry *parent = ERR_PTR(-ENOENT);
 	efs_ino_t ino;
 
-	lock_kernel();
 	ino = efs_find_entry(child->d_inode, "..", 2);
 	if (ino)
 		parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
-	unlock_kernel();
 
 	return parent;
 }
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 41911ec83aaf..75117d0dac2b 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 
 static int efs_symlink_readpage(struct file *file, struct page *page)
@@ -22,9 +21,8 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
 
 	err = -ENAMETOOLONG;
 	if (size > 2 * EFS_BLOCKSIZE)
-		goto fail_notlocked;
+		goto fail;
 
-	lock_kernel();
 	/* read first 512 bytes of link target */
 	err = -EIO;
 	bh = sb_bread(inode->i_sb, efs_bmap(inode, 0));
@@ -40,14 +38,11 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
 		brelse(bh);
 	}
 	link[size] = '\0';
-	unlock_kernel();
 	SetPageUptodate(page);
 	kunmap(page);
 	unlock_page(page);
 	return 0;
 fail:
-	unlock_kernel();
-fail_notlocked:
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
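With the BKL calls gone, efs relies on VFS-level locking alone; the one subtlety is the new .llseek entry in efs_dir_operations near the top of these hunks, pinned explicitly because the fallback default_llseek in this kernel still takes the BKL. The resulting table shape (sketch; efs_readdir stands for the filesystem's own method):

#include <linux/fs.h>

static int efs_readdir(struct file *, void *, filldir_t);

const struct file_operations sketch_dir_operations = {
	.llseek		= generic_file_llseek,	/* BKL-free seek */
	.read		= generic_read_dir,
	.readdir	= efs_readdir,
};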
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d593d35..31d12de83a2a 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -14,34 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
+#include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -56,10 +66,47 @@ int eventfd_signal(struct file *file, int n)
 
 	return n;
 }
+EXPORT_SYMBOL_GPL(eventfd_signal);
+
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
 
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -183,6 +230,16 @@ static const struct file_operations eventfd_fops = {
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -197,6 +254,49 @@ struct file *eventfd_fget(int fd)
 
 	return file;
 }
+EXPORT_SYMBOL_GPL(eventfd_fget);
+
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
@@ -214,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
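The eventfd rework above decouples the context's lifetime from its struct file: in-kernel users (callers of the new eventfd_ctx_fdget()/eventfd_ctx_fileget()) can hold a counted reference to the context and keep signaling it after the last userspace fput(). It is the standard kref pattern, reduced to its core:

#include <linux/kref.h>
#include <linux/slab.h>

struct ctx_sketch {
	struct kref kref;	/* one reference per holder, file included */
	/* ... payload ... */
};

static void ctx_free(struct kref *kref)
{
	kfree(container_of(kref, struct ctx_sketch, kref));
}

static struct ctx_sketch *ctx_get(struct ctx_sketch *ctx)
{
	kref_get(&ctx->kref);
	return ctx;
}

static void ctx_put(struct ctx_sketch *ctx)
{
	kref_put(&ctx->kref, ctx_free);	/* frees on the last put */
}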
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5458e80fc558..085c5c063420 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -98,7 +98,7 @@ struct epoll_filefd {
 struct nested_call_node {
 	struct list_head llink;
 	void *cookie;
-	int cpu;
+	void *ctx;
 };
 
 /*
@@ -317,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
  * @nproc: Nested call core function pointer.
  * @priv: Opaque data to be passed to the @nproc callback.
  * @cookie: Cookie to be used to identify this nested call.
+ * @ctx: This instance context.
  *
  * Returns: Returns the code returned by the @nproc callback, or -1 if
  *          the maximum recursion limit has been exceeded.
  */
 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 			  int (*nproc)(void *, void *, int), void *priv,
-			  void *cookie)
+			  void *cookie, void *ctx)
 {
 	int error, call_nests = 0;
 	unsigned long flags;
-	int this_cpu = get_cpu();
 	struct list_head *lsthead = &ncalls->tasks_call_list;
 	struct nested_call_node *tncur;
 	struct nested_call_node tnode;
@@ -340,7 +340,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	 * very much limited.
 	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-		if (tncur->cpu == this_cpu &&
+		if (tncur->ctx == ctx &&
 		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
@@ -352,7 +352,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	}
 
 	/* Add the current task and cookie to the list */
-	tnode.cpu = this_cpu;
+	tnode.ctx = ctx;
 	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
@@ -364,10 +364,9 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	/* Remove the current task from the list */
 	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
- out_unlock:
+out_unlock:
 	spin_unlock_irqrestore(&ncalls->lock, flags);
 
-	put_cpu();
 	return error;
 }
 
@@ -408,8 +407,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
  */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
+	int this_cpu = get_cpu();
+
 	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
-		       ep_poll_wakeup_proc, NULL, wq);
+		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+
+	put_cpu();
 }
 
 /*
@@ -663,7 +666,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep);
+				   ep_poll_readyevents_proc, ep, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
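Keying the nested-call bookkeeping on an opaque ctx instead of a cpu id matters because ep_call_nested() can be entered from preemptible, sleepable paths: a task may migrate between cpus mid-call, and two tasks may share a cpu, so the cpu id is neither stable nor unique. The wakeup path, which brackets the call with get_cpu()/put_cpu(), keeps using the pinned cpu id as its ctx; the poll path simply uses the task itself. A one-line sketch of the sleepable-path identity:

#include <linux/sched.h>

/* For any sleepable caller, the task pointer is a stable, unique key. */
static void *nested_call_identity(void)
{
	return current;
}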
diff --git a/fs/exec.c b/fs/exec.c
index 895823d0149d..fb4f3cdda78c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -677,8 +678,8 @@ exit:
 }
 EXPORT_SYMBOL(open_exec);
 
-int kernel_read(struct file *file, unsigned long offset,
+int kernel_read(struct file *file, loff_t offset,
 		char *addr, unsigned long count)
 {
 	mm_segment_t old_fs;
 	loff_t pos = offset;
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	task_lock(tsk);
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
+	perf_counter_comm(tsk);
 }
 
 int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	current->personality &= ~bprm->per_clear;
 
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_counter_exit_task(current);
+
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
@@ -1016,7 +1025,7 @@ void install_exec_creds(struct linux_binprm *bprm)
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
 
-	/* cred_exec_mutex must be held at least to this point to prevent
+	/* cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
 	 * credentials; any time after this it may be unlocked */
 
@@ -1026,7 +1035,7 @@ EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_exec_mutex to protect against
+ * - the caller must hold current->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1268,8 +1277,8 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
-	if (retval < 0)
+	retval = -ERESTARTNOINTR;
+	if (mutex_lock_interruptible(&current->cred_guard_mutex))
 		goto out_free;
 	current->in_execve = 1;
 
@@ -1331,7 +1340,7 @@ int do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1354,7 +1363,7 @@ out_unmark:
 
 out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
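One behavioural detail in the do_execve() hunk above: mutex_lock_interruptible() itself returns -EINTR, but execve must not fail with -EINTR halfway through setup, so the code substitutes -ERESTARTNOINTR, which makes the kernel transparently restart the syscall once the interrupting signal has been handled. The idiom in isolation (sketch):

#include <linux/mutex.h>
#include <linux/errno.h>

/* Take a guard mutex, converting a signal into a transparent restart. */
static int lock_or_restart(struct mutex *guard)
{
	if (mutex_lock_interruptible(guard))
		return -ERESTARTNOINTR;
	return 0;
}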
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1512c4bb8c7..c6718e4817fe 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -2,9 +2,7 @@
  * common.h - Common definitions for both Kernel and user-mode utilities
  *
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -175,10 +173,4 @@ int exofs_async_op(struct osd_request *or,
 
 int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
 
-int osd_req_read_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-
-int osd_req_write_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 65b0c8c776a1..4cfab1cc75c0 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 0fd4c7859679..5ec72e020b22 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -156,6 +154,9 @@ ino_t exofs_parent_ino(struct dentry *child);
 int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
 		struct inode *);
 
+/* super.c */
+int exofs_sync_fs(struct super_block *sb, int wait);
+
 /*********************
  * operation vectors *
  *********************/
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 6ed7fe484752..839b9dc1e70f 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -47,16 +45,23 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
 {
 	int ret;
 	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb;
 
 	ret = filemap_write_and_wait(mapping);
 	if (ret)
 		return ret;
 
-	/*Note: file_fsync below also calles sync_blockdev, which is a no-op
-	 * for exofs, but other then that it does sync_inode and
-	 * sync_superblock which is what we need here.
-	 */
-	return file_fsync(filp, dentry, datasync);
+	/* sync the inode attributes */
+	ret = write_inode_now(inode, 1);
+
+	/* This is a good place to write the sb */
+	/* TODO: Sechedule an sb-sync on create */
+	sb = inode->i_sb;
+	if (sb->s_dirt)
+		exofs_sync_fs(sb, 1);
+
+	return ret;
 }
 
 static int exofs_flush(struct file *file, fl_owner_t id)
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ba8d9fab4693..6c10f7476699 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * Copyright (C) 2005, 2006 2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) 3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009 4 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com> 5 * Boaz Harrosh <bharrosh@panasas.com>
8 * 6 *
@@ -59,10 +57,9 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode) 57 struct inode *inode)
60{ 58{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 59 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63 60
64 pcol->sbi = sbi; 61 pcol->sbi = sbi;
65 pcol->req_q = req_q; 62 pcol->req_q = osd_request_queue(sbi->s_dev);
66 pcol->inode = inode; 63 pcol->inode = inode;
67 pcol->expected_pages = expected_pages; 64 pcol->expected_pages = expected_pages;
68 65
@@ -266,7 +263,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
266 goto err; 263 goto err;
267 } 264 }
268 265
269 osd_req_read(or, &obj, pcol->bio, i_start); 266 osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
270 267
271 if (is_sync) { 268 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); 269 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
@@ -296,6 +293,9 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
296err: 293err:
297 if (!is_sync) 294 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ); 295 _unlock_pcol_pages(pcol, ret, READ);
296 else /* Pages unlocked by caller in sync mode only free bio */
297 pcol_free(pcol);
298
299 kfree(pcol_copy); 299 kfree(pcol_copy);
300 if (or) 300 if (or)
301 osd_end_request(or); 301 osd_end_request(or);
@@ -522,7 +522,8 @@ static int write_exec(struct page_collect *pcol)
522 522
523 *pcol_copy = *pcol; 523 *pcol_copy = *pcol;
524 524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start); 525 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
526 osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); 527 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) { 528 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); 529 EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 77fdd765e76d..b7dd0c236863 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b249ae97fb15..4372542df284 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -50,10 +48,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
 
 	/* FIXME: should be include in osd_sense_info */
 	if (in_resid)
-		*in_resid = or->in.req ? or->in.req->data_len : 0;
+		*in_resid = or->in.req ? or->in.req->resid_len : 0;
 
 	if (out_resid)
-		*out_resid = or->out.req ? or->out.req->data_len : 0;
+		*out_resid = or->out.req ? or->out.req->resid_len : 0;
 
 	return ret;
 }
@@ -125,29 +123,3 @@ int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
 
 	return -EIO;
 }
-
-int osd_req_read_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
-	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-
-	if (!bio)
-		return -ENOMEM;
-
-	osd_req_read(or, obj, bio, offset);
-	return 0;
-}
-
-int osd_req_write_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
-	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-
-	if (!bio)
-		return -ENOMEM;
-
-	osd_req_write(or, obj, bio, offset);
-	return 0;
-}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f1985e857e2..5ab10c3bbebe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -33,6 +31,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */
 
+#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/vfs.h>
@@ -200,20 +199,21 @@ static const struct export_operations exofs_export_ops;
 /*
  * Write the superblock to the OSD
  */
-static void exofs_write_super(struct super_block *sb)
+int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
 	struct osd_request *or;
 	struct osd_obj_id obj;
-	int ret;
+	int ret = -ENOMEM;
 
 	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
 	if (!fscb) {
 		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return;
+		return -ENOMEM;
 	}
 
+	lock_super(sb);
 	lock_kernel();
 	sbi = sb->s_fs_info;
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -246,7 +246,17 @@ out:
 	if (or)
 		osd_end_request(or);
 	unlock_kernel();
+	unlock_super(sb);
 	kfree(fscb);
+	return ret;
+}
+
+static void exofs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		exofs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 /*
@@ -258,6 +268,11 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
+	lock_kernel();
+
+	if (sb->s_dirt)
+		exofs_write_super(sb);
+
 	/* make sure there are no pending commands */
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -271,6 +286,8 @@ static void exofs_put_super(struct super_block *sb)
 	osduld_put_device(sbi->s_dev);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 /*
@@ -484,6 +501,7 @@ static const struct super_operations exofs_sops = {
 	.delete_inode	= exofs_delete_inode,
 	.put_super	= exofs_put_super,
 	.write_super	= exofs_write_super,
+	.sync_fs	= exofs_sync_fs,
 	.statfs		= exofs_statfs,
 };
 
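The exofs conversion above (and the ext2 one later in this diff) follows the same split: sync_fs() does the real superblock write, can report errors, and takes a wait flag, while write_super() shrinks to a periodic-writeback wrapper that skips read-only mounts. A sketch of the delegation, with placeholder names:

#include <linux/fs.h>

/* Sketch: real implementations write the on-disk superblock here. */
static int sketch_sync_fs(struct super_block *sb, int wait)
{
	sb->s_dirt = 0;
	return 0;
}

static void sketch_write_super(struct super_block *sb)
{
	if (!(sb->s_flags & MS_RDONLY))
		sketch_sync_fs(sb, 1);	/* periodic flush always waits */
	else
		sb->s_dirt = 0;		/* nothing to write when ro */
}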
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
index 36e2d7bc7f7b..4dd687c3e747 100644
--- a/fs/exofs/symlink.c
+++ b/fs/exofs/symlink.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index e0b2b43c1fdb..f42af45cfd88 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d46e38cb85c5..d636e1297cad 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -125,37 +125,12 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct posix_acl *
-ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = EXT2_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT2_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-	      struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT2_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * inode->i_mutex: don't care
  */
 static struct posix_acl *
 ext2_get_acl(struct inode *inode, int type)
 {
-	struct ext2_inode_info *ei = EXT2_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -164,23 +139,19 @@ ext2_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;
 
-	switch(type) {
-	case ACL_TYPE_ACCESS:
-		acl = ext2_iget_acl(inode, &ei->i_acl);
-		if (acl != EXT2_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-
-	case ACL_TYPE_DEFAULT:
-		acl = ext2_iget_acl(inode, &ei->i_default_acl);
-		if (acl != EXT2_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-
-	default:
-		return ERR_PTR(-EINVAL);
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
 	retval = ext2_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -197,17 +168,9 @@ ext2_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl)) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext2_iset_acl(inode, &ei->i_acl, acl);
-			break;
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
-		case ACL_TYPE_DEFAULT:
-			ext2_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
 	return acl;
 }
 
@@ -217,7 +180,6 @@ ext2_get_acl(struct inode *inode, int type)
 static int
 ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-	struct ext2_inode_info *ei = EXT2_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -263,17 +225,8 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	error = ext2_xattr_set(inode, name_index, "", value, size, 0);
 
 	kfree(value);
-	if (!error) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext2_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext2_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!error)
+		set_cached_acl(inode, type, acl);
 	return error;
 }
 
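The acl.c rewrite drops ext2's private ACL cache (two pointers in ext2_inode_info guarded by i_lock) in favour of the VFS-generic one behind get_cached_acl()/set_cached_acl(); that is also why the matching #ifdef blocks in ext2.h, inode.c, and super.c disappear later in this diff. The consumer-side shape, sketched with a stubbed xattr read:

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Sketch of the lookup-then-fill pattern ext2_get_acl() now follows. */
static struct posix_acl *get_acl_sketch(struct inode *inode, int type)
{
	struct posix_acl *acl = get_cached_acl(inode, type);

	if (acl != ACL_NOT_CACHED)
		return acl;	/* hit: helper returned its own reference */

	acl = NULL;		/* real code parses the on-disk xattr here */
	set_cached_acl(inode, type, acl);
	return acl;
}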
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index b42cf578554b..ecefe478898f 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -53,10 +53,6 @@ static inline int ext2_acl_count(size_t size)
 
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 
-/* Value for inode->u.ext2_i.i_acl and inode->u.ext2_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT2_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext2_permission (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72153b7..6cde970b0a1a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -450,7 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
-		   struct page *page, struct inode *inode)
+		   struct page *page, struct inode *inode, int update_times)
 {
 	loff_t pos = page_offset(page) +
 			(char *) de - (char *) page_address(page);
@@ -465,7 +465,8 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	ext2_set_de_type(de, inode);
 	err = ext2_commit_chunk(page, pos, len);
 	ext2_put_page(page);
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (update_times)
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
 	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(dir);
 }
@@ -720,5 +721,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3203042b36ef..9a8a8e27a063 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -27,7 +27,7 @@ struct ext2_inode_info {
 	/*
 	 * i_block_group is the number of the block group which contains
 	 * this file's inode. Constant across the lifetime of the inode,
-	 * it is ued for making block allocation decisions - we try to
+	 * it is used for making block allocation decisions - we try to
 	 * place a file's data blocks near its inode block, and new inodes
 	 * near to their parent directory's inode.
 	 */
@@ -47,10 +47,6 @@ struct ext2_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	struct posix_acl *i_acl;
-	struct posix_acl *i_default_acl;
-#endif
 	rwlock_t i_meta_lock;
 
 	/*
@@ -111,10 +107,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
-extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
-
-/* fsync.c */
-extern int ext2_sync_file (struct file *, struct dentry *, int);
+extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
 
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed07122182..2b9e47dc9222 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
 #endif
 
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
deleted file mode 100644
index fc66c93fcb5c..000000000000
--- a/fs/ext2/fsync.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * linux/fs/ext2/fsync.c
- *
- * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk)
- * from
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext2fs fsync primitive
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- *
- * Removed unnecessary code duplication for little endian machines
- * and excessive __inline__s.
- * Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-
-#include "ext2.h"
-#include <linux/buffer_head.h>	/* for sync_mapping_buffers() */
-
-
-/*
- * File may be NULL when we are called. Perhaps we shouldn't
- * even pass file to fsync ?
- */
-
-int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = ext2_sync_inode(inode);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
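Deleting fs/ext2/fsync.c is safe because the body removed above is essentially what the new generic helper simple_fsync() provides for any plain blockdev filesystem: flush the buffers associated with the mapping, then write the inode back only if (and only as far as) it is dirty. A sketch of that logic, mirroring the deleted code:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* Sketch of simple_fsync()'s shape, per the code deleted above. */
static int fsync_sketch(struct file *file, struct dentry *dentry, int datasync)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata only; data already synced */
	};
	struct inode *inode = dentry->d_inode;
	int err, ret;

	ret = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY))
		return ret;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return ret;

	err = sync_inode(inode, &wbc);
	if (ret == 0)
		ret = err;
	return ret;
}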
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index acf678831103..e27130341d4f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
 
-static int ext2_update_inode(struct inode * inode, int do_sync);
-
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode)
 		goto no_delete;
 	EXT2_I(inode)->i_dtime = get_seconds();
 	mark_inode_dirty(inode);
-	ext2_update_inode(inode, inode_needs_sync(inode));
+	ext2_write_inode(inode, inode_needs_sync(inode));
 
 	inode->i_size = 0;
 	if (inode->i_blocks)
@@ -1226,10 +1224,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		return inode;
 
 	ei = EXT2_I(inode);
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	ei->i_acl = EXT2_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 
 	raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
@@ -1337,7 +1331,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-static int ext2_update_inode(struct inode * inode, int do_sync)
+int ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1442,11 +1436,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
 	return err;
 }
 
-int ext2_write_inode(struct inode *inode, int wait)
-{
-	return ext2_update_inode(inode, wait);
-}
-
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 7cb4badef927..e7431309bdca 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -13,7 +13,6 @@
 #include <linux/sched.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 90ea17998a73..e1dedb0f7873 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -66,8 +66,16 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext2_error(dir->i_sb, __func__,
+						"deleted inode referenced: %lu",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -320,7 +328,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ext2_set_link(new_dir, new_de, new_page, old_inode);
+		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			drop_nlink(new_inode);
@@ -352,7 +360,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ext2_set_link(old_inode, dir_de, dir_page, new_dir);
+		if (old_dir != new_dir)
+			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5c4afe652245..1a9ffee47d56 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext2_sync_fs(struct super_block *sb, int wait);

 void ext2_error (struct super_block * sb, const char * function,
 		const char * fmt, ...)
@@ -114,6 +115,11 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);

+	lock_kernel();
+
+	if (sb->s_dirt)
+		ext2_write_super(sb);
+
 	ext2_xattr_put_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
@@ -135,7 +141,7 @@ static void ext2_put_super (struct super_block * sb)
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);

-	return;
+	unlock_kernel();
 }

 static struct kmem_cache * ext2_inode_cachep;
@@ -146,10 +152,6 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
 	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	ei->i_acl = EXT2_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
@@ -192,18 +194,6 @@ static void destroy_inodecache(void)
 static void ext2_clear_inode(struct inode *inode)
 {
 	struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	struct ext2_inode_info *ei = EXT2_I(inode);
-
-	if (ei->i_acl && ei->i_acl != EXT2_ACL_NOT_CACHED) {
-		posix_acl_release(ei->i_acl);
-		ei->i_acl = EXT2_ACL_NOT_CACHED;
-	}
-	if (ei->i_default_acl && ei->i_default_acl != EXT2_ACL_NOT_CACHED) {
-		posix_acl_release(ei->i_default_acl);
-		ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-	}
-#endif
 	ext2_discard_reservation(inode);
 	EXT2_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
@@ -304,6 +294,7 @@ static const struct super_operations ext2_sops = {
 	.delete_inode	= ext2_delete_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
+	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
@@ -1093,6 +1084,7 @@ failed_mount:
 	brelse(bh);
 failed_sbi:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return ret;
 }
@@ -1126,25 +1118,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * set s_state to EXT2_VALID_FS after some corrections.
  */

-void ext2_write_super (struct super_block * sb)
+static int ext2_sync_fs(struct super_block *sb, int wait)
 {
-	struct ext2_super_block * es;
+	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
 	lock_kernel();
-	if (!(sb->s_flags & MS_RDONLY)) {
-		es = EXT2_SB(sb)->s_es;
-
-		if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
-			ext2_debug ("setting valid to 0\n");
-			es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
-			es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
-			es->s_mtime = cpu_to_le32(get_seconds());
-			ext2_sync_super(sb, es);
-		} else
-			ext2_commit_super (sb, es);
+	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
+		ext2_debug("setting valid to 0\n");
+		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
+		es->s_free_blocks_count =
+			cpu_to_le32(ext2_count_free_blocks(sb));
+		es->s_free_inodes_count =
+			cpu_to_le32(ext2_count_free_inodes(sb));
+		es->s_mtime = cpu_to_le32(get_seconds());
+		ext2_sync_super(sb, es);
+	} else {
+		ext2_commit_super(sb, es);
 	}
 	sb->s_dirt = 0;
 	unlock_kernel();
+
+	return 0;
+}
+
+
+void ext2_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ext2_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }

 static int ext2_remount (struct super_block * sb, int * flags, char * data)
@@ -1156,6 +1159,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;

+	lock_kernel();
+
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1191,12 +1196,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
-		    !(sbi->s_mount_state & EXT2_VALID_FS))
+		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+			unlock_kernel();
 			return 0;
+		}
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
@@ -1223,12 +1232,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sb->s_flags &= ~MS_RDONLY;
 	}
 	ext2_sync_super(sb, es);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
+	unlock_kernel();
 	return err;
 }

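The ext2/super.c hunks above split the old write_super into a proper sync_fs method plus a thin wrapper. As a rough orientation, the sketch below approximates how the 2.6.31-era VFS drives the two operations; the example_* helpers are invented names, and the bodies paraphrase fs/super.c and fs/sync.c of that period rather than quoting them.

#include <linux/fs.h>

/* Sketch only: pdflush-style periodic writeback, gated on sb->s_dirt. */
static void example_periodic_writeback(struct super_block *sb)
{
	if (sb->s_dirt && sb->s_op->write_super)
		sb->s_op->write_super(sb);	/* ext2: ext2_write_super() */
}

/* Sketch only: sync(2)/umount path, which may wait for completion. */
static int example_sync_filesystem(struct super_block *sb)
{
	if (sb->s_op->sync_fs)
		return sb->s_op->sync_fs(sb, 1);	/* ext2: ext2_sync_fs() */
	return 0;
}

With this split, the periodic path and the explicit-sync path funnel into the same ext2_sync_fs(), which is why the hunk can reduce write_super to a two-line wrapper.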
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b135..522b15498f45 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
 	  module will be called ext3.

 config EXT3_DEFAULTS_TO_ORDERED
-	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
 	help
-	  If a filesystem does not explicitly specify a data ordering
-	  mode, and the journal capability allowed it, ext3 used to
-	  historically default to 'data=ordered'.
-
-	  That was a rather unfortunate choice, because it leads to all
-	  kinds of latency problems, and the 'data=writeback' mode is more
-	  appropriate these days.
-
-	  You should probably always answer 'n' here, and if you really
-	  want to use 'data=ordered' mode, set it in the filesystem itself
-	  with 'tune2fs -o journal_data_ordered'.
-
-	  But if you really want to enable the legacy default, you can do
-	  so by answering 'y' to this question.
+	  The journal mode options for ext3 have different tradeoffs
+	  between when data is guaranteed to be on disk and
+	  performance.  The use of "data=writeback" can cause
+	  unwritten data to appear in files after a system crash or
+	  power failure, which can be a security issue.  However,
+	  "data=ordered" mode can also result in major performance
+	  problems, including seconds-long delays before an fsync()
+	  call returns.  For details, see:
+
+	  http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+	  If you have been historically happy with ext3's performance,
+	  data=ordered mode will be a safe choice and you should
+	  answer 'y' here.  If you understand the reliability and data
+	  privacy issues of data=writeback and are willing to make
+	  that trade off, answer 'n'.

 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index d81ef2fdb08e..e167bae37ef0 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -126,30 +126,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }

-static inline struct posix_acl *
-ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-	      struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -158,7 +134,6 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext3_get_acl(struct inode *inode, int type)
 {
-	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -167,24 +142,21 @@ ext3_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;

-	switch(type) {
-	case ACL_TYPE_ACCESS:
-		acl = ext3_iget_acl(inode, &ei->i_acl);
-		if (acl != EXT3_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-
-	case ACL_TYPE_DEFAULT:
-		acl = ext3_iget_acl(inode, &ei->i_default_acl);
-		if (acl != EXT3_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-
-	default:
-		return ERR_PTR(-EINVAL);
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
+
 	retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
 		value = kmalloc(retval, GFP_NOFS);
@@ -200,17 +172,9 @@ ext3_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);

-	if (!IS_ERR(acl)) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext3_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext3_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
 	return acl;
 }

@@ -223,7 +187,6 @@ static int
 ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
-	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -268,17 +231,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 				      value, size, 0);

 	kfree(value);
-	if (!error) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext3_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext3_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
 	return error;
 }

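The acl.c hunks drop the filesystem-private EXT3_ACL_NOT_CACHED sentinel in favour of the generic get_cached_acl()/set_cached_acl() helpers. Those helpers are real (they cache ACLs in the generic inode->i_acl and inode->i_default_acl fields introduced alongside this series), but the body below is only an approximation of their behaviour for orientation, not a verbatim copy of include/linux/posix_acl.h.

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Approximate behaviour of get_cached_acl(); sketch, not kernel source. */
static struct posix_acl *sketch_get_cached_acl(struct inode *inode, int type)
{
	struct posix_acl **p = (type == ACL_TYPE_ACCESS) ?
				&inode->i_acl : &inode->i_default_acl;
	struct posix_acl *acl = ACL_NOT_CACHED;

	spin_lock(&inode->i_lock);
	if (*p != ACL_NOT_CACHED)
		acl = posix_acl_dup(*p);	/* take a reference */
	spin_unlock(&inode->i_lock);
	return acl;
}

The net effect is that the lock/dup/release dance deleted above now lives in one shared place instead of being cloned into ext2, ext3, and ext4.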
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 42da16b8cac0..07d15a3a5969 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -53,10 +53,6 @@ static inline int ext3_acl_count(size_t size)

 #ifdef CONFIG_EXT3_FS_POSIX_ACL

-/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT3_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext3_permission (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202db8974..27967f92e820 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -649,7 +649,7 @@ do_more:
 		count = overflow;
 		goto do_more;
 	}
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, err);
@@ -1708,7 +1708,6 @@ allocated:
 	if (!fatal)
 		fatal = err;

-	sb->s_dirt = 1;
 	if (fatal)
 		goto out;

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;

 		map_bh.b_state = 0;
-		err = ext3_get_blocks_handle(NULL, inode, blk, 1,
-					&map_bh, 0, 0);
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60d524b..b39991285136 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (!fatal)
 		fatal = err;
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, fatal);
@@ -537,7 +537,6 @@ got:
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
-	sb->s_dirt = 1;

 	inode->i_uid = current_fsuid();
 	if (test_opt (sb, GRPID))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index fcfa24361856..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		sector_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+		int create)
 {
 	int err = -EIO;
 	int offsets[4];
@@ -820,7 +820,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	while (count < maxblocks && count <= blocks_to_boundary) {
 		ext3_fsblk_t blk;

-		if (!verify_chain(chain, partial)) {
+		if (!verify_chain(chain, chain + depth - 1)) {
 			/*
 			 * Indirect block might be removed by
 			 * truncate while we were reading it.
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext3_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by truncate_mutex. Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext3_get_block() -bzzz
-	 */
-	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-		ei->i_disksize = inode->i_size;
 	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
 	}

 	ret = ext3_get_blocks_handle(handle, inode, iblock,
-					max_blocks, bh_result, create, 0);
+					max_blocks, bh_result, create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	err = ext3_get_blocks_handle(handle, inode, block, 1,
-					&dummy, create, 1);
+					&dummy, create);
 	/*
 	 * ext3_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
@@ -1193,15 +1186,16 @@ write_begin_failed:
 		 * i_size_read because we hold i_mutex.
 		 *
 		 * Add inode to orphan list in case we crash before truncate
-		 * finishes.
+		 * finishes. Do this only if ext3_can_truncate() agrees so
+		 * that orphan processing code is happy.
 		 */
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext3_can_truncate(inode))
 			ext3_orphan_add(handle, inode);
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
 		if (pos + len > inode->i_size)
-			vmtruncate(inode, inode->i_size);
+			ext3_truncate(inode);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
@@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file,
 	page_cache_release(page);

 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }

@@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);

 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }

@@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file,
 	page_cache_release(page);

 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }

@@ -2374,7 +2368,7 @@ void ext3_truncate(struct inode *inode)
 	struct page *page;

 	if (!ext3_can_truncate(inode))
-		return;
+		goto out_notrans;

 	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
 		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
@@ -2390,7 +2384,7 @@ void ext3_truncate(struct inode *inode)
 		page = grab_cache_page(mapping,
 				inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
-			return;
+			goto out_notrans;
 	}

 	handle = start_transaction(inode);
@@ -2401,7 +2395,7 @@ void ext3_truncate(struct inode *inode)
 			unlock_page(page);
 			page_cache_release(page);
 		}
-		return;		/* AKPM: return what? */
+		goto out_notrans;
 	}

 	last_block = (inode->i_size + blocksize-1)
@@ -2525,6 +2519,14 @@ out_stop:
 		ext3_orphan_del(handle, inode);

 	ext3_journal_stop(handle);
+	return;
+out_notrans:
+	/*
+	 * Delete the inode from orphan list so that it doesn't stay there
+	 * forever and trigger assertion on umount.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(NULL, inode);
 }

 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2744,10 +2746,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 		return inode;

 	ei = EXT3_I(inode);
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	ei->i_acl = EXT3_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;

 	ret = __ext3_get_inode_loc(inode, &iloc, 0);
@@ -2960,7 +2958,6 @@ static int ext3_do_update_inode(handle_t *handle,
 			ext3_update_dynamic_rev(sb);
 			EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-			sb->s_dirt = 1;
 			handle->h_sync = 1;
 			err = ext3_journal_dirty_metadata(handle,
 					EXT3_SB(sb)->s_sbh);
@@ -3123,12 +3120,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)

 	rc = inode_setattr(inode, attr);

-	/* If inode_setattr's call to ext3_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
-	if (inode->i_nlink)
-		ext3_orphan_del(NULL, inode);
-
 	if (!rc && (ia_valid & ATTR_MODE))
 		rc = ext3_acl_chmod(inode);

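Every write_end path above now asks ext3_can_truncate() before putting the inode on the orphan list. For reference, the predicate is roughly the following (a sketch of the helper in fs/ext3/inode.c; the sketch_ prefix marks it as an illustration rather than the committed source):

#include <linux/fs.h>

/* Sketch: truncate is only meaningful for these inode types/flags. */
static int sketch_ext3_can_truncate(struct inode *inode)
{
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return 0;
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext3_inode_is_fast_symlink(inode);
	return 0;
}

That is also why ext3_truncate() itself gained the out_notrans label: if the function bails out early, an inode that was orphan-listed by write_begin must still be removed from the in-core orphan list, which the deleted ext3_setattr() cleanup used to paper over.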
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf3836370..8359e7b3dc89 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 			   EXT3_INODES_PER_GROUP(sb));

 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-	sb->s_dirt = 1;

 exit_journal:
 	unlock_super(sb);
@@ -991,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext3_warning(sb, __func__,
-			"CONFIG_LBD not enabled\n");
+			"CONFIG_LBDAF not enabled\n");
 		return -EINVAL;
 	}

@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	}
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	sb->s_dirt = 1;
 	unlock_super(sb);
 	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c3..a8d80a7f1105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext3_unfreeze(struct super_block *sb);
-static void ext3_write_super (struct super_block * sb);
 static int ext3_freeze(struct super_block *sb);

 /*
@@ -399,6 +398,8 @@ static void ext3_put_super (struct super_block * sb)
 	struct ext3_super_block *es = sbi->s_es;
 	int i, err;

+	lock_kernel();
+
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -447,7 +448,8 @@ static void ext3_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
+
+	unlock_kernel();
 }

 static struct kmem_cache *ext3_inode_cachep;
@@ -462,10 +464,6 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	ei->i_acl = EXT3_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
@@ -516,18 +514,6 @@ static void destroy_inodecache(void)
 static void ext3_clear_inode(struct inode *inode)
 {
 	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	if (EXT3_I(inode)->i_acl &&
-	    EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_acl);
-		EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-	}
-	if (EXT3_I(inode)->i_default_acl &&
-	    EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_default_acl);
-		EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-	}
-#endif
 	ext3_discard_reservation(inode);
 	EXT3_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
@@ -557,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
 #endif
 }

+static char *data_mode_string(unsigned long mode)
+{
+	switch (mode) {
+	case EXT3_MOUNT_JOURNAL_DATA:
+		return "journal";
+	case EXT3_MOUNT_ORDERED_DATA:
+		return "ordered";
+	case EXT3_MOUNT_WRITEBACK_DATA:
+		return "writeback";
+	}
+	return "unknown";
+}
+
 /*
  * Show an option if
  *  - it's set to a non-default value OR
@@ -630,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");

-	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
-		seq_puts(seq, ",data=journal");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
-		seq_puts(seq, ",data=ordered");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
-		seq_puts(seq, ",data=writeback");
-
+	seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
+						     EXT3_MOUNT_DATA_FLAGS));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");

@@ -761,7 +755,6 @@ static const struct super_operations ext3_sops = {
 	.dirty_inode	= ext3_dirty_inode,
 	.delete_inode	= ext3_delete_inode,
 	.put_super	= ext3_put_super,
-	.write_super	= ext3_write_super,
 	.sync_fs	= ext3_sync_fs,
 	.freeze_fs	= ext3_freeze,
 	.unfreeze_fs	= ext3_unfreeze,
@@ -1039,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
 	datacheck:
 		if (is_remount) {
 			if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
-					!= data_opt) {
-				printk(KERN_ERR
-					"EXT3-fs: cannot change data "
-					"mode on remount\n");
-				return 0;
-			}
+					== data_opt)
+				break;
+			printk(KERN_ERR
+				"EXT3-fs (device %s): Cannot change "
+				"data mode on remount. The filesystem "
+				"is mounted in data=%s mode and you "
+				"try to remount it in data=%s mode.\n",
+				sb->s_id,
+				data_mode_string(sbi->s_mount_opt &
+						 EXT3_MOUNT_DATA_FLAGS),
+				data_mode_string(data_opt));
+			return 0;
 		} else {
 			sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
 			sbi->s_mount_opt |= data_opt;
@@ -1696,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}

-	hblock = bdev_hardsect_size(sb->s_bdev);
+	hblock = bdev_logical_block_size(sb->s_bdev);
 	if (sb->s_blocksize != blocksize) {
 		/*
 		 * Make sure the blocksize for the filesystem is larger
@@ -1785,7 +1784,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 #else
 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
-		sb->s_dirt = 1;
 	}

 	if (sbi->s_blocks_per_group > blocksize * 8) {
@@ -1812,7 +1810,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
 			" too large to mount safely\n", sb->s_id);
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+			printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
 				"enabled\n");
 		goto failed_mount;
 	}
@@ -2021,6 +2019,7 @@ failed_mount:
 	brelse(bh);
 out_fail:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
 	return ret;
@@ -2119,7 +2118,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	}

 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT3-fs: blocksize too small for journal device.\n");
@@ -2264,7 +2263,6 @@ static int ext3_load_journal(struct super_block *sb,
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
-		sb->s_dirt = 1;

 		/* Make sure we flush the recovery flag to disk. */
 		ext3_commit_super(sb, es, 1);
@@ -2307,7 +2305,6 @@ static int ext3_create_journal(struct super_block * sb,
 	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);

 	es->s_journal_inum = cpu_to_le32(journal_inum);
-	sb->s_dirt = 1;

 	/* Make sure we flush the recovery flag to disk. */
 	ext3_commit_super(sb, es, 1);
@@ -2353,7 +2350,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		sb->s_dirt = 0;
 		ext3_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
@@ -2412,29 +2408,14 @@ int ext3_force_commit(struct super_block *sb)
 		return 0;

 	journal = EXT3_SB(sb)->s_journal;
-	sb->s_dirt = 0;
 	ret = ext3_journal_force_commit(journal);
 	return ret;
 }

-/*
- * Ext3 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
- */
-static void ext3_write_super (struct super_block * sb)
-{
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
-}
-
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;

-	sb->s_dirt = 0;
 	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
 		if (wait)
 			log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2450,7 +2431,6 @@ static int ext3_freeze(struct super_block *sb)
 {
 	int error = 0;
 	journal_t *journal;
-	sb->s_dirt = 0;

 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal = EXT3_SB(sb)->s_journal;
@@ -2508,7 +2488,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif

+	lock_kernel();
+
 	/* Store the original options */
+	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
 	old_opts.s_resuid = sbi->s_resuid;
@@ -2616,6 +2599,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
 			kfree(old_opts.s_qf_names[i]);
 #endif
+	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2632,6 +2617,8 @@ restore_opts:
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
+	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }

diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 83b7be849bd5..545e37c4b91e 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle,

 	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
 		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
-		sb->s_dirt = 1;
 		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	}
 }
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003a00f7..8867b2a1e5fe 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o

 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o

 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 647e0d65a284..f6d8967149ca 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -126,30 +126,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }

-static inline struct posix_acl *
-ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT4_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-	      struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT4_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -158,7 +134,6 @@ ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext4_get_acl(struct inode *inode, int type)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -167,23 +142,19 @@ ext4_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;

+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		acl = ext4_iget_acl(inode, &ei->i_acl);
-		if (acl != EXT4_ACL_NOT_CACHED)
-			return acl;
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
 		break;
-
 	case ACL_TYPE_DEFAULT:
-		acl = ext4_iget_acl(inode, &ei->i_default_acl);
-		if (acl != EXT4_ACL_NOT_CACHED)
-			return acl;
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
 		break;
-
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -200,17 +171,9 @@ ext4_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);

-	if (!IS_ERR(acl)) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			ext4_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext4_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
 	return acl;
 }

@@ -223,7 +186,6 @@ static int
 ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -268,17 +230,9 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 				      value, size, 0);

 	kfree(value);
-	if (!error) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			ext4_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext4_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
 	return error;
 }

diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cb45257a246e..949789d2bba6 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -53,10 +53,6 @@ static inline int ext4_acl_count(size_t size)

 #ifdef CONFIG_EXT4_FS_POSIX_ACL

-/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT4_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext4_permission(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad85877..e2126d70dff5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
 #include "mballoc.h"

 /*
@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
 	int bit, bit_max;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned free_blocks, group_blocks;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}

-	if (block_group == sbi->s_groups_count - 1) {
+	if (block_group == ngroups - 1) {
 		/*
 		 * Even though mke2fs always initialize first and last group
 		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 {
 	unsigned int group_desc;
 	unsigned int offset;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

-	if (block_group >= sbi->s_groups_count) {
+	if (block_group >= ngroups) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "block_group >= groups_count - "
 			   "block_group = %u, groups_count = %u",
-			   block_group, sbi->s_groups_count);
+			   block_group, ngroups);

 		return NULL;
 	}
-	smp_rmb();

 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	down_write(&grp->alloc_sem);
 	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 						bit + i, bitmap_bh->b_data)) {
 			ext4_error(sb, __func__,
 				   "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 			blocks_freed++;
 		}
 	}
-	spin_lock(sb_bgl_lock(sbi, block_group));
+	ext4_lock_group(sb, block_group);
 	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
 	ext4_free_blks_set(sb, desc, blk_free_count);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);

 	if (sbi->s_log_groups_per_flex) {
@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	ext4_fsblk_t desc_count;
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	bitmap_count = 0;
 	gdp = NULL;

-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		return bitmap_count;
 #else
 	desc_count = 0;
-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
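The balloc.c hunks replace direct reads of sbi->s_groups_count (plus the scattered smp_rmb() calls) with ext4_get_groups_count(). The helper is defined in fs/ext4/ext4.h in this series; the version below is a close paraphrase for orientation and may differ cosmetically from the committed one.

#include "ext4.h"

/* Paraphrased helper: one ordered read of the (resizable) group count. */
static inline ext4_group_t sketch_ext4_get_groups_count(struct super_block *sb)
{
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;

	smp_rmb();	/* pairs with the barrier taken by online resize */
	return ngroups;
}

Centralizing the barrier means callers can no longer forget it when online resize grows s_groups_count underneath them.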
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 000000000000..50784ef07563
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
1/*
2 * linux/fs/ext4/block_validity.c
3 *
4 * Copyright (C) 2009
5 * Theodore Ts'o (tytso@mit.edu)
6 *
7 * Track which blocks in the filesystem are metadata blocks that
8 * should never be used as data blocks by files or directories.
9 */
10
11#include <linux/time.h>
12#include <linux/fs.h>
13#include <linux/namei.h>
14#include <linux/quotaops.h>
15#include <linux/buffer_head.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/version.h>
20#include <linux/blkdev.h>
21#include <linux/mutex.h>
22#include "ext4.h"
23
24struct ext4_system_zone {
25 struct rb_node node;
26 ext4_fsblk_t start_blk;
27 unsigned int count;
28};
29
30static struct kmem_cache *ext4_system_zone_cachep;
31
32int __init init_ext4_system_zone(void)
33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM;
38 return 0;
39}
40
41void exit_ext4_system_zone(void)
42{
43 kmem_cache_destroy(ext4_system_zone_cachep);
44}
45
46static inline int can_merge(struct ext4_system_zone *entry1,
47 struct ext4_system_zone *entry2)
48{
49 if ((entry1->start_blk + entry1->count) == entry2->start_blk)
50 return 1;
51 return 0;
52}
53
54/*
55 * Mark a range of blocks as belonging to the "system zone" --- that
56 * is, filesystem metadata blocks which should never be used by
57 * inodes.
58 */
59static int add_system_zone(struct ext4_sb_info *sbi,
60 ext4_fsblk_t start_blk,
61 unsigned int count)
62{
63 struct ext4_system_zone *new_entry = NULL, *entry;
64 struct rb_node **n = &sbi->system_blks.rb_node, *node;
65 struct rb_node *parent = NULL, *new_node = NULL;
66
67 while (*n) {
68 parent = *n;
69 entry = rb_entry(parent, struct ext4_system_zone, node);
70 if (start_blk < entry->start_blk)
71 n = &(*n)->rb_left;
72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right;
74 else {
75 if (start_blk + count > (entry->start_blk +
76 entry->count))
77 entry->count = (start_blk + count -
78 entry->start_blk);
79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone,
81 node);
82 break;
83 }
84 }
85
86 if (!new_entry) {
87 new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
88 GFP_KERNEL);
89 if (!new_entry)
90 return -ENOMEM;
91 new_entry->start_blk = start_blk;
92 new_entry->count = count;
93 new_node = &new_entry->node;
94
95 rb_link_node(new_node, parent, n);
96 rb_insert_color(new_node, &sbi->system_blks);
97 }
98
99 /* Can we merge to the left? */
100 node = rb_prev(new_node);
101 if (node) {
102 entry = rb_entry(node, struct ext4_system_zone, node);
103 if (can_merge(entry, new_entry)) {
104 new_entry->start_blk = entry->start_blk;
105 new_entry->count += entry->count;
106 rb_erase(node, &sbi->system_blks);
107 kmem_cache_free(ext4_system_zone_cachep, entry);
108 }
109 }
110
111 /* Can we merge to the right? */
112 node = rb_next(new_node);
113 if (node) {
114 entry = rb_entry(node, struct ext4_system_zone, node);
115 if (can_merge(new_entry, entry)) {
116 new_entry->count += entry->count;
117 rb_erase(node, &sbi->system_blks);
118 kmem_cache_free(ext4_system_zone_cachep, entry);
119 }
120 }
121 return 0;
122}
123
124static void debug_print_tree(struct ext4_sb_info *sbi)
125{
126 struct rb_node *node;
127 struct ext4_system_zone *entry;
128 int first = 1;
129
130 printk(KERN_INFO "System zones: ");
131 node = rb_first(&sbi->system_blks);
132 while (node) {
133 entry = rb_entry(node, struct ext4_system_zone, node);
134 printk("%s%llu-%llu", first ? "" : ", ",
135 entry->start_blk, entry->start_blk + entry->count - 1);
136 first = 0;
137 node = rb_next(node);
138 }
139 printk("\n");
140}
141
142int ext4_setup_system_zone(struct super_block *sb)
143{
144 ext4_group_t ngroups = ext4_get_groups_count(sb);
145 struct ext4_sb_info *sbi = EXT4_SB(sb);
146 struct ext4_group_desc *gdp;
147 ext4_group_t i;
148 int flex_size = ext4_flex_bg_size(sbi);
149 int ret;
150
151 if (!test_opt(sb, BLOCK_VALIDITY)) {
152 if (EXT4_SB(sb)->system_blks.rb_node)
153 ext4_release_system_zone(sb);
154 return 0;
155 }
156 if (EXT4_SB(sb)->system_blks.rb_node)
157 return 0;
158
159 for (i=0; i < ngroups; i++) {
160 if (ext4_bg_has_super(sb, i) &&
161 ((i < 5) || ((i % flex_size) == 0)))
162 add_system_zone(sbi, ext4_group_first_block_no(sb, i),
163 sbi->s_gdb_count + 1);
164 gdp = ext4_get_group_desc(sb, i, NULL);
165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
166 if (ret)
167 return ret;
168 ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
169 if (ret)
170 return ret;
171 ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
172 sbi->s_itb_per_group);
173 if (ret)
174 return ret;
175 }
176
177 if (test_opt(sb, DEBUG))
178 debug_print_tree(EXT4_SB(sb));
179 return 0;
180}
181
182/* Called when the filesystem is unmounted */
183void ext4_release_system_zone(struct super_block *sb)
184{
185 struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
186 struct rb_node *parent;
187 struct ext4_system_zone *entry;
188
189 while (n) {
190 /* Do the node's children first */
191 if (n->rb_left) {
192 n = n->rb_left;
193 continue;
194 }
195 if (n->rb_right) {
196 n = n->rb_right;
197 continue;
198 }
199 /*
200 * The node has no children; free it, and then zero
201 * out parent's link to it. Finally go to the
202 * beginning of the loop and try to free the parent
203 * node.
204 */
205 parent = rb_parent(n);
206 entry = rb_entry(n, struct ext4_system_zone, node);
207 kmem_cache_free(ext4_system_zone_cachep, entry);
208 if (!parent)
209 EXT4_SB(sb)->system_blks.rb_node = NULL;
210 else if (parent->rb_left == n)
211 parent->rb_left = NULL;
212 else if (parent->rb_right == n)
213 parent->rb_right = NULL;
214 n = parent;
215 }
216 EXT4_SB(sb)->system_blks.rb_node = NULL;
217}
218
219/*
220 * Returns 1 if the passed-in block region (start_blk,
221 * start_blk+count) is valid; 0 if some part of the block region
222 * overlaps with filesystem metadata blocks.
223 */
224int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
225 unsigned int count)
226{
227 struct ext4_system_zone *entry;
228 struct rb_node *n = sbi->system_blks.rb_node;
229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count > ext4_blocks_count(sbi->s_es)))
232 return 0;
233 while (n) {
234 entry = rb_entry(n, struct ext4_system_zone, node);
235 if (start_blk + count - 1 < entry->start_blk)
236 n = n->rb_left;
237 else if (start_blk >= (entry->start_blk + entry->count))
238 n = n->rb_right;
239 else
240 return 0;
241 }
242 return 1;
243}
244
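/*
 * Editor's usage sketch (hypothetical helper, not part of the patch):
 * block-mapping paths can reject corrupt on-disk block pointers before
 * dereferencing them; the extents.c hunks below convert
 * ext4_valid_extent()/ext4_valid_extent_idx() to do exactly this.
 */
static inline int example_check_pblk(struct inode *inode,
				     ext4_fsblk_t pblk, unsigned int len)
{
	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), pblk, len)) {
		ext4_error(inode->i_sb, __func__, "bad block %llu",
			   (unsigned long long) pblk);
		return -EIO;
	}
	return 0;
}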
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b64789929a65..9dc93168e262 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
131 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
132 132
133 map_bh.b_state = 0; 133 map_bh.b_state = 0;
134 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 134 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
135 0, 0, 0);
136 if (err > 0) { 135 if (err > 0) {
137 pgoff_t index = map_bh.b_blocknr >> 136 pgoff_t index = map_bh.b_blocknr >>
138 (PAGE_CACHE_SHIFT - inode->i_blkbits); 137 (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef56de1..9714db393efe 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,14 @@
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/quota.h> 23#include <linux/quota.h>
24#include "ext4_i.h" 24#include <linux/rwsem.h>
25#include <linux/rbtree.h>
26#include <linux/seqlock.h>
27#include <linux/mutex.h>
28#include <linux/timer.h>
29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h>
25 32
26/* 33/*
27 * The fourth extended filesystem constants/structures 34 * The fourth extended filesystem constants/structures
@@ -46,6 +53,19 @@
46#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
47#endif 54#endif
48 55
56/* data type for block offset of block group */
57typedef int ext4_grpblk_t;
58
59/* data type for filesystem-wide blocks number */
60typedef unsigned long long ext4_fsblk_t;
61
62/* data type for file logical block number */
63typedef __u32 ext4_lblk_t;
64
65/* data type for block group number */
66typedef unsigned int ext4_group_t;
67
68
49/* prefer goal again. length */ 69/* prefer goal again. length */
50#define EXT4_MB_HINT_MERGE 1 70#define EXT4_MB_HINT_MERGE 1
51/* blocks already reserved */ 71/* blocks already reserved */
@@ -73,20 +93,20 @@
73struct ext4_allocation_request { 93struct ext4_allocation_request {
74 /* target inode for block we're allocating */ 94 /* target inode for block we're allocating */
75 struct inode *inode; 95 struct inode *inode;
96 /* how many blocks we want to allocate */
97 unsigned int len;
76 /* logical block in target inode */ 98 /* logical block in target inode */
77 ext4_lblk_t logical; 99 ext4_lblk_t logical;
78 /* phys. target (a hint) */
79 ext4_fsblk_t goal;
80 /* the closest logical allocated block to the left */ 100 /* the closest logical allocated block to the left */
81 ext4_lblk_t lleft; 101 ext4_lblk_t lleft;
82 /* phys. block for ^^^ */
83 ext4_fsblk_t pleft;
84 /* the closest logical allocated block to the right */ 102 /* the closest logical allocated block to the right */
85 ext4_lblk_t lright; 103 ext4_lblk_t lright;
86 /* phys. block for ^^^ */ 104 /* phys. target (a hint) */
105 ext4_fsblk_t goal;
106 /* phys. block for the closest logical allocated block to the left */
107 ext4_fsblk_t pleft;
108 /* phys. block for the closest logical allocated block to the right */
87 ext4_fsblk_t pright; 109 ext4_fsblk_t pright;
88 /* how many blocks we want to allocate */
89 unsigned int len;
90 /* flags. see above EXT4_MB_HINT_* */ 110 /* flags. see above EXT4_MB_HINT_* */
91 unsigned int flags; 111 unsigned int flags;
92}; 112};
@@ -179,9 +199,6 @@ struct flex_groups {
179#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ 199#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
180#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ 200#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
181 201
182#ifdef __KERNEL__
183#include "ext4_sb.h"
184#endif
185/* 202/*
186 * Macro-instructions used to manage group descriptors 203 * Macro-instructions used to manage group descriptors
187 */ 204 */
@@ -297,10 +314,23 @@ struct ext4_new_group_data {
297}; 314};
298 315
299/* 316/*
300 * Following is used by preallocation code to tell get_blocks() that we 317 * Flags used by ext4_get_blocks()
301 * want uninitialzed extents.
302 */ 318 */
303#define EXT4_CREATE_UNINITIALIZED_EXT 2 319 /* Allocate any needed blocks and/or convert an uninitialized
 320 extent into an initialized one */
321#define EXT4_GET_BLOCKS_CREATE 0x0001
 322 /* Request the creation of an uninitialized extent */
323#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
324#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
325 EXT4_GET_BLOCKS_CREATE)
326 /* Caller is from the delayed allocation writeout path,
327 so set the magic i_delalloc_reserve_flag after taking the
 328 inode allocation semaphore */
329#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
330 /* Call ext4_da_update_reserve_space() after successfully
331 allocating the blocks */
332#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
333
304 334
305/* 335/*
306 * ioctl commands 336 * ioctl commands
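/*
 * Editor's note (illustrative, not part of the patch): the old
 * create/extend_disksize/flag argument triple collapses into this one
 * bitmask.  A fallocate-style caller that wants an unwritten extent now
 * passes:
 *
 *	ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh,
 *			      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 *
 * a pure lookup passes 0, and the delalloc writeout path ORs in
 * EXT4_GET_BLOCKS_DELALLOC_RESERVE.
 */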
@@ -322,6 +352,7 @@ struct ext4_new_group_data {
322 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ 352 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
323 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 353 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
324#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 354#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
355#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
325 356
326/* 357/*
327 * ioctl commands in 32 bit emulation 358 * ioctl commands in 32 bit emulation
@@ -417,6 +448,15 @@ struct ext4_inode {
417 __le32 i_version_hi; /* high 32 bits for 64-bit version */ 448 __le32 i_version_hi; /* high 32 bits for 64-bit version */
418}; 449};
419 450
451struct move_extent {
452 __u32 reserved; /* should be zero */
453 __u32 donor_fd; /* donor file descriptor */
454 __u64 orig_start; /* logical start offset in block for orig */
455 __u64 donor_start; /* logical start offset in block for donor */
456 __u64 len; /* block length to be moved */
457 __u64 moved_len; /* moved block length */
458};
459#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
420 460
421#define EXT4_EPOCH_BITS 2 461#define EXT4_EPOCH_BITS 2
422#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) 462#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
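/*
 * Editor's userspace sketch for the new ioctl (hypothetical helper, not
 * part of the patch; assumes struct move_extent and EXT4_IOC_MOVE_EXT
 * are visible to the build, e.g. copied from this header).  An online
 * defragmenter opens the fragmented file plus a preallocated donor file
 * and asks the kernel to swap in the donor's contiguous blocks:
 */
static int defrag_example(int orig_fd, int donor_fd, __u64 blocks_to_move)
{
	struct move_extent me = {
		.donor_fd    = donor_fd,
		.orig_start  = 0,		/* logical offsets, in blocks */
		.donor_start = 0,
		.len         = blocks_to_move,
	};

	if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
		return -1;
	/* me.moved_len reports how many blocks were actually moved */
	return 0;
}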
@@ -516,6 +556,106 @@ do { \
516#endif /* defined(__KERNEL__) || defined(__linux__) */ 556#endif /* defined(__KERNEL__) || defined(__linux__) */
517 557
518/* 558/*
559 * storage for cached extent
560 */
561struct ext4_ext_cache {
562 ext4_fsblk_t ec_start;
563 ext4_lblk_t ec_block;
564 __u32 ec_len; /* must be 32bit to return holes */
565 __u32 ec_type;
566};
567
568/*
569 * fourth extended file system inode data in memory
570 */
571struct ext4_inode_info {
572 __le32 i_data[15]; /* unconverted */
573 __u32 i_flags;
574 ext4_fsblk_t i_file_acl;
575 __u32 i_dtime;
576
577 /*
578 * i_block_group is the number of the block group which contains
579 * this file's inode. Constant across the lifetime of the inode,
 580 * it is used for making block allocation decisions - we try to
581 * place a file's data blocks near its inode block, and new inodes
582 * near to their parent directory's inode.
583 */
584 ext4_group_t i_block_group;
585 __u32 i_state; /* Dynamic state flags for ext4 */
586
587 ext4_lblk_t i_dir_start_lookup;
588#ifdef CONFIG_EXT4_FS_XATTR
589 /*
590 * Extended attributes can be read independently of the main file
591 * data. Taking i_mutex even when reading would cause contention
592 * between readers of EAs and writers of regular file data, so
593 * instead we synchronize on xattr_sem when reading or changing
594 * EAs.
595 */
596 struct rw_semaphore xattr_sem;
597#endif
598
599 struct list_head i_orphan; /* unlinked but open inodes */
600
601 /*
602 * i_disksize keeps track of what the inode size is ON DISK, not
603 * in memory. During truncate, i_size is set to the new size by
604 * the VFS prior to calling ext4_truncate(), but the filesystem won't
605 * set i_disksize to 0 until the truncate is actually under way.
606 *
607 * The intent is that i_disksize always represents the blocks which
608 * are used by this file. This allows recovery to restart truncate
609 * on orphans if we crash during truncate. We actually write i_disksize
610 * into the on-disk inode when writing inodes out, instead of i_size.
611 *
612 * The only time when i_disksize and i_size may be different is when
613 * a truncate is in progress. The only things which change i_disksize
 614 * are ext4_get_block (growth) and ext4_truncate (shrinkage).
615 */
616 loff_t i_disksize;
617
618 /*
619 * i_data_sem is for serialising ext4_truncate() against
620 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
621 * data tree are chopped off during truncate. We can't do that in
622 * ext4 because whenever we perform intermediate commits during
623 * truncate, the inode and all the metadata blocks *must* be in a
624 * consistent state which allows truncation of the orphans to restart
625 * during recovery. Hence we must fix the get_block-vs-truncate race
626 * by other means, so we have i_data_sem.
627 */
628 struct rw_semaphore i_data_sem;
629 struct inode vfs_inode;
630 struct jbd2_inode jinode;
631
632 struct ext4_ext_cache i_cached_extent;
633 /*
 634 * File creation time. Its function is the same as that of
635 * struct timespec i_{a,c,m}time in the generic inode.
636 */
637 struct timespec i_crtime;
638
639 /* mballoc */
640 struct list_head i_prealloc_list;
641 spinlock_t i_prealloc_lock;
642
643 /* ialloc */
644 ext4_group_t i_last_alloc_group;
645
646 /* allocation reservation info for delalloc */
647 unsigned int i_reserved_data_blocks;
648 unsigned int i_reserved_meta_blocks;
649 unsigned int i_allocated_meta_blocks;
650 unsigned short i_delalloc_reserved_flag;
651
652 /* on-disk additional length */
653 __u16 i_extra_isize;
654
655 spinlock_t i_block_reservation_lock;
656};
657
658/*
519 * File system states 659 * File system states
520 */ 660 */
521#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ 661#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
@@ -540,7 +680,6 @@ do { \
540#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ 680#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
541#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ 681#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
542#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ 682#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
543#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */
544#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ 683#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
545#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ 684#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
546#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ 685#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -560,18 +699,12 @@ do { \
560#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 699#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
561#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 700#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
562#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 701#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
702#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
563 703
564/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
565#ifndef _LINUX_EXT2_FS_H
566#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 704#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
567#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 705#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
568#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ 706#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
569 EXT4_MOUNT_##opt) 707 EXT4_MOUNT_##opt)
570#else
571#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD
572#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT
573#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS
574#endif
575 708
576#define ext4_set_bit ext2_set_bit 709#define ext4_set_bit ext2_set_bit
577#define ext4_set_bit_atomic ext2_set_bit_atomic 710#define ext4_set_bit_atomic ext2_set_bit_atomic
@@ -689,6 +822,146 @@ struct ext4_super_block {
689}; 822};
690 823
691#ifdef __KERNEL__ 824#ifdef __KERNEL__
825
826/*
827 * run-time mount flags
828 */
829#define EXT4_MF_MNTDIR_SAMPLED 0x0001
830#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
831
832/*
833 * fourth extended-fs super-block data in memory
834 */
835struct ext4_sb_info {
836 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
837 unsigned long s_inodes_per_block;/* Number of inodes per block */
838 unsigned long s_blocks_per_group;/* Number of blocks in a group */
839 unsigned long s_inodes_per_group;/* Number of inodes in a group */
840 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
841 unsigned long s_gdb_count; /* Number of group descriptor blocks */
842 unsigned long s_desc_per_block; /* Number of group descriptors per block */
843 ext4_group_t s_groups_count; /* Number of groups in the fs */
844 unsigned long s_overhead_last; /* Last calculated overhead */
845 unsigned long s_blocks_last; /* Last seen block count */
846 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
847 struct buffer_head * s_sbh; /* Buffer containing the super block */
848 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
849 struct buffer_head **s_group_desc;
850 unsigned int s_mount_opt;
851 unsigned int s_mount_flags;
852 ext4_fsblk_t s_sb_block;
853 uid_t s_resuid;
854 gid_t s_resgid;
855 unsigned short s_mount_state;
856 unsigned short s_pad;
857 int s_addr_per_block_bits;
858 int s_desc_per_block_bits;
859 int s_inode_size;
860 int s_first_ino;
861 unsigned int s_inode_readahead_blks;
862 unsigned int s_inode_goal;
863 spinlock_t s_next_gen_lock;
864 u32 s_next_generation;
865 u32 s_hash_seed[4];
866 int s_def_hash_version;
 867 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
868 struct percpu_counter s_freeblocks_counter;
869 struct percpu_counter s_freeinodes_counter;
870 struct percpu_counter s_dirs_counter;
871 struct percpu_counter s_dirtyblocks_counter;
872 struct blockgroup_lock *s_blockgroup_lock;
873 struct proc_dir_entry *s_proc;
874 struct kobject s_kobj;
875 struct completion s_kobj_unregister;
876
877 /* Journaling */
878 struct inode *s_journal_inode;
879 struct journal_s *s_journal;
880 struct list_head s_orphan;
881 struct mutex s_orphan_lock;
882 struct mutex s_resize_lock;
883 unsigned long s_commit_interval;
884 u32 s_max_batch_time;
885 u32 s_min_batch_time;
886 struct block_device *journal_bdev;
887#ifdef CONFIG_JBD2_DEBUG
888 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
889 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
890#endif
891#ifdef CONFIG_QUOTA
892 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
893 int s_jquota_fmt; /* Format of quota to use */
894#endif
895 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
896 struct rb_root system_blks;
897
898#ifdef EXTENTS_STATS
899 /* ext4 extents stats */
900 unsigned long s_ext_min;
901 unsigned long s_ext_max;
902 unsigned long s_depth_max;
903 spinlock_t s_ext_stats_lock;
904 unsigned long s_ext_blocks;
905 unsigned long s_ext_extents;
906#endif
907
908 /* for buddy allocator */
909 struct ext4_group_info ***s_group_info;
910 struct inode *s_buddy_cache;
911 long s_blocks_reserved;
912 spinlock_t s_reserve_lock;
913 spinlock_t s_md_lock;
914 tid_t s_last_transaction;
915 unsigned short *s_mb_offsets;
916 unsigned int *s_mb_maxs;
917
918 /* tunables */
919 unsigned long s_stripe;
920 unsigned int s_mb_stream_request;
921 unsigned int s_mb_max_to_scan;
922 unsigned int s_mb_min_to_scan;
923 unsigned int s_mb_stats;
924 unsigned int s_mb_order2_reqs;
925 unsigned int s_mb_group_prealloc;
926 /* where last allocation was done - for stream allocation */
927 unsigned long s_mb_last_group;
928 unsigned long s_mb_last_start;
929
930 /* history to debug policy */
931 struct ext4_mb_history *s_mb_history;
932 int s_mb_history_cur;
933 int s_mb_history_max;
934 int s_mb_history_num;
935 spinlock_t s_mb_history_lock;
936 int s_mb_history_filter;
937
938 /* stats for buddy allocator */
939 spinlock_t s_mb_pa_lock;
940 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
941 atomic_t s_bal_success; /* we found long enough chunks */
942 atomic_t s_bal_allocated; /* in blocks */
943 atomic_t s_bal_ex_scanned; /* total extents scanned */
944 atomic_t s_bal_goals; /* goal hits */
945 atomic_t s_bal_breaks; /* too long searches */
946 atomic_t s_bal_2orders; /* 2^order hits */
947 spinlock_t s_bal_lock;
948 unsigned long s_mb_buddies_generated;
949 unsigned long long s_mb_generation_time;
950 atomic_t s_mb_lost_chunks;
951 atomic_t s_mb_preallocated;
952 atomic_t s_mb_discarded;
953
954 /* locality groups */
955 struct ext4_locality_group *s_locality_groups;
956
957 /* for write statistics */
958 unsigned long s_sectors_written_start;
959 u64 s_kbytes_written;
960
961 unsigned int s_log_groups_per_flex;
962 struct flex_groups *s_flex_groups;
963};
964
692static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 965static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
693{ 966{
694 return sb->s_fs_info; 967 return sb->s_fs_info;
@@ -704,7 +977,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
704 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 977 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
705} 978}
706 979
707
708static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) 980static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
709{ 981{
710 return ino == EXT4_ROOT_INO || 982 return ino == EXT4_ROOT_INO ||
@@ -1014,6 +1286,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1014 ext4_group_t block_group, 1286 ext4_group_t block_group,
1015 struct buffer_head ** bh); 1287 struct buffer_head ** bh);
1016extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1288extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1289struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1290 ext4_group_t block_group);
1291extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1292 struct buffer_head *bh,
1293 ext4_group_t group,
1294 struct ext4_group_desc *desc);
1295#define ext4_free_blocks_after_init(sb, group, desc) \
1296 ext4_init_block_bitmap(sb, NULL, group, desc)
1017 1297
1018/* dir.c */ 1298/* dir.c */
1019extern int ext4_check_dir_entry(const char *, struct inode *, 1299extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1032,12 +1312,18 @@ extern int ext4fs_dirhash(const char *name, int len, struct
1032 dx_hash_info *hinfo); 1312 dx_hash_info *hinfo);
1033 1313
1034/* ialloc.c */ 1314/* ialloc.c */
1035extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); 1315extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
1316 const struct qstr *qstr, __u32 goal);
1036extern void ext4_free_inode(handle_t *, struct inode *); 1317extern void ext4_free_inode(handle_t *, struct inode *);
1037extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); 1318extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1038extern unsigned long ext4_count_free_inodes(struct super_block *); 1319extern unsigned long ext4_count_free_inodes(struct super_block *);
1039extern unsigned long ext4_count_dirs(struct super_block *); 1320extern unsigned long ext4_count_dirs(struct super_block *);
1040extern void ext4_check_inodes_bitmap(struct super_block *); 1321extern void ext4_check_inodes_bitmap(struct super_block *);
1322extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
1323 struct buffer_head *bh,
1324 ext4_group_t group,
1325 struct ext4_group_desc *desc);
1326extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1041 1327
1042/* mballoc.c */ 1328/* mballoc.c */
1043extern long ext4_mb_stats; 1329extern long ext4_mb_stats;
@@ -1051,7 +1337,7 @@ extern void ext4_discard_preallocations(struct inode *);
1051extern int __init init_ext4_mballoc(void); 1337extern int __init init_ext4_mballoc(void);
1052extern void exit_ext4_mballoc(void); 1338extern void exit_ext4_mballoc(void);
1053extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1339extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1054 unsigned long, unsigned long, int, unsigned long *); 1340 ext4_fsblk_t, unsigned long, int, unsigned long *);
1055extern int ext4_mb_add_groupinfo(struct super_block *sb, 1341extern int ext4_mb_add_groupinfo(struct super_block *sb,
1056 ext4_group_t i, struct ext4_group_desc *desc); 1342 ext4_group_t i, struct ext4_group_desc *desc);
1057extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1343extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
@@ -1123,6 +1409,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1409 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1410extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1411 __attribute__ ((format (printf, 3, 4)));
1412extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1413 __attribute__ ((format (printf, 3, 4)));
1126extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1414extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1127 const char *, const char *, ...) 1415 const char *, const char *, ...)
1128 __attribute__ ((format (printf, 4, 5))); 1416 __attribute__ ((format (printf, 4, 5)));
@@ -1161,6 +1449,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
1161 struct ext4_group_desc *bg, __u32 count); 1449 struct ext4_group_desc *bg, __u32 count);
1162extern void ext4_itable_unused_set(struct super_block *sb, 1450extern void ext4_itable_unused_set(struct super_block *sb,
1163 struct ext4_group_desc *bg, __u32 count); 1451 struct ext4_group_desc *bg, __u32 count);
1452extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
1453 struct ext4_group_desc *gdp);
1454extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
1455 struct ext4_group_desc *gdp);
1164 1456
1165static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1457static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1166{ 1458{
@@ -1228,6 +1520,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1228 return grp_info[indexv][indexh]; 1520 return grp_info[indexv][indexh];
1229} 1521}
1230 1522
1523/*
1524 * Reading s_groups_count requires using smp_rmb() afterwards. See
1525 * the locking protocol documented in the comments of ext4_group_add()
1526 * in resize.c
1527 */
1528static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
1529{
1530 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
1531
1532 smp_rmb();
1533 return ngroups;
1534}
1231 1535
1232static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, 1536static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1233 ext4_group_t block_group) 1537 ext4_group_t block_group)
@@ -1283,33 +1587,25 @@ struct ext4_group_info {
1283}; 1587};
1284 1588
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1589#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287 1590
1288#define EXT4_MB_GRP_NEED_INIT(grp) \ 1591#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1592 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290 1593
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1594static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1595 ext4_group_t group)
1292{ 1596{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1597 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296} 1598}
1297 1599
1298static inline void ext4_unlock_group(struct super_block *sb, 1600static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1299 ext4_group_t group)
1300{ 1601{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1602 spin_lock(ext4_group_lock_ptr(sb, group));
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304} 1603}
1305 1604
1306static inline int ext4_is_group_locked(struct super_block *sb, 1605static inline void ext4_unlock_group(struct super_block *sb,
1307 ext4_group_t group) 1606 ext4_group_t group)
1308{ 1607{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1608 spin_unlock(ext4_group_lock_ptr(sb, group));
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313} 1609}
1314 1610
1315/* 1611/*
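/*
 * Editor's note (design observation, not part of the patch): group
 * locking moves from a bit spinlock embedded in each group's bb_state
 * to the hashed spinlock array behind sbi->s_blockgroup_lock.  Callers
 * keep the same pattern:
 *
 *	ext4_lock_group(sb, group);
 *	... modify the group's bitmaps/counters ...
 *	ext4_unlock_group(sb, group);
 *
 * while the new ext4_group_lock_ptr() exposes the underlying spinlock
 * for code that needs it directly.
 */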
@@ -1326,11 +1622,21 @@ extern const struct file_operations ext4_file_operations;
1326/* namei.c */ 1622/* namei.c */
1327extern const struct inode_operations ext4_dir_inode_operations; 1623extern const struct inode_operations ext4_dir_inode_operations;
1328extern const struct inode_operations ext4_special_inode_operations; 1624extern const struct inode_operations ext4_special_inode_operations;
1625extern struct dentry *ext4_get_parent(struct dentry *child);
1329 1626
1330/* symlink.c */ 1627/* symlink.c */
1331extern const struct inode_operations ext4_symlink_inode_operations; 1628extern const struct inode_operations ext4_symlink_inode_operations;
1332extern const struct inode_operations ext4_fast_symlink_inode_operations; 1629extern const struct inode_operations ext4_fast_symlink_inode_operations;
1333 1630
1631/* block_validity */
1632extern void ext4_release_system_zone(struct super_block *sb);
1633extern int ext4_setup_system_zone(struct super_block *sb);
1634extern int __init init_ext4_system_zone(void);
1635extern void exit_ext4_system_zone(void);
1636extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1637 ext4_fsblk_t start_blk,
1638 unsigned int count);
1639
1334/* extents.c */ 1640/* extents.c */
1335extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 1641extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1336extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1642extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
@@ -1338,19 +1644,22 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1338 int chunk); 1644 int chunk);
1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1645extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1340 ext4_lblk_t iblock, unsigned int max_blocks, 1646 ext4_lblk_t iblock, unsigned int max_blocks,
1341 struct buffer_head *bh_result, 1647 struct buffer_head *bh_result, int flags);
1342 int create, int extend_disksize);
1343extern void ext4_ext_truncate(struct inode *); 1648extern void ext4_ext_truncate(struct inode *);
1344extern void ext4_ext_init(struct super_block *); 1649extern void ext4_ext_init(struct super_block *);
1345extern void ext4_ext_release(struct super_block *); 1650extern void ext4_ext_release(struct super_block *);
1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1651extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1347 loff_t len); 1652 loff_t len);
1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1653extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1349 sector_t block, unsigned int max_blocks, 1654 sector_t block, unsigned int max_blocks,
1350 struct buffer_head *bh, int create, 1655 struct buffer_head *bh, int flags);
1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1656extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len); 1657 __u64 start, __u64 len);
1658/* move_extent.c */
1659extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1660 __u64 start_orig, __u64 start_donor,
1661 __u64 len, __u64 *moved_len);
1662
1354 1663
1355/* 1664/*
 1356 * Add new method to test whether block and inode bitmaps are properly 1665
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index f0c3ec85bd48..20a84105a10b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
221} 221}
222 222
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
224extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
225extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 226extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
226extern int ext4_extent_tree_init(handle_t *, struct inode *); 227extern int ext4_extent_tree_init(handle_t *, struct inode *);
227extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 228extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
228 int num, 229 int num,
229 struct ext4_ext_path *path); 230 struct ext4_ext_path *path);
231extern int ext4_can_extents_be_merged(struct inode *inode,
232 struct ext4_extent *ex1,
233 struct ext4_extent *ex2);
230extern int ext4_ext_try_to_merge(struct inode *inode, 234extern int ext4_ext_try_to_merge(struct inode *inode,
231 struct ext4_ext_path *path, 235 struct ext4_ext_path *path,
232 struct ext4_extent *); 236 struct ext4_extent *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187123aa..000000000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
1/*
2 * ext4_i.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_i.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _EXT4_I
17#define _EXT4_I
18
19#include <linux/rwsem.h>
20#include <linux/rbtree.h>
21#include <linux/seqlock.h>
22#include <linux/mutex.h>
23
24/* data type for block offset of block group */
25typedef int ext4_grpblk_t;
26
27/* data type for filesystem-wide blocks number */
28typedef unsigned long long ext4_fsblk_t;
29
30/* data type for file logical block number */
31typedef __u32 ext4_lblk_t;
32
33/* data type for block group number */
34typedef unsigned int ext4_group_t;
35
36/*
37 * storage for cached extent
38 */
39struct ext4_ext_cache {
40 ext4_fsblk_t ec_start;
41 ext4_lblk_t ec_block;
42 __u32 ec_len; /* must be 32bit to return holes */
43 __u32 ec_type;
44};
45
46/*
47 * fourth extended file system inode data in memory
48 */
49struct ext4_inode_info {
50 __le32 i_data[15]; /* unconverted */
51 __u32 i_flags;
52 ext4_fsblk_t i_file_acl;
53 __u32 i_dtime;
54
55 /*
56 * i_block_group is the number of the block group which contains
57 * this file's inode. Constant across the lifetime of the inode,
58 * it is ued for making block allocation decisions - we try to
59 * place a file's data blocks near its inode block, and new inodes
60 * near to their parent directory's inode.
61 */
62 ext4_group_t i_block_group;
63 __u32 i_state; /* Dynamic state flags for ext4 */
64
65 ext4_lblk_t i_dir_start_lookup;
66#ifdef CONFIG_EXT4_FS_XATTR
67 /*
68 * Extended attributes can be read independently of the main file
69 * data. Taking i_mutex even when reading would cause contention
70 * between readers of EAs and writers of regular file data, so
71 * instead we synchronize on xattr_sem when reading or changing
72 * EAs.
73 */
74 struct rw_semaphore xattr_sem;
75#endif
76#ifdef CONFIG_EXT4_FS_POSIX_ACL
77 struct posix_acl *i_acl;
78 struct posix_acl *i_default_acl;
79#endif
80
81 struct list_head i_orphan; /* unlinked but open inodes */
82
83 /*
84 * i_disksize keeps track of what the inode size is ON DISK, not
85 * in memory. During truncate, i_size is set to the new size by
86 * the VFS prior to calling ext4_truncate(), but the filesystem won't
87 * set i_disksize to 0 until the truncate is actually under way.
88 *
89 * The intent is that i_disksize always represents the blocks which
90 * are used by this file. This allows recovery to restart truncate
91 * on orphans if we crash during truncate. We actually write i_disksize
92 * into the on-disk inode when writing inodes out, instead of i_size.
93 *
94 * The only time when i_disksize and i_size may be different is when
95 * a truncate is in progress. The only things which change i_disksize
96 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
97 */
98 loff_t i_disksize;
99
100 /*
101 * i_data_sem is for serialising ext4_truncate() against
102 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
103 * data tree are chopped off during truncate. We can't do that in
104 * ext4 because whenever we perform intermediate commits during
105 * truncate, the inode and all the metadata blocks *must* be in a
106 * consistent state which allows truncation of the orphans to restart
107 * during recovery. Hence we must fix the get_block-vs-truncate race
108 * by other means, so we have i_data_sem.
109 */
110 struct rw_semaphore i_data_sem;
111 struct inode vfs_inode;
112 struct jbd2_inode jinode;
113
114 struct ext4_ext_cache i_cached_extent;
115 /*
116 * File creation time. Its function is same as that of
117 * struct timespec i_{a,c,m}time in the generic inode.
118 */
119 struct timespec i_crtime;
120
121 /* mballoc */
122 struct list_head i_prealloc_list;
123 spinlock_t i_prealloc_lock;
124
125 /* ialloc */
126 ext4_group_t i_last_alloc_group;
127
128 /* allocation reservation info for delalloc */
129 unsigned int i_reserved_data_blocks;
130 unsigned int i_reserved_meta_blocks;
131 unsigned int i_allocated_meta_blocks;
132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock;
138};
139
140#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index ad13a84644e1..eb27fd0f2ee8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
43 ext4_journal_abort_handle(where, __func__, bh, 43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err); 44 handle, err);
45 } 45 }
46 else
47 brelse(bh);
46 return err; 48 return err;
47} 49}
48 50
@@ -57,6 +59,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
57 ext4_journal_abort_handle(where, __func__, bh, 59 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err); 60 handle, err);
59 } 61 }
62 else
63 brelse(bh);
60 return err; 64 return err;
61} 65}
62 66
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index be2f426f6805..139fb8cb87e4 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -131,9 +131,11 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 131int __ext4_journal_get_write_access(const char *where, handle_t *handle,
132 struct buffer_head *bh); 132 struct buffer_head *bh);
133 133
134/* When called with an invalid handle, this will still do a put on the BH */
134int __ext4_journal_forget(const char *where, handle_t *handle, 135int __ext4_journal_forget(const char *where, handle_t *handle,
135 struct buffer_head *bh); 136 struct buffer_head *bh);
136 137
138/* When called with an invalid handle, this will still do a put on the BH */
137int __ext4_journal_revoke(const char *where, handle_t *handle, 139int __ext4_journal_revoke(const char *where, handle_t *handle,
138 ext4_fsblk_t blocknr, struct buffer_head *bh); 140 ext4_fsblk_t blocknr, struct buffer_head *bh);
139 141
@@ -281,10 +283,10 @@ static inline int ext4_should_order_data(struct inode *inode)
281 283
282static inline int ext4_should_writeback_data(struct inode *inode) 284static inline int ext4_should_writeback_data(struct inode *inode)
283{ 285{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
286 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
287 return 0; 287 return 0;
288 if (EXT4_JOURNAL(inode) == NULL)
289 return 1;
288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 290 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
289 return 0; 291 return 0;
290 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 292 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
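/*
 * Editor's note (illustrative, not part of the patch): reordering the
 * tests above changes the journal-less case.  For a regular file:
 *
 *	before:	EXT4_JOURNAL(inode) == NULL -> ext4_should_writeback_data() == 0
 *	after:	EXT4_JOURNAL(inode) == NULL -> ext4_should_writeback_data() == 1
 *
 * so a filesystem running without a journal now behaves as if it were
 * in writeback data mode rather than matching no data mode at all.
 */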
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 57b71fefbccf..000000000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,161 +0,0 @@
1/*
2 * ext4_sb.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_sb.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _EXT4_SB
17#define _EXT4_SB
18
19#ifdef __KERNEL__
20#include <linux/timer.h>
21#include <linux/wait.h>
22#include <linux/blockgroup_lock.h>
23#include <linux/percpu_counter.h>
24#endif
25#include <linux/rbtree.h>
26
27/*
28 * fourth extended-fs super-block data in memory
29 */
30struct ext4_sb_info {
31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
32 unsigned long s_inodes_per_block;/* Number of inodes per block */
33 unsigned long s_blocks_per_group;/* Number of blocks in a group */
34 unsigned long s_inodes_per_group;/* Number of inodes in a group */
35 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
36 unsigned long s_gdb_count; /* Number of group descriptor blocks */
37 unsigned long s_desc_per_block; /* Number of group descriptors per block */
38 ext4_group_t s_groups_count; /* Number of groups in the fs */
39 unsigned long s_overhead_last; /* Last calculated overhead */
40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid;
48 gid_t s_resgid;
49 unsigned short s_mount_state;
50 unsigned short s_pad;
51 int s_addr_per_block_bits;
52 int s_desc_per_block_bits;
53 int s_inode_size;
54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
56 spinlock_t s_next_gen_lock;
57 u32 s_next_generation;
58 u32 s_hash_seed[4];
59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
61 struct percpu_counter s_freeblocks_counter;
62 struct percpu_counter s_freeinodes_counter;
63 struct percpu_counter s_dirs_counter;
64 struct percpu_counter s_dirtyblocks_counter;
65 struct blockgroup_lock *s_blockgroup_lock;
66 struct proc_dir_entry *s_proc;
67 struct kobject s_kobj;
68 struct completion s_kobj_unregister;
69
70 /* Journaling */
71 struct inode *s_journal_inode;
72 struct journal_s *s_journal;
73 struct list_head s_orphan;
74 unsigned long s_commit_interval;
75 u32 s_max_batch_time;
76 u32 s_min_batch_time;
77 struct block_device *journal_bdev;
78#ifdef CONFIG_JBD2_DEBUG
79 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
80 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
81#endif
82#ifdef CONFIG_QUOTA
83 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
84 int s_jquota_fmt; /* Format of quota to use */
85#endif
86 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
87
88#ifdef EXTENTS_STATS
89 /* ext4 extents stats */
90 unsigned long s_ext_min;
91 unsigned long s_ext_max;
92 unsigned long s_depth_max;
93 spinlock_t s_ext_stats_lock;
94 unsigned long s_ext_blocks;
95 unsigned long s_ext_extents;
96#endif
97
98 /* for buddy allocator */
99 struct ext4_group_info ***s_group_info;
100 struct inode *s_buddy_cache;
101 long s_blocks_reserved;
102 spinlock_t s_reserve_lock;
103 spinlock_t s_md_lock;
104 tid_t s_last_transaction;
105 unsigned short *s_mb_offsets;
106 unsigned int *s_mb_maxs;
107
108 /* tunables */
109 unsigned long s_stripe;
110 unsigned int s_mb_stream_request;
111 unsigned int s_mb_max_to_scan;
112 unsigned int s_mb_min_to_scan;
113 unsigned int s_mb_stats;
114 unsigned int s_mb_order2_reqs;
115 unsigned int s_mb_group_prealloc;
116 /* where last allocation was done - for stream allocation */
117 unsigned long s_mb_last_group;
118 unsigned long s_mb_last_start;
119
120 /* history to debug policy */
121 struct ext4_mb_history *s_mb_history;
122 int s_mb_history_cur;
123 int s_mb_history_max;
124 int s_mb_history_num;
125 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter;
127
128 /* stats for buddy allocator */
129 spinlock_t s_mb_pa_lock;
130 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
131 atomic_t s_bal_success; /* we found long enough chunks */
132 atomic_t s_bal_allocated; /* in blocks */
133 atomic_t s_bal_ex_scanned; /* total extents scanned */
134 atomic_t s_bal_goals; /* goal hits */
135 atomic_t s_bal_breaks; /* too long searches */
136 atomic_t s_bal_2orders; /* 2^order hits */
137 spinlock_t s_bal_lock;
138 unsigned long s_mb_buddies_generated;
139 unsigned long long s_mb_generation_time;
140 atomic_t s_mb_lost_chunks;
141 atomic_t s_mb_preallocated;
142 atomic_t s_mb_discarded;
143
144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups;
146
147 /* for write statistics */
148 unsigned long s_sectors_written_start;
149 u64 s_kbytes_written;
150
151 unsigned int s_log_groups_per_flex;
152 struct flex_groups *s_flex_groups;
153};
154
155static inline spinlock_t *
156sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
157{
158 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
159}
160
161#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb8b26a..73ebfb44ad75 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -49,7 +49,7 @@
49 * ext_pblock: 49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t 50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */ 51 */
52static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) 52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{ 53{
54 ext4_fsblk_t block; 54 ext4_fsblk_t block;
55 55
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
326 326
327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{ 328{
329 ext4_fsblk_t block = ext_pblock(ext), valid_block; 329 ext4_fsblk_t block = ext_pblock(ext);
330 int len = ext4_ext_get_actual_len(ext); 330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 331
333 valid_block = le32_to_cpu(es->s_first_data_block) + 332 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
334 EXT4_SB(inode->i_sb)->s_gdb_count;
335 if (unlikely(block <= valid_block ||
336 ((block + len) > ext4_blocks_count(es))))
337 return 0;
338 else
339 return 1;
340} 333}
341 334
342static int ext4_valid_extent_idx(struct inode *inode, 335static int ext4_valid_extent_idx(struct inode *inode,
343 struct ext4_extent_idx *ext_idx) 336 struct ext4_extent_idx *ext_idx)
344{ 337{
345 ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; 338 ext4_fsblk_t block = idx_pblock(ext_idx);
346 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
347 339
348 valid_block = le32_to_cpu(es->s_first_data_block) + 340 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
349 EXT4_SB(inode->i_sb)->s_gdb_count;
350 if (unlikely(block <= valid_block ||
351 (block >= ext4_blocks_count(es))))
352 return 0;
353 else
354 return 1;
355} 341}
356 342
357static int ext4_valid_extent_entries(struct inode *inode, 343static int ext4_valid_extent_entries(struct inode *inode,
@@ -1431,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1431 return err; 1417 return err;
1432} 1418}
1433 1419
1434static int 1420int
1435ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1421ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1436 struct ext4_extent *ex2) 1422 struct ext4_extent *ex2)
1437{ 1423{
@@ -1991,6 +1977,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1991 */ 1977 */
1992 /* 1 bitmap, 1 block group descriptor */ 1978 /* 1 bitmap, 1 block group descriptor */
1993 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 1979 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1980 return ret;
1994 } 1981 }
1995 } 1982 }
1996 1983
@@ -2097,12 +2084,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2097 ex = EXT_LAST_EXTENT(eh); 2084 ex = EXT_LAST_EXTENT(eh);
2098 2085
2099 ex_ee_block = le32_to_cpu(ex->ee_block); 2086 ex_ee_block = le32_to_cpu(ex->ee_block);
2100 if (ext4_ext_is_uninitialized(ex))
2101 uninitialized = 1;
2102 ex_ee_len = ext4_ext_get_actual_len(ex); 2087 ex_ee_len = ext4_ext_get_actual_len(ex);
2103 2088
2104 while (ex >= EXT_FIRST_EXTENT(eh) && 2089 while (ex >= EXT_FIRST_EXTENT(eh) &&
2105 ex_ee_block + ex_ee_len > start) { 2090 ex_ee_block + ex_ee_len > start) {
2091
2092 if (ext4_ext_is_uninitialized(ex))
2093 uninitialized = 1;
2094 else
2095 uninitialized = 0;
2096
2106 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2097 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
2107 path[depth].p_ext = ex; 2098 path[depth].p_ext = ex;
2108 2099
@@ -2784,7 +2775,7 @@ fix_extent_len:
2784int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2775int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2785 ext4_lblk_t iblock, 2776 ext4_lblk_t iblock,
2786 unsigned int max_blocks, struct buffer_head *bh_result, 2777 unsigned int max_blocks, struct buffer_head *bh_result,
2787 int create, int extend_disksize) 2778 int flags)
2788{ 2779{
2789 struct ext4_ext_path *path = NULL; 2780 struct ext4_ext_path *path = NULL;
2790 struct ext4_extent_header *eh; 2781 struct ext4_extent_header *eh;
@@ -2793,7 +2784,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2793 int err = 0, depth, ret, cache_type; 2784 int err = 0, depth, ret, cache_type;
2794 unsigned int allocated = 0; 2785 unsigned int allocated = 0;
2795 struct ext4_allocation_request ar; 2786 struct ext4_allocation_request ar;
2796 loff_t disksize;
2797 2787
2798 __clear_bit(BH_New, &bh_result->b_state); 2788 __clear_bit(BH_New, &bh_result->b_state);
2799 ext_debug("blocks %u/%u requested for inode %u\n", 2789 ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2803,7 +2793,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2803 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 2793 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2804 if (cache_type) { 2794 if (cache_type) {
2805 if (cache_type == EXT4_EXT_CACHE_GAP) { 2795 if (cache_type == EXT4_EXT_CACHE_GAP) {
2806 if (!create) { 2796 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2807 /* 2797 /*
2808 * block isn't allocated yet and 2798 * block isn't allocated yet and
2809 * user doesn't want to allocate it 2799 * user doesn't want to allocate it
@@ -2869,9 +2859,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2869 EXT4_EXT_CACHE_EXTENT); 2859 EXT4_EXT_CACHE_EXTENT);
2870 goto out; 2860 goto out;
2871 } 2861 }
2872 if (create == EXT4_CREATE_UNINITIALIZED_EXT) 2862 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
2873 goto out; 2863 goto out;
2874 if (!create) { 2864 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2865 if (allocated > max_blocks)
2866 allocated = max_blocks;
2875 /* 2867 /*
2876 * We have blocks reserved already. We 2868 * We have blocks reserved already. We
2877 * return allocated blocks so that delalloc 2869 * return allocated blocks so that delalloc
@@ -2879,8 +2871,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2879 * the buffer head will be unmapped so that 2871 * the buffer head will be unmapped so that
2880 * a read from the block returns 0s. 2872 * a read from the block returns 0s.
2881 */ 2873 */
2882 if (allocated > max_blocks)
2883 allocated = max_blocks;
2884 set_buffer_unwritten(bh_result); 2874 set_buffer_unwritten(bh_result);
2885 bh_result->b_bdev = inode->i_sb->s_bdev; 2875 bh_result->b_bdev = inode->i_sb->s_bdev;
2886 bh_result->b_blocknr = newblock; 2876 bh_result->b_blocknr = newblock;
@@ -2903,7 +2893,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2903 * requested block isn't allocated yet; 2893 * requested block isn't allocated yet;
2904 * we couldn't try to create block if create flag is zero 2894 * we couldn't try to create block if create flag is zero
2905 */ 2895 */
2906 if (!create) { 2896 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2907 /* 2897 /*
2908 * put just found gap into cache to speed up 2898 * put just found gap into cache to speed up
2909 * subsequent requests 2899 * subsequent requests
@@ -2932,10 +2922,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2932 * EXT_UNINIT_MAX_LEN. 2922 * EXT_UNINIT_MAX_LEN.
2933 */ 2923 */
2934 if (max_blocks > EXT_INIT_MAX_LEN && 2924 if (max_blocks > EXT_INIT_MAX_LEN &&
2935 create != EXT4_CREATE_UNINITIALIZED_EXT) 2925 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
2936 max_blocks = EXT_INIT_MAX_LEN; 2926 max_blocks = EXT_INIT_MAX_LEN;
2937 else if (max_blocks > EXT_UNINIT_MAX_LEN && 2927 else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2938 create == EXT4_CREATE_UNINITIALIZED_EXT) 2928 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
2939 max_blocks = EXT_UNINIT_MAX_LEN; 2929 max_blocks = EXT_UNINIT_MAX_LEN;
2940 2930
2941 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 2931 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2956,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2966 /* try to insert new extent into found leaf and return */ 2956 /* try to insert new extent into found leaf and return */
2967 ext4_ext_store_pblock(&newex, newblock); 2957 ext4_ext_store_pblock(&newex, newblock);
2968 newex.ee_len = cpu_to_le16(ar.len); 2958 newex.ee_len = cpu_to_le16(ar.len);
2969 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2959 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
2970 ext4_ext_mark_uninitialized(&newex); 2960 ext4_ext_mark_uninitialized(&newex);
2971 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2961 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2972 if (err) { 2962 if (err) {
@@ -2983,18 +2973,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2983 newblock = ext_pblock(&newex); 2973 newblock = ext_pblock(&newex);
2984 allocated = ext4_ext_get_actual_len(&newex); 2974 allocated = ext4_ext_get_actual_len(&newex);
2985outnew: 2975outnew:
2986 if (extend_disksize) {
2987 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2988 if (disksize > i_size_read(inode))
2989 disksize = i_size_read(inode);
2990 if (disksize > EXT4_I(inode)->i_disksize)
2991 EXT4_I(inode)->i_disksize = disksize;
2992 }
2993
2994 set_buffer_new(bh_result); 2976 set_buffer_new(bh_result);
2995 2977
2996 /* Cache only when it is _not_ an uninitialized extent */ 2978 /* Cache only when it is _not_ an uninitialized extent */
2997 if (create != EXT4_CREATE_UNINITIALIZED_EXT) 2979 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
2998 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 2980 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2999 EXT4_EXT_CACHE_EXTENT); 2981 EXT4_EXT_CACHE_EXTENT);
3000out: 2982out:
@@ -3150,9 +3132,10 @@ retry:
3150 ret = PTR_ERR(handle); 3132 ret = PTR_ERR(handle);
3151 break; 3133 break;
3152 } 3134 }
3153 ret = ext4_get_blocks_wrap(handle, inode, block, 3135 map_bh.b_state = 0;
3154 max_blocks, &map_bh, 3136 ret = ext4_get_blocks(handle, inode, block,
3155 EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); 3137 max_blocks, &map_bh,
3138 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3156 if (ret <= 0) { 3139 if (ret <= 0) {
3157#ifdef EXT4FS_DEBUG 3140#ifdef EXT4FS_DEBUG
3158 WARN_ON(ret <= 0); 3141 WARN_ON(ret <= 0);
@@ -3195,7 +3178,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3195 void *data) 3178 void *data)
3196{ 3179{
3197 struct fiemap_extent_info *fieinfo = data; 3180 struct fiemap_extent_info *fieinfo = data;
3198 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; 3181 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3199 __u64 logical; 3182 __u64 logical;
3200 __u64 physical; 3183 __u64 physical;
3201 __u64 length; 3184 __u64 length;
@@ -3242,9 +3225,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3242 * 3225 *
3243 * XXX this might miss a single-block extent at EXT_MAX_BLOCK 3226 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3244 */ 3227 */
3245 if (logical + length - 1 == EXT_MAX_BLOCK || 3228 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3246 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) 3229 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3230 loff_t size = i_size_read(inode);
3231 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3232
3247 flags |= FIEMAP_EXTENT_LAST; 3233 flags |= FIEMAP_EXTENT_LAST;
3234 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3235 logical+length > size)
3236 length = (size - logical + bs - 1) & ~(bs-1);
3237 }
3248 3238
3249 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3239 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3250 length, flags); 3240 length, flags);
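The new branch clamps a delayed-allocation extent that runs past EOF, so fiemap reports i_size rounded up to a whole block rather than the raw in-memory length. A worked example of the rounding expression (numbers invented):

	/* size = 10000 bytes, logical = 8192, bs = 4096:
	 *   size - logical + bs - 1 = 5903
	 *   5903 & ~(bs - 1)        = 4096
	 * so exactly one block is reported past 'logical' */
	length = (size - logical + bs - 1) & ~(bs - 1);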
@@ -3318,10 +3308,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3318 * Walk the extent tree gathering extent information. 3308 * Walk the extent tree gathering extent information.
3319 * ext4_ext_fiemap_cb will push extents back to user. 3309 * ext4_ext_fiemap_cb will push extents back to user.
3320 */ 3310 */
3321 down_write(&EXT4_I(inode)->i_data_sem); 3311 down_read(&EXT4_I(inode)->i_data_sem);
3322 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3312 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3323 ext4_ext_fiemap_cb, fieinfo); 3313 ext4_ext_fiemap_cb, fieinfo);
3324 up_write(&EXT4_I(inode)->i_data_sem); 3314 up_read(&EXT4_I(inode)->i_data_sem);
3325 } 3315 }
3326 3316
3327 return error; 3317 return error;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 588af8c77246..3f1873fef1c6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -21,6 +21,8 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd2.h> 23#include <linux/jbd2.h>
24#include <linux/mount.h>
25#include <linux/path.h>
24#include "ext4.h" 26#include "ext4.h"
25#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
26#include "xattr.h" 28#include "xattr.h"
@@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
145 return 0; 147 return 0;
146} 148}
147 149
150static int ext4_file_open(struct inode * inode, struct file * filp)
151{
152 struct super_block *sb = inode->i_sb;
153 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
154 struct vfsmount *mnt = filp->f_path.mnt;
155 struct path path;
156 char buf[64], *cp;
157
158 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
159 !(sb->s_flags & MS_RDONLY))) {
160 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
161 /*
162 * Sample where the filesystem has been mounted and
163 * store it in the superblock for sysadmin convenience
164 * when trying to sort through large numbers of block
165 * devices or filesystem images.
166 */
167 memset(buf, 0, sizeof(buf));
168 path.mnt = mnt->mnt_parent;
169 path.dentry = mnt->mnt_mountpoint;
170 path_get(&path);
171 cp = d_path(&path, buf, sizeof(buf));
172 path_put(&path);
173 if (!IS_ERR(cp)) {
174 memcpy(sbi->s_es->s_last_mounted, cp,
175 sizeof(sbi->s_es->s_last_mounted));
176 sb->s_dirt = 1;
177 }
178 }
179 return generic_file_open(inode, filp);
180}
181
148const struct file_operations ext4_file_operations = { 182const struct file_operations ext4_file_operations = {
149 .llseek = generic_file_llseek, 183 .llseek = generic_file_llseek,
150 .read = do_sync_read, 184 .read = do_sync_read,
@@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = {
156 .compat_ioctl = ext4_compat_ioctl, 190 .compat_ioctl = ext4_compat_ioctl,
157#endif 191#endif
158 .mmap = ext4_file_mmap, 192 .mmap = ext4_file_mmap,
159 .open = generic_file_open, 193 .open = ext4_file_open,
160 .release = ext4_release_file, 194 .release = ext4_release_file,
161 .fsync = ext4_sync_file, 195 .fsync = ext4_sync_file,
162 .splice_read = generic_file_splice_read, 196 .splice_read = generic_file_splice_read,
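The IS_ERR() check in the new ext4_file_open() is load-bearing because of how d_path() behaves: it assembles the path backwards from the end of the caller's buffer and returns a pointer into that buffer, or an ERR_PTR when the name does not fit. A minimal sketch of the contract (dst is illustrative, not from the patch):

	char buf[64], *cp;

	cp = d_path(&path, buf, sizeof(buf));	/* fills from the END of buf */
	if (IS_ERR(cp))
		return;				/* e.g. ERR_PTR(-ENAMETOOLONG) */
	/* cp points somewhere inside buf; buf itself is not the string start */
	memcpy(dst, cp, strlen(cp) + 1);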
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h> 31
32#include "ext4.h" 32#include "ext4.h"
33#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
34 34
35#include <trace/events/ext4.h>
36
35/* 37/*
36 * akpm: A new design for ext4_sync_file(). 38 * akpm: A new design for ext4_sync_file().
37 * 39 *
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
52 54
53 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
54 56
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", 57 trace_ext4_sync_file(file, dentry, datasync);
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58 58
59 /* 59 /*
60 * data=writeback: 60 * data=writeback:
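This is one instance of a conversion that repeats through the rest of the diff: free-form trace_mark() format strings become static tracepoints declared once in include/trace/events/ext4.h. For reference, a condensed sketch of what such a declaration looks like (the field layout below is illustrative, not copied from the header):

	TRACE_EVENT(ext4_sync_file,
		TP_PROTO(struct file *file, struct dentry *dentry, int datasync),
		TP_ARGS(file, dentry, datasync),
		TP_STRUCT__entry(
			__field(dev_t,	dev)
			__field(ino_t,	ino)
			__field(int,	datasync)
		),
		TP_fast_assign(
			__entry->dev	  = dentry->d_inode->i_sb->s_dev;
			__entry->ino	  = dentry->d_inode->i_ino;
			__entry->datasync = datasync;
		),
		TP_printk("dev %d,%d ino %lu datasync %d",
			  MAJOR(__entry->dev), MINOR(__entry->dev),
			  (unsigned long) __entry->ino, __entry->datasync)
	);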
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d06d0e..000000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * linux/fs/ext4/group.h
3 *
4 * Copyright (C) 2007 Cluster File Systems, Inc
5 *
6 * Author: Andreas Dilger <adilger@clusterfs.com>
7 */
8
9#ifndef _LINUX_EXT4_GROUP_H
10#define _LINUX_EXT4_GROUP_H
11
12extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
13 struct ext4_group_desc *gdp);
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp);
16struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh,
20 ext4_group_t group,
21 struct ext4_group_desc *desc);
22#define ext4_free_blocks_after_init(sb, group, desc) \
23 ext4_init_block_bitmap(sb, NULL, group, desc)
24extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
25 struct buffer_head *bh,
26 ext4_group_t group,
27 struct ext4_group_desc *desc);
28extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
29#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a08a6b5..29e6dc7299b8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,13 @@
23#include <linux/bitops.h> 23#include <linux/bitops.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <asm/byteorder.h> 25#include <asm/byteorder.h>
26
26#include "ext4.h" 27#include "ext4.h"
27#include "ext4_jbd2.h" 28#include "ext4_jbd2.h"
28#include "xattr.h" 29#include "xattr.h"
29#include "acl.h" 30#include "acl.h"
30#include "group.h" 31
32#include <trace/events/ext4.h>
31 33
32/* 34/*
33 * ialloc.c contains the inodes allocation and deallocation routines 35 * ialloc.c contains the inodes allocation and deallocation routines
@@ -123,16 +125,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 128 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 129 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 130 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh); 131 set_bitmap_uptodate(bh);
130 set_buffer_uptodate(bh); 132 set_buffer_uptodate(bh);
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 133 ext4_unlock_group(sb, block_group);
132 unlock_buffer(bh); 134 unlock_buffer(bh);
133 return bh; 135 return bh;
134 } 136 }
135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 137 ext4_unlock_group(sb, block_group);
136 if (buffer_uptodate(bh)) { 138 if (buffer_uptodate(bh)) {
137 /* 139 /*
138 * if not uninit, and bh is uptodate, 140
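sb_bgl_lock() callers throughout ialloc.c (and below) move to ext4_lock_group()/ext4_unlock_group(). The wrappers are roughly the following (a sketch of the helpers this series adds to ext4.h; the underlying per-group spinlocks are unchanged):

	static inline spinlock_t *
	ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group)
	{
		return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
	}

	static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
	{
		spin_lock(ext4_group_lock_ptr(sb, group));
	}

	static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group)
	{
		spin_unlock(ext4_group_lock_ptr(sb, group));
	}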
@@ -209,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
209 211
210 ino = inode->i_ino; 212 ino = inode->i_ino;
211 ext4_debug("freeing inode %lu\n", ino); 213 ext4_debug("freeing inode %lu\n", ino);
212 trace_mark(ext4_free_inode, 214 trace_ext4_free_inode(inode);
213 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
214 sb->s_id, inode->i_ino, inode->i_mode,
215 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
216 (unsigned long long) inode->i_blocks);
217 215
218 /* 216 /*
219 * Note: we must free any quota before locking the superblock, 217 * Note: we must free any quota before locking the superblock,
@@ -247,9 +245,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
247 goto error_return; 245 goto error_return;
248 246
249 /* Ok, now we can actually update the inode bitmaps.. */ 247 /* Ok, now we can actually update the inode bitmaps.. */
250 spin_lock(sb_bgl_lock(sbi, block_group)); 248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
251 cleared = ext4_clear_bit(bit, bitmap_bh->b_data); 249 bit, bitmap_bh->b_data);
252 spin_unlock(sb_bgl_lock(sbi, block_group));
253 if (!cleared) 250 if (!cleared)
254 ext4_error(sb, "ext4_free_inode", 251 ext4_error(sb, "ext4_free_inode",
255 "bit already cleared for inode %lu", ino); 252 "bit already cleared for inode %lu", ino);
@@ -261,7 +258,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
261 if (fatal) goto error_return; 258 if (fatal) goto error_return;
262 259
263 if (gdp) { 260 if (gdp) {
264 spin_lock(sb_bgl_lock(sbi, block_group)); 261 ext4_lock_group(sb, block_group);
265 count = ext4_free_inodes_count(sb, gdp) + 1; 262 count = ext4_free_inodes_count(sb, gdp) + 1;
266 ext4_free_inodes_set(sb, gdp, count); 263 ext4_free_inodes_set(sb, gdp, count);
267 if (is_directory) { 264 if (is_directory) {
@@ -277,7 +274,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 } 274 }
278 gdp->bg_checksum = ext4_group_desc_csum(sbi, 275 gdp->bg_checksum = ext4_group_desc_csum(sbi,
279 block_group, gdp); 276 block_group, gdp);
280 spin_unlock(sb_bgl_lock(sbi, block_group)); 277 ext4_unlock_group(sb, block_group);
281 percpu_counter_inc(&sbi->s_freeinodes_counter); 278 percpu_counter_inc(&sbi->s_freeinodes_counter);
282 if (is_directory) 279 if (is_directory)
283 percpu_counter_dec(&sbi->s_dirs_counter); 280 percpu_counter_dec(&sbi->s_dirs_counter);
@@ -316,7 +313,7 @@ error_return:
316static int find_group_dir(struct super_block *sb, struct inode *parent, 313static int find_group_dir(struct super_block *sb, struct inode *parent,
317 ext4_group_t *best_group) 314 ext4_group_t *best_group)
318{ 315{
319 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 316 ext4_group_t ngroups = ext4_get_groups_count(sb);
320 unsigned int freei, avefreei; 317 unsigned int freei, avefreei;
321 struct ext4_group_desc *desc, *best_desc = NULL; 318 struct ext4_group_desc *desc, *best_desc = NULL;
322 ext4_group_t group; 319 ext4_group_t group;
@@ -349,11 +346,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
349{ 346{
350 struct ext4_sb_info *sbi = EXT4_SB(sb); 347 struct ext4_sb_info *sbi = EXT4_SB(sb);
351 struct ext4_group_desc *desc; 348 struct ext4_group_desc *desc;
352 struct buffer_head *bh;
353 struct flex_groups *flex_group = sbi->s_flex_groups; 349 struct flex_groups *flex_group = sbi->s_flex_groups;
354 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 350 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
355 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); 351 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
356 ext4_group_t ngroups = sbi->s_groups_count; 352 ext4_group_t ngroups = ext4_get_groups_count(sb);
357 int flex_size = ext4_flex_bg_size(sbi); 353 int flex_size = ext4_flex_bg_size(sbi);
358 ext4_group_t best_flex = parent_fbg_group; 354 ext4_group_t best_flex = parent_fbg_group;
359 int blocks_per_flex = sbi->s_blocks_per_group * flex_size; 355 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +358,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
362 ext4_group_t n_fbg_groups; 358 ext4_group_t n_fbg_groups;
363 ext4_group_t i; 359 ext4_group_t i;
364 360
365 n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> 361 n_fbg_groups = (ngroups + flex_size - 1) >>
366 sbi->s_log_groups_per_flex; 362 sbi->s_log_groups_per_flex;
367 363
368find_close_to_parent: 364find_close_to_parent:
@@ -404,7 +400,7 @@ find_close_to_parent:
404found_flexbg: 400found_flexbg:
405 for (i = best_flex * flex_size; i < ngroups && 401 for (i = best_flex * flex_size; i < ngroups &&
406 i < (best_flex + 1) * flex_size; i++) { 402 i < (best_flex + 1) * flex_size; i++) {
407 desc = ext4_get_group_desc(sb, i, &bh); 403 desc = ext4_get_group_desc(sb, i, NULL);
408 if (ext4_free_inodes_count(sb, desc)) { 404 if (ext4_free_inodes_count(sb, desc)) {
409 *best_group = i; 405 *best_group = i;
410 goto out; 406 goto out;
@@ -474,24 +470,27 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
474 */ 470 */
475 471
476static int find_group_orlov(struct super_block *sb, struct inode *parent, 472static int find_group_orlov(struct super_block *sb, struct inode *parent,
477 ext4_group_t *group, int mode) 473 ext4_group_t *group, int mode,
474 const struct qstr *qstr)
478{ 475{
479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 476 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
480 struct ext4_sb_info *sbi = EXT4_SB(sb); 477 struct ext4_sb_info *sbi = EXT4_SB(sb);
481 ext4_group_t ngroups = sbi->s_groups_count; 478 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 479 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
483 unsigned int freei, avefreei; 480 unsigned int freei, avefreei;
484 ext4_fsblk_t freeb, avefreeb; 481 ext4_fsblk_t freeb, avefreeb;
485 unsigned int ndirs; 482 unsigned int ndirs;
486 int max_dirs, min_inodes; 483 int max_dirs, min_inodes;
487 ext4_grpblk_t min_blocks; 484 ext4_grpblk_t min_blocks;
488 ext4_group_t i, grp, g; 485 ext4_group_t i, grp, g, ngroups;
489 struct ext4_group_desc *desc; 486 struct ext4_group_desc *desc;
490 struct orlov_stats stats; 487 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi); 488 int flex_size = ext4_flex_bg_size(sbi);
489 struct dx_hash_info hinfo;
492 490
491 ngroups = real_ngroups;
493 if (flex_size > 1) { 492 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >> 493 ngroups = (real_ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex; 494 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex; 495 parent_group >>= sbi->s_log_groups_per_flex;
497 } 496 }
@@ -509,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
509 int best_ndir = inodes_per_group; 508 int best_ndir = inodes_per_group;
510 int ret = -1; 509 int ret = -1;
511 510
512 get_random_bytes(&grp, sizeof(grp)); 511 if (qstr) {
512 hinfo.hash_version = DX_HASH_HALF_MD4;
513 hinfo.seed = sbi->s_hash_seed;
514 ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
515 grp = hinfo.hash;
516 } else
517 get_random_bytes(&grp, sizeof(grp));
513 parent_group = (unsigned)grp % ngroups; 518 parent_group = (unsigned)grp % ngroups;
514 for (i = 0; i < ngroups; i++) { 519 for (i = 0; i < ngroups; i++) {
515 g = (parent_group + i) % ngroups; 520 g = (parent_group + i) % ngroups;
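Hashing the new name makes Orlov's starting group deterministic: creating the same directory name always begins the search in the same group, where get_random_bytes() scattered it before. The kernel uses half-MD4 through ext4fs_dirhash(); any stable hash gives the same property, e.g. (standalone illustration, djb2 standing in for half-MD4):

	/* illustration only, not the kernel's hash */
	static unsigned pick_start_group(const char *name, size_t len,
					 unsigned ngroups)
	{
		unsigned h = 5381;			/* djb2 seed */
		while (len--)
			h = h * 33 + (unsigned char)*name++;
		return h % ngroups;	/* equal names, equal start group */
	}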
@@ -543,7 +548,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
543 */ 548 */
544 grp *= flex_size; 549 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) { 550 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count) 551 if (grp+i >= real_ngroups)
547 break; 552 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL); 553 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) { 554 if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +588,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
583 } 588 }
584 589
585fallback: 590fallback:
586 ngroups = sbi->s_groups_count; 591 ngroups = real_ngroups;
587 avefreei = freei / ngroups; 592 avefreei = freei / ngroups;
588fallback_retry: 593fallback_retry:
589 parent_group = EXT4_I(parent)->i_block_group; 594 parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +618,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
613 ext4_group_t *group, int mode) 618 ext4_group_t *group, int mode)
614{ 619{
615 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 620 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
616 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 621 ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
617 struct ext4_group_desc *desc; 622 struct ext4_group_desc *desc;
618 ext4_group_t i, last;
619 int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); 623 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
620 624
621 /* 625 /*
@@ -653,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
653 *group = parent_group + flex_size; 657 *group = parent_group + flex_size;
654 if (*group > ngroups) 658 if (*group > ngroups)
655 *group = 0; 659 *group = 0;
656 return find_group_orlov(sb, parent, group, mode); 660 return find_group_orlov(sb, parent, group, mode, 0);
657 } 661 }
658 662
659 /* 663 /*
@@ -708,10 +712,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
708 712
709/* 713/*
710 * claim the inode from the inode bitmap. If the group 714 * claim the inode from the inode bitmap. If the group
711 * is uninit we need to take the groups's sb_bgl_lock 715 * is uninit, we need to take the group's ext4_group_lock
712 * and clear the uninit flag. The inode bitmap update 716 * and clear the uninit flag. The inode bitmap update
713 * and group desc uninit flag clear should be done 717 * and group desc uninit flag clear should be done
714 * after holding sb_bgl_lock so that ext4_read_inode_bitmap 718 * after holding ext4_group_lock so that ext4_read_inode_bitmap
715 * doesn't race with the ext4_claim_inode 719 * doesn't race with the ext4_claim_inode
716 */ 720 */
717static int ext4_claim_inode(struct super_block *sb, 721static int ext4_claim_inode(struct super_block *sb,
@@ -722,7 +726,7 @@ static int ext4_claim_inode(struct super_block *sb,
722 struct ext4_sb_info *sbi = EXT4_SB(sb); 726 struct ext4_sb_info *sbi = EXT4_SB(sb);
723 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 727 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
724 728
725 spin_lock(sb_bgl_lock(sbi, group)); 729 ext4_lock_group(sb, group);
726 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 730 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
727 /* not a free inode */ 731 /* not a free inode */
728 retval = 1; 732 retval = 1;
@@ -731,7 +735,7 @@ static int ext4_claim_inode(struct super_block *sb,
731 ino++; 735 ino++;
732 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
733 ino > EXT4_INODES_PER_GROUP(sb)) { 737 ino > EXT4_INODES_PER_GROUP(sb)) {
734 spin_unlock(sb_bgl_lock(sbi, group)); 738 ext4_unlock_group(sb, group);
735 ext4_error(sb, __func__, 739 ext4_error(sb, __func__,
736 "reserved inode or inode > inodes count - " 740 "reserved inode or inode > inodes count - "
737 "block_group = %u, inode=%lu", group, 741 "block_group = %u, inode=%lu", group,
@@ -780,7 +784,7 @@ static int ext4_claim_inode(struct super_block *sb,
780 } 784 }
781 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 785 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
782err_ret: 786err_ret:
783 spin_unlock(sb_bgl_lock(sbi, group)); 787 ext4_unlock_group(sb, group);
784 return retval; 788 return retval;
785} 789}
786 790
@@ -794,16 +798,16 @@ err_ret:
794 * For other inodes, search forward from the parent directory's block 798 * For other inodes, search forward from the parent directory's block
795 * group to find a free inode. 799 * group to find a free inode.
796 */ 800 */
797struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 801struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
802 const struct qstr *qstr, __u32 goal)
798{ 803{
799 struct super_block *sb; 804 struct super_block *sb;
800 struct buffer_head *inode_bitmap_bh = NULL; 805 struct buffer_head *inode_bitmap_bh = NULL;
801 struct buffer_head *group_desc_bh; 806 struct buffer_head *group_desc_bh;
802 ext4_group_t group = 0; 807 ext4_group_t ngroups, group = 0;
803 unsigned long ino = 0; 808 unsigned long ino = 0;
804 struct inode *inode; 809 struct inode *inode;
805 struct ext4_group_desc *gdp = NULL; 810 struct ext4_group_desc *gdp = NULL;
806 struct ext4_super_block *es;
807 struct ext4_inode_info *ei; 811 struct ext4_inode_info *ei;
808 struct ext4_sb_info *sbi; 812 struct ext4_sb_info *sbi;
809 int ret2, err = 0; 813 int ret2, err = 0;
@@ -818,15 +822,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
818 return ERR_PTR(-EPERM); 822 return ERR_PTR(-EPERM);
819 823
820 sb = dir->i_sb; 824 sb = dir->i_sb;
821 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, 825 ngroups = ext4_get_groups_count(sb);
822 dir->i_ino, mode); 826 trace_ext4_request_inode(dir, mode);
823 inode = new_inode(sb); 827 inode = new_inode(sb);
824 if (!inode) 828 if (!inode)
825 return ERR_PTR(-ENOMEM); 829 return ERR_PTR(-ENOMEM);
826 ei = EXT4_I(inode); 830 ei = EXT4_I(inode);
827
828 sbi = EXT4_SB(sb); 831 sbi = EXT4_SB(sb);
829 es = sbi->s_es; 832
833 if (!goal)
834 goal = sbi->s_inode_goal;
835
836 if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
837 group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
838 ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
839 ret2 = 0;
840 goto got_group;
841 }
830 842
831 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { 843 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
832 ret2 = find_group_flex(sb, dir, &group); 844 ret2 = find_group_flex(sb, dir, &group);
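The new goal fast path (and the sbi->s_inode_goal default it falls back to) maps a 1-based inode number straight onto a group and a bitmap offset, skipping the allocator heuristics entirely. The arithmetic, with a worked example:

	/* EXT4_INODES_PER_GROUP(sb) == 8192, goal == 12345:
	 *   group = (12345 - 1) / 8192 = 1
	 *   ino   = (12345 - 1) % 8192 = 4152
	 * i.e. try bit 4152 of block group 1's inode bitmap first */
	group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
	ino   = (goal - 1) % EXT4_INODES_PER_GROUP(sb);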
@@ -846,7 +858,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
846 if (test_opt(sb, OLDALLOC)) 858 if (test_opt(sb, OLDALLOC))
847 ret2 = find_group_dir(sb, dir, &group); 859 ret2 = find_group_dir(sb, dir, &group);
848 else 860 else
849 ret2 = find_group_orlov(sb, dir, &group, mode); 861 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
850 } else 862 } else
851 ret2 = find_group_other(sb, dir, &group, mode); 863 ret2 = find_group_other(sb, dir, &group, mode);
852 864
@@ -856,7 +868,7 @@ got_group:
856 if (ret2 == -1) 868 if (ret2 == -1)
857 goto out; 869 goto out;
858 870
859 for (i = 0; i < sbi->s_groups_count; i++) { 871 for (i = 0; i < ngroups; i++, ino = 0) {
860 err = -EIO; 872 err = -EIO;
861 873
862 gdp = ext4_get_group_desc(sb, group, &group_desc_bh); 874 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -868,8 +880,6 @@ got_group:
868 if (!inode_bitmap_bh) 880 if (!inode_bitmap_bh)
869 goto fail; 881 goto fail;
870 882
871 ino = 0;
872
873repeat_in_this_group: 883repeat_in_this_group:
874 ino = ext4_find_next_zero_bit((unsigned long *) 884 ino = ext4_find_next_zero_bit((unsigned long *)
875 inode_bitmap_bh->b_data, 885 inode_bitmap_bh->b_data,
@@ -917,7 +927,7 @@ repeat_in_this_group:
917 * group descriptor metadata has not yet been updated. 927 * group descriptor metadata has not yet been updated.
918 * So we just go onto the next blockgroup. 928 * So we just go onto the next blockgroup.
919 */ 929 */
920 if (++group == sbi->s_groups_count) 930 if (++group == ngroups)
921 group = 0; 931 group = 0;
922 } 932 }
923 err = -ENOSPC; 933 err = -ENOSPC;
@@ -938,7 +948,7 @@ got:
938 } 948 }
939 949
940 free = 0; 950 free = 0;
941 spin_lock(sb_bgl_lock(sbi, group)); 951 ext4_lock_group(sb, group);
942 /* recheck and clear flag under lock if we still need to */ 952 /* recheck and clear flag under lock if we still need to */
943 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 953 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
944 free = ext4_free_blocks_after_init(sb, group, gdp); 954 free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -947,7 +957,7 @@ got:
947 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 957 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
948 gdp); 958 gdp);
949 } 959 }
950 spin_unlock(sb_bgl_lock(sbi, group)); 960 ext4_unlock_group(sb, group);
951 961
952 /* Don't need to dirty bitmap block if we didn't change it */ 962 /* Don't need to dirty bitmap block if we didn't change it */
953 if (free) { 963 if (free) {
@@ -1052,8 +1062,7 @@ got:
1052 } 1062 }
1053 1063
1054 ext4_debug("allocating inode %lu\n", inode->i_ino); 1064 ext4_debug("allocating inode %lu\n", inode->i_ino);
1055 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", 1065 trace_ext4_allocate_inode(inode, dir, mode);
1056 sb->s_id, inode->i_ino, dir->i_ino, mode);
1057 goto really_out; 1066 goto really_out;
1058fail: 1067fail:
1059 ext4_std_error(sb, err); 1068 ext4_std_error(sb, err);
@@ -1158,7 +1167,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1158{ 1167{
1159 unsigned long desc_count; 1168 unsigned long desc_count;
1160 struct ext4_group_desc *gdp; 1169 struct ext4_group_desc *gdp;
1161 ext4_group_t i; 1170 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1162#ifdef EXT4FS_DEBUG 1171#ifdef EXT4FS_DEBUG
1163 struct ext4_super_block *es; 1172 struct ext4_super_block *es;
1164 unsigned long bitmap_count, x; 1173 unsigned long bitmap_count, x;
@@ -1168,7 +1177,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1168 desc_count = 0; 1177 desc_count = 0;
1169 bitmap_count = 0; 1178 bitmap_count = 0;
1170 gdp = NULL; 1179 gdp = NULL;
1171 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1180 for (i = 0; i < ngroups; i++) {
1172 gdp = ext4_get_group_desc(sb, i, NULL); 1181 gdp = ext4_get_group_desc(sb, i, NULL);
1173 if (!gdp) 1182 if (!gdp)
1174 continue; 1183 continue;
@@ -1190,7 +1199,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1190 return desc_count; 1199 return desc_count;
1191#else 1200#else
1192 desc_count = 0; 1201 desc_count = 0;
1193 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1202 for (i = 0; i < ngroups; i++) {
1194 gdp = ext4_get_group_desc(sb, i, NULL); 1203 gdp = ext4_get_group_desc(sb, i, NULL);
1195 if (!gdp) 1204 if (!gdp)
1196 continue; 1205 continue;
@@ -1205,9 +1214,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1205unsigned long ext4_count_dirs(struct super_block * sb) 1214unsigned long ext4_count_dirs(struct super_block * sb)
1206{ 1215{
1207 unsigned long count = 0; 1216 unsigned long count = 0;
1208 ext4_group_t i; 1217 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1209 1218
1210 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1219 for (i = 0; i < ngroups; i++) {
1211 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1220 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1212 if (!gdp) 1221 if (!gdp)
1213 continue; 1222 continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd1..f9c642b22efa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40
40#include "ext4_jbd2.h" 41#include "ext4_jbd2.h"
41#include "xattr.h" 42#include "xattr.h"
42#include "acl.h" 43#include "acl.h"
43#include "ext4_extents.h" 44#include "ext4_extents.h"
44 45
46#include <trace/events/ext4.h>
47
45#define MPAGE_DA_EXTENT_TAIL 0x01 48#define MPAGE_DA_EXTENT_TAIL 0x01
46 49
47static inline int ext4_begin_ordered_truncate(struct inode *inode, 50static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -75,22 +78,20 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
75 * but there may still be a record of it in the journal, and that record 78 * but there may still be a record of it in the journal, and that record
76 * still needs to be revoked. 79 * still needs to be revoked.
77 * 80 *
78 * If the handle isn't valid we're not journaling so there's nothing to do. 81 * If the handle isn't valid we're not journaling, but we still need to
82 * call into ext4_journal_revoke() to put the buffer head.
79 */ 83 */
80int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 84int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
81 struct buffer_head *bh, ext4_fsblk_t blocknr) 85 struct buffer_head *bh, ext4_fsblk_t blocknr)
82{ 86{
83 int err; 87 int err;
84 88
85 if (!ext4_handle_valid(handle))
86 return 0;
87
88 might_sleep(); 89 might_sleep();
89 90
90 BUFFER_TRACE(bh, "enter"); 91 BUFFER_TRACE(bh, "enter");
91 92
92 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 93 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
93 "data mode %lx\n", 94 "data mode %x\n",
94 bh, is_metadata, inode->i_mode, 95 bh, is_metadata, inode->i_mode,
95 test_opt(inode->i_sb, DATA_FLAGS)); 96 test_opt(inode->i_sb, DATA_FLAGS));
96 97
@@ -329,8 +330,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
329 */ 330 */
330 331
331static int ext4_block_to_path(struct inode *inode, 332static int ext4_block_to_path(struct inode *inode,
332 ext4_lblk_t i_block, 333 ext4_lblk_t i_block,
333 ext4_lblk_t offsets[4], int *boundary) 334 ext4_lblk_t offsets[4], int *boundary)
334{ 335{
335 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 336 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
336 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 337 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -362,9 +363,9 @@ static int ext4_block_to_path(struct inode *inode,
362 final = ptrs; 363 final = ptrs;
363 } else { 364 } else {
364 ext4_warning(inode->i_sb, "ext4_block_to_path", 365 ext4_warning(inode->i_sb, "ext4_block_to_path",
365 "block %lu > max in inode %lu", 366 "block %lu > max in inode %lu",
366 i_block + direct_blocks + 367 i_block + direct_blocks +
367 indirect_blocks + double_blocks, inode->i_ino); 368 indirect_blocks + double_blocks, inode->i_ino);
368 } 369 }
369 if (boundary) 370 if (boundary)
370 *boundary = final - 1 - (i_block & (ptrs - 1)); 371 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -372,31 +373,32 @@ static int ext4_block_to_path(struct inode *inode,
372} 373}
373 374
374static int __ext4_check_blockref(const char *function, struct inode *inode, 375static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max) { 376 __le32 *p, unsigned int max)
376 377{
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p; 378 __le32 *bref = p;
379 unsigned int blk;
380
379 while (bref < p+max) { 381 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { 382 blk = le32_to_cpu(*bref++);
383 if (blk &&
384 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
385 blk, 1))) {
381 ext4_error(inode->i_sb, function, 386 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) " 387 "invalid block reference %u "
383 "in inode #%lu, offset=%d", 388 "in inode #%lu", blk, inode->i_ino);
384 le32_to_cpu(*bref), maxblocks, 389 return -EIO;
385 inode->i_ino, (int)(bref-p)); 390 }
386 return -EIO; 391 }
387 } 392 return 0;
388 bref++;
389 }
390 return 0;
391} 393}
392 394
393 395
394#define ext4_check_indirect_blockref(inode, bh) \ 396#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 397 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 398 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397 399
398#define ext4_check_inode_blockref(inode) \ 400#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 401 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS) 402 EXT4_NDIR_BLOCKS)
401 403
402/** 404/**
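__ext4_check_blockref() now delegates to ext4_data_block_valid(), which rejects more than just out-of-range numbers. Roughly what that predicate does (a sketch; the real function also walks an rbtree of reserved system zones such as bitmaps and group descriptors):

	/* a block range is plausible file data only if it lies inside
	 * the filesystem and does not overlap known metadata */
	if (start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block) ||
	    start_blk + count > ext4_blocks_count(sbi->s_es))
		return 0;
	/* ...then check sbi->system_blks for metadata overlap... */
	return 1;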
@@ -446,7 +448,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
446 bh = sb_getblk(sb, le32_to_cpu(p->key)); 448 bh = sb_getblk(sb, le32_to_cpu(p->key));
447 if (unlikely(!bh)) 449 if (unlikely(!bh))
448 goto failure; 450 goto failure;
449 451
450 if (!bh_uptodate_or_lock(bh)) { 452 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) { 453 if (bh_submit_read(bh) < 0) {
452 put_bh(bh); 454 put_bh(bh);
@@ -458,7 +460,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
458 goto failure; 460 goto failure;
459 } 461 }
460 } 462 }
461 463
462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 464 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
463 /* Reader: end */ 465 /* Reader: end */
464 if (!p->key) 466 if (!p->key)
@@ -551,7 +553,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * returns it. 553 * returns it.
552 */ 554 */
553static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
554 Indirect *partial) 556 Indirect *partial)
555{ 557{
556 /* 558 /*
557 * XXX need to get goal block from mballoc's data structures 559 * XXX need to get goal block from mballoc's data structures
@@ -573,7 +575,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
573 * direct and indirect blocks. 575 * direct and indirect blocks.
574 */ 576 */
575static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 577static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary) 578 int blocks_to_boundary)
577{ 579{
578 unsigned int count = 0; 580 unsigned int count = 0;
579 581
@@ -609,9 +611,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
609 * direct blocks 611 * direct blocks
610 */ 612 */
611static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 613static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
612 ext4_lblk_t iblock, ext4_fsblk_t goal, 614 ext4_lblk_t iblock, ext4_fsblk_t goal,
613 int indirect_blks, int blks, 615 int indirect_blks, int blks,
614 ext4_fsblk_t new_blocks[4], int *err) 616 ext4_fsblk_t new_blocks[4], int *err)
615{ 617{
616 struct ext4_allocation_request ar; 618 struct ext4_allocation_request ar;
617 int target, i; 619 int target, i;
@@ -682,10 +684,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
682 } 684 }
683 if (!*err) { 685 if (!*err) {
684 if (target == blks) { 686 if (target == blks) {
685 /* 687 /*
686 * save the new block number 688 * save the new block number
687 * for the first direct block 689 * for the first direct block
688 */ 690 */
689 new_blocks[index] = current_block; 691 new_blocks[index] = current_block;
690 } 692 }
691 blk_allocated += ar.len; 693 blk_allocated += ar.len;
@@ -727,9 +729,9 @@ failed_out:
727 * as described above and return 0. 729 * as described above and return 0.
728 */ 730 */
729static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 731static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
730 ext4_lblk_t iblock, int indirect_blks, 732 ext4_lblk_t iblock, int indirect_blks,
731 int *blks, ext4_fsblk_t goal, 733 int *blks, ext4_fsblk_t goal,
732 ext4_lblk_t *offsets, Indirect *branch) 734 ext4_lblk_t *offsets, Indirect *branch)
733{ 735{
734 int blocksize = inode->i_sb->s_blocksize; 736 int blocksize = inode->i_sb->s_blocksize;
735 int i, n = 0; 737 int i, n = 0;
@@ -776,7 +778,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
776 * the chain to point to the new allocated 778 * the chain to point to the new allocated
777 * data blocks numbers 779 * data blocks numbers
778 */ 780 */
779 for (i=1; i < num; i++) 781 for (i = 1; i < num; i++)
780 *(branch[n].p + i) = cpu_to_le32(++current_block); 782 *(branch[n].p + i) = cpu_to_le32(++current_block);
781 } 783 }
782 BUFFER_TRACE(bh, "marking uptodate"); 784 BUFFER_TRACE(bh, "marking uptodate");
@@ -819,7 +821,8 @@ failed:
819 * chain to new block and return 0. 821 * chain to new block and return 0.
820 */ 822 */
821static int ext4_splice_branch(handle_t *handle, struct inode *inode, 823static int ext4_splice_branch(handle_t *handle, struct inode *inode,
822 ext4_lblk_t block, Indirect *where, int num, int blks) 824 ext4_lblk_t block, Indirect *where, int num,
825 int blks)
823{ 826{
824 int i; 827 int i;
825 int err = 0; 828 int err = 0;
@@ -851,10 +854,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
851 } 854 }
852 855
853 /* We are done with atomic stuff, now do the rest of housekeeping */ 856 /* We are done with atomic stuff, now do the rest of housekeeping */
854
855 inode->i_ctime = ext4_current_time(inode);
856 ext4_mark_inode_dirty(handle, inode);
857
858 /* had we spliced it onto indirect block? */ 857 /* had we spliced it onto indirect block? */
859 if (where->bh) { 858 if (where->bh) {
860 /* 859 /*
@@ -873,8 +872,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
873 } else { 872 } else {
874 /* 873 /*
875 * OK, we spliced it into the inode itself on a direct block. 874 * OK, we spliced it into the inode itself on a direct block.
876 * Inode was dirtied above.
877 */ 875 */
876 ext4_mark_inode_dirty(handle, inode);
878 jbd_debug(5, "splicing direct\n"); 877 jbd_debug(5, "splicing direct\n");
879 } 878 }
880 return err; 879 return err;
@@ -892,6 +891,10 @@ err_out:
892} 891}
893 892
894/* 893/*
894 * The ext4_ind_get_blocks() function handles non-extents inodes
895 * (i.e., using the traditional indirect/double-indirect i_blocks
896 * scheme) for ext4_get_blocks().
897 *
895 * Allocation strategy is simple: if we have to allocate something, we will 898 * Allocation strategy is simple: if we have to allocate something, we will
896 * have to go the whole way to leaf. So let's do it before attaching anything 899 * have to go the whole way to leaf. So let's do it before attaching anything
897 * to tree, set linkage between the newborn blocks, write them if sync is 900 * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,15 +912,16 @@ err_out:
909 * return = 0, if plain lookup failed. 912 * return = 0, if plain lookup failed.
910 * return < 0, error case. 913 * return < 0, error case.
911 * 914 *
912 * 915 * The ext4_ind_get_blocks() function should be called with
913 * Need to be called with 916 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
914 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 917 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
915 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 918 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
919 * blocks.
916 */ 920 */
917static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 921static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
918 ext4_lblk_t iblock, unsigned int maxblocks, 922 ext4_lblk_t iblock, unsigned int maxblocks,
919 struct buffer_head *bh_result, 923 struct buffer_head *bh_result,
920 int create, int extend_disksize) 924 int flags)
921{ 925{
922 int err = -EIO; 926 int err = -EIO;
923 ext4_lblk_t offsets[4]; 927 ext4_lblk_t offsets[4];
@@ -927,16 +931,13 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
927 int indirect_blks; 931 int indirect_blks;
928 int blocks_to_boundary = 0; 932 int blocks_to_boundary = 0;
929 int depth; 933 int depth;
930 struct ext4_inode_info *ei = EXT4_I(inode);
931 int count = 0; 934 int count = 0;
932 ext4_fsblk_t first_block = 0; 935 ext4_fsblk_t first_block = 0;
933 loff_t disksize;
934
935 936
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 937 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
937 J_ASSERT(handle != NULL || create == 0); 938 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 939 depth = ext4_block_to_path(inode, iblock, offsets,
939 &blocks_to_boundary); 940 &blocks_to_boundary);
940 941
941 if (depth == 0) 942 if (depth == 0)
942 goto out; 943 goto out;
@@ -963,7 +964,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
963 } 964 }
964 965
965 /* Next simple case - plain lookup or failed read of indirect block */ 966 /* Next simple case - plain lookup or failed read of indirect block */
966 if (!create || err == -EIO) 967 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
967 goto cleanup; 968 goto cleanup;
968 969
969 /* 970 /*
@@ -984,8 +985,8 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
984 * Block out ext4_truncate while we alter the tree 985 * Block out ext4_truncate while we alter the tree
985 */ 986 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 987 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
987 &count, goal, 988 &count, goal,
988 offsets + (partial - chain), partial); 989 offsets + (partial - chain), partial);
989 990
990 /* 991 /*
991 * The ext4_splice_branch call will free and forget any buffers 992 * The ext4_splice_branch call will free and forget any buffers
@@ -996,20 +997,8 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
996 */ 997 */
997 if (!err) 998 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 999 err = ext4_splice_branch(handle, inode, iblock,
999 partial, indirect_blks, count); 1000 partial, indirect_blks, count);
1000 /* 1001 else
1001 * i_disksize growing is protected by i_data_sem. Don't forget to
1002 * protect it if you're about to implement concurrent
1003 * ext4_get_block() -bzzz
1004 */
1005 if (!err && extend_disksize) {
1006 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
1007 if (disksize > i_size_read(inode))
1008 disksize = i_size_read(inode);
1009 if (disksize > ei->i_disksize)
1010 ei->i_disksize = disksize;
1011 }
1012 if (err)
1013 goto cleanup; 1002 goto cleanup;
1014 1003
1015 set_buffer_new(bh_result); 1004 set_buffer_new(bh_result);
@@ -1120,8 +1109,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1120 ext4_discard_preallocations(inode); 1109 ext4_discard_preallocations(inode);
1121} 1110}
1122 1111
1112static int check_block_validity(struct inode *inode, sector_t logical,
1113 sector_t phys, int len)
1114{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity",
1117 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical,
1120 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO;
1123 }
1124 return 0;
1125}
1126
1123/* 1127/*
1124 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1128 * The ext4_get_blocks() function tries to look up the requested blocks,
1125 * and returns them directly if the blocks are already mapped. 1129
1126 * 1130 *
1127 * Otherwise it takes the write lock of the i_data_sem and allocates blocks 1131
@@ -1129,7 +1133,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1129 * mapped. 1133 * mapped.
1130 * 1134 *
1131 * If file type is extents based, it will call ext4_ext_get_blocks(), 1135 * If file type is extents based, it will call ext4_ext_get_blocks(),
1132 * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping 1136 * Otherwise, it calls ext4_ind_get_blocks() to handle indirect-mapping
1133 * based files 1137 * based files
1134 * 1138 *
1135 * On success, it returns the number of blocks being mapped or allocated. 1139
@@ -1142,9 +1146,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1142 * 1146 *
1143 * It returns the error in case of allocation failure. 1147 * It returns the error in case of allocation failure.
1144 */ 1148 */
1145int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1149int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1146 unsigned int max_blocks, struct buffer_head *bh, 1150 unsigned int max_blocks, struct buffer_head *bh,
1147 int create, int extend_disksize, int flag) 1151 int flags)
1148{ 1152{
1149 int retval; 1153 int retval;
1150 1154
@@ -1152,21 +1156,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1152 clear_buffer_unwritten(bh); 1156 clear_buffer_unwritten(bh);
1153 1157
1154 /* 1158 /*
1155 * Try to see if we can get the block without requesting 1159 * Try to see if we can get the block without requesting a new
1156 * for new file system block. 1160 * file system block.
1157 */ 1161 */
1158 down_read((&EXT4_I(inode)->i_data_sem)); 1162 down_read((&EXT4_I(inode)->i_data_sem));
1159 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1163 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1160 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1164 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
1161 bh, 0, 0); 1165 bh, 0);
1162 } else { 1166 } else {
1163 retval = ext4_get_blocks_handle(handle, 1167 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
1164 inode, block, max_blocks, bh, 0, 0); 1168 bh, 0);
1165 } 1169 }
1166 up_read((&EXT4_I(inode)->i_data_sem)); 1170 up_read((&EXT4_I(inode)->i_data_sem));
1167 1171
1172 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block,
1174 bh->b_blocknr, retval);
1175 if (ret != 0)
1176 return ret;
1177 }
1178
1168 /* If it is only a block(s) look up */ 1179 /* If it is only a block(s) look up */
1169 if (!create) 1180 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1170 return retval; 1181 return retval;
1171 1182
1172 /* 1183 /*
@@ -1205,7 +1216,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1205 * let the underlying get_block() function know to 1216 * let the underlying get_block() function know to
1206 * avoid double accounting 1217 * avoid double accounting
1207 */ 1218 */
1208 if (flag) 1219 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1209 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1220 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
1210 /* 1221 /*
1211 * We need to check for EXT4 here because migrate 1222 * We need to check for EXT4 here because migrate
@@ -1213,10 +1224,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1213 */ 1224 */
1214 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1225 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1215 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1226 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
1216 bh, create, extend_disksize); 1227 bh, flags);
1217 } else { 1228 } else {
1218 retval = ext4_get_blocks_handle(handle, inode, block, 1229 retval = ext4_ind_get_blocks(handle, inode, block,
1219 max_blocks, bh, create, extend_disksize); 1230 max_blocks, bh, flags);
1220 1231
1221 if (retval > 0 && buffer_new(bh)) { 1232 if (retval > 0 && buffer_new(bh)) {
1222 /* 1233 /*
@@ -1229,18 +1240,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1229 } 1240 }
1230 } 1241 }
1231 1242
1232 if (flag) { 1243 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1233 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1244 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1234 /* 1245
1235 * Update reserved blocks/metadata blocks 1246 /*
1236 * after successful block allocation 1247 * Update reserved blocks/metadata blocks after successful
1237 * which were deferred till now 1248 * block allocation which had been deferred till now.
1238 */ 1249 */
1239 if ((retval > 0) && buffer_delay(bh)) 1250 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
1240 ext4_da_update_reserve_space(inode, retval); 1251 ext4_da_update_reserve_space(inode, retval);
1241 }
1242 1252
1243 up_write((&EXT4_I(inode)->i_data_sem)); 1253 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block,
1256 bh->b_blocknr, retval);
1257 if (ret != 0)
1258 return ret;
1259 }
1244 return retval; 1260 return retval;
1245} 1261}
1246 1262
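Taken together these hunks give ext4_get_blocks() a single contract for every caller. An illustrative caller (not from the patch; consume_mapping() and handle_hole() are placeholders):

	struct buffer_head bh;
	int ret;

	bh.b_state = 0;		/* callers must clear the state bits first */
	ret = ext4_get_blocks(handle, inode, lblk, max_blocks, &bh,
			      EXT4_GET_BLOCKS_CREATE);
	if (ret > 0)
		consume_mapping(&bh, ret);  /* ret blocks mapped at bh.b_blocknr */
	else if (ret == 0)
		handle_hole(lblk);	    /* hole; only without _CREATE */
	else
		return ret;		    /* negative errno, e.g. -EIO, -ENOSPC */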
@@ -1268,8 +1284,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
1268 started = 1; 1284 started = 1;
1269 } 1285 }
1270 1286
1271 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1287 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
1272 max_blocks, bh_result, create, 0, 0); 1288 create ? EXT4_GET_BLOCKS_CREATE : 0);
1273 if (ret > 0) { 1289 if (ret > 0) {
1274 bh_result->b_size = (ret << inode->i_blkbits); 1290 bh_result->b_size = (ret << inode->i_blkbits);
1275 ret = 0; 1291 ret = 0;
@@ -1288,17 +1304,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1288{ 1304{
1289 struct buffer_head dummy; 1305 struct buffer_head dummy;
1290 int fatal = 0, err; 1306 int fatal = 0, err;
1307 int flags = 0;
1291 1308
1292 J_ASSERT(handle != NULL || create == 0); 1309 J_ASSERT(handle != NULL || create == 0);
1293 1310
1294 dummy.b_state = 0; 1311 dummy.b_state = 0;
1295 dummy.b_blocknr = -1000; 1312 dummy.b_blocknr = -1000;
1296 buffer_trace_init(&dummy.b_history); 1313 buffer_trace_init(&dummy.b_history);
1297 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1314 if (create)
1298 &dummy, create, 1, 0); 1315 flags |= EXT4_GET_BLOCKS_CREATE;
1316 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
1299 /* 1317 /*
1300 * ext4_get_blocks_handle() returns number of blocks 1318 * ext4_get_blocks() returns number of blocks mapped. 0 in
1301 * mapped. 0 in case of a HOLE. 1319 * case of a HOLE.
1302 */ 1320 */
1303 if (err > 0) { 1321 if (err > 0) {
1304 if (err > 1) 1322 if (err > 1)
@@ -1385,8 +1403,7 @@ static int walk_page_buffers(handle_t *handle,
1385 1403
1386 for (bh = head, block_start = 0; 1404 for (bh = head, block_start = 0;
1387 ret == 0 && (bh != head || !block_start); 1405 ret == 0 && (bh != head || !block_start);
1388 block_start = block_end, bh = next) 1406 block_start = block_end, bh = next) {
1389 {
1390 next = bh->b_this_page; 1407 next = bh->b_this_page;
1391 block_end = block_start + blocksize; 1408 block_end = block_start + blocksize;
1392 if (block_end <= from || block_start >= to) { 1409 if (block_end <= from || block_start >= to) {
@@ -1427,7 +1444,7 @@ static int walk_page_buffers(handle_t *handle,
1427 * write. 1444 * write.
1428 */ 1445 */
1429static int do_journal_get_write_access(handle_t *handle, 1446static int do_journal_get_write_access(handle_t *handle,
1430 struct buffer_head *bh) 1447 struct buffer_head *bh)
1431{ 1448{
1432 if (!buffer_mapped(bh) || buffer_freed(bh)) 1449 if (!buffer_mapped(bh) || buffer_freed(bh))
1433 return 0; 1450 return 0;
@@ -1435,22 +1452,24 @@ static int do_journal_get_write_access(handle_t *handle,
1435} 1452}
1436 1453
1437static int ext4_write_begin(struct file *file, struct address_space *mapping, 1454static int ext4_write_begin(struct file *file, struct address_space *mapping,
1438 loff_t pos, unsigned len, unsigned flags, 1455 loff_t pos, unsigned len, unsigned flags,
1439 struct page **pagep, void **fsdata) 1456 struct page **pagep, void **fsdata)
1440{ 1457{
1441 struct inode *inode = mapping->host; 1458 struct inode *inode = mapping->host;
1442 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1459 int ret, needed_blocks;
1443 handle_t *handle; 1460 handle_t *handle;
1444 int retries = 0; 1461 int retries = 0;
1445 struct page *page; 1462 struct page *page;
1446 pgoff_t index; 1463 pgoff_t index;
1447 unsigned from, to; 1464 unsigned from, to;
1448 1465
1449 trace_mark(ext4_write_begin, 1466 trace_ext4_write_begin(inode, pos, len, flags);
1450 "dev %s ino %lu pos %llu len %u flags %u", 1467 /*
1451 inode->i_sb->s_id, inode->i_ino, 1468 * Reserve one block more for addition to orphan list in case
1452 (unsigned long long) pos, len, flags); 1469 * we allocate blocks but write fails for some reason
1453 index = pos >> PAGE_CACHE_SHIFT; 1470 */
1471 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1472 index = pos >> PAGE_CACHE_SHIFT;
1454 from = pos & (PAGE_CACHE_SIZE - 1); 1473 from = pos & (PAGE_CACHE_SIZE - 1);
1455 to = from + len; 1474 to = from + len;
1456 1475
@@ -1483,15 +1502,30 @@ retry:
1483 1502
1484 if (ret) { 1503 if (ret) {
1485 unlock_page(page); 1504 unlock_page(page);
1486 ext4_journal_stop(handle);
1487 page_cache_release(page); 1505 page_cache_release(page);
1488 /* 1506 /*
1489 * block_write_begin may have instantiated a few blocks 1507 * block_write_begin may have instantiated a few blocks
1490 * outside i_size. Trim these off again. Don't need 1508 * outside i_size. Trim these off again. Don't need
1491 * i_size_read because we hold i_mutex. 1509 * i_size_read because we hold i_mutex.
1510 *
1511 * Add inode to orphan list in case we crash before
1512 * truncate finishes
1492 */ 1513 */
1493 if (pos + len > inode->i_size) 1514 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1494 vmtruncate(inode, inode->i_size); 1515 ext4_orphan_add(handle, inode);
1516
1517 ext4_journal_stop(handle);
1518 if (pos + len > inode->i_size) {
1519 ext4_truncate(inode);
1520 /*
1521 * If truncate failed early the inode might
1522 * still be on the orphan list; we need to
1523 * make sure the inode is removed from the
1524 * orphan list in that case.
1525 */
1526 if (inode->i_nlink)
1527 ext4_orphan_del(NULL, inode);
1528 }
1495 } 1529 }
1496 1530
1497 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1531 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
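The failure path above is the new common shape for short writes, and it recurs in the write_end variants below: park the inode on the orphan list while allocated blocks may stick out past i_size, truncate once the handle is closed, then unhook the inode if it is still live. Condensed from the hunk:

	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		ext4_orphan_add(handle, inode);	/* recovery truncates after a crash */
	ext4_journal_stop(handle);
	if (pos + len > inode->i_size) {
		ext4_truncate(inode);		/* drop blocks beyond i_size */
		if (inode->i_nlink)		/* truncate may already have */
			ext4_orphan_del(NULL, inode);
	}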
@@ -1509,6 +1543,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1509 return ext4_handle_dirty_metadata(handle, NULL, bh); 1543 return ext4_handle_dirty_metadata(handle, NULL, bh);
1510} 1544}
1511 1545
1546static int ext4_generic_write_end(struct file *file,
1547 struct address_space *mapping,
1548 loff_t pos, unsigned len, unsigned copied,
1549 struct page *page, void *fsdata)
1550{
1551 int i_size_changed = 0;
1552 struct inode *inode = mapping->host;
1553 handle_t *handle = ext4_journal_current_handle();
1554
1555 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1556
1557 /*
1558 * No need to use i_size_read() here, the i_size
1559 * cannot change under us because we hold i_mutex.
1560 *
1561 * But it's important to update i_size while still holding page lock:
1562 * page writeout could otherwise come in and zero beyond i_size.
1563 */
1564 if (pos + copied > inode->i_size) {
1565 i_size_write(inode, pos + copied);
1566 i_size_changed = 1;
1567 }
1568
1569 if (pos + copied > EXT4_I(inode)->i_disksize) {
1570 /* We need to mark inode dirty even if
1571 * new_i_size is less than inode->i_size
1572 * but greater than i_disksize (hint: delalloc)
1573 */
1574 ext4_update_i_disksize(inode, (pos + copied));
1575 i_size_changed = 1;
1576 }
1577 unlock_page(page);
1578 page_cache_release(page);
1579
1580 /*
1581 * Don't mark the inode dirty under the page lock. First, that holds
1582 * the page lock longer than necessary. Second, it forces lock
1583 * ordering of page lock and transaction start for journaling
1584 * filesystems.
1585 */
1586 if (i_size_changed)
1587 ext4_mark_inode_dirty(handle, inode);
1588
1589 return copied;
1590}
1591
1512/* 1592/*
1513 * We need to pick up the new inode size which generic_commit_write gave us 1593 * We need to pick up the new inode size which generic_commit_write gave us
1514 * `file' can be NULL - eg, when called from page_symlink(). 1594 * `file' can be NULL - eg, when called from page_symlink().
@@ -1517,36 +1597,27 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1517 * buffers are managed internally. 1597 * buffers are managed internally.
1518 */ 1598 */
1519static int ext4_ordered_write_end(struct file *file, 1599static int ext4_ordered_write_end(struct file *file,
1520 struct address_space *mapping, 1600 struct address_space *mapping,
1521 loff_t pos, unsigned len, unsigned copied, 1601 loff_t pos, unsigned len, unsigned copied,
1522 struct page *page, void *fsdata) 1602 struct page *page, void *fsdata)
1523{ 1603{
1524 handle_t *handle = ext4_journal_current_handle(); 1604 handle_t *handle = ext4_journal_current_handle();
1525 struct inode *inode = mapping->host; 1605 struct inode *inode = mapping->host;
1526 int ret = 0, ret2; 1606 int ret = 0, ret2;
1527 1607
1528 trace_mark(ext4_ordered_write_end, 1608 trace_ext4_ordered_write_end(inode, pos, len, copied);
1529 "dev %s ino %lu pos %llu len %u copied %u",
1530 inode->i_sb->s_id, inode->i_ino,
1531 (unsigned long long) pos, len, copied);
1532 ret = ext4_jbd2_file_inode(handle, inode); 1609 ret = ext4_jbd2_file_inode(handle, inode);
1533 1610
1534 if (ret == 0) { 1611 if (ret == 0) {
1535 loff_t new_i_size; 1612 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1536
1537 new_i_size = pos + copied;
1538 if (new_i_size > EXT4_I(inode)->i_disksize) {
1539 ext4_update_i_disksize(inode, new_i_size);
1540 /* We need to mark inode dirty even if
1541			/* We need to mark inode dirty even if
1542			 * new_i_size is less than inode->i_size
1543			 * but greater than i_disksize. (hint: delalloc)
1543 */
1544 ext4_mark_inode_dirty(handle, inode);
1545 }
1546
1547 ret2 = generic_write_end(file, mapping, pos, len, copied,
1548 page, fsdata); 1613 page, fsdata);
1549 copied = ret2; 1614 copied = ret2;
1615 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1616 /* if we have allocated more blocks and copied
1617		 * less, we will have blocks allocated outside
1618		 * inode->i_size, so truncate them
1619 */
1620 ext4_orphan_add(handle, inode);
1550 if (ret2 < 0) 1621 if (ret2 < 0)
1551 ret = ret2; 1622 ret = ret2;
1552 } 1623 }
@@ -1554,36 +1625,41 @@ static int ext4_ordered_write_end(struct file *file,
1554 if (!ret) 1625 if (!ret)
1555 ret = ret2; 1626 ret = ret2;
1556 1627
1628 if (pos + len > inode->i_size) {
1629 ext4_truncate(inode);
1630 /*
1631 * If truncate failed early the inode might still be
1632 * on the orphan list; we need to make sure the inode
1633 * is removed from the orphan list in that case.
1634 */
1635 if (inode->i_nlink)
1636 ext4_orphan_del(NULL, inode);
1637 }
1638
1639
1557 return ret ? ret : copied; 1640 return ret ? ret : copied;
1558} 1641}
1559 1642
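All three write_end variants here repeat the same cleanup when the copy comes up short of the requested length. Below is a standalone model of that pattern; every type and helper is a stand-in for the corresponding ext4 call, and a comment marks where the journal handle is closed. The essential ordering: the orphan-list insertion happens inside the still-running transaction, the truncate only after it has been stopped, so a crash in between is repaired by orphan recovery.

    #include <stdio.h>

    /* Stand-ins, not ext4 code. */
    struct minode { long long i_size; int i_nlink; int on_orphan_list; };

    static void orphan_add(struct minode *in) { in->on_orphan_list = 1; }
    static void orphan_del(struct minode *in) { in->on_orphan_list = 0; }
    static void truncate_blocks(struct minode *in)
    {
            (void)in;       /* would drop blocks past i_size */
    }

    static unsigned write_end_pattern(struct minode *inode, long long pos,
                                      unsigned len, unsigned copied)
    {
            if (pos + copied > inode->i_size)
                    inode->i_size = pos + copied;

            /* Blocks for [pos, pos+len) may already be allocated; after a
             * short copy some lie beyond i_size. Add the inode to the
             * orphan list while the transaction is still open, so crash
             * recovery trims the excess if we never reach the truncate. */
            if (pos + len > inode->i_size)
                    orphan_add(inode);

            /* ...ext4_journal_stop(handle) happens here... */

            if (pos + len > inode->i_size) {
                    truncate_blocks(inode);
                    /* If truncate bailed out early, a live inode may still
                     * sit on the orphan list; take it back off. */
                    if (inode->i_nlink)
                            orphan_del(inode);
            }
            return copied;
    }

    int main(void)
    {
            struct minode in = { .i_size = 0, .i_nlink = 1 };

            printf("copied %u, orphan %d\n",
                   write_end_pattern(&in, 0, 4096, 1024), in.on_orphan_list);
            return 0;
    }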
1560static int ext4_writeback_write_end(struct file *file, 1643static int ext4_writeback_write_end(struct file *file,
1561 struct address_space *mapping, 1644 struct address_space *mapping,
1562 loff_t pos, unsigned len, unsigned copied, 1645 loff_t pos, unsigned len, unsigned copied,
1563 struct page *page, void *fsdata) 1646 struct page *page, void *fsdata)
1564{ 1647{
1565 handle_t *handle = ext4_journal_current_handle(); 1648 handle_t *handle = ext4_journal_current_handle();
1566 struct inode *inode = mapping->host; 1649 struct inode *inode = mapping->host;
1567 int ret = 0, ret2; 1650 int ret = 0, ret2;
1568 loff_t new_i_size;
1569 1651
1570 trace_mark(ext4_writeback_write_end, 1652 trace_ext4_writeback_write_end(inode, pos, len, copied);
1571 "dev %s ino %lu pos %llu len %u copied %u", 1653 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1572 inode->i_sb->s_id, inode->i_ino,
1573 (unsigned long long) pos, len, copied);
1574 new_i_size = pos + copied;
1575 if (new_i_size > EXT4_I(inode)->i_disksize) {
1576 ext4_update_i_disksize(inode, new_i_size);
1577 /* We need to mark inode dirty even if
1578 * new_i_size is less that inode->i_size
1579 * bu greater than i_disksize.(hint delalloc)
1580 */
1581 ext4_mark_inode_dirty(handle, inode);
1582 }
1583
1584 ret2 = generic_write_end(file, mapping, pos, len, copied,
1585 page, fsdata); 1654 page, fsdata);
1586 copied = ret2; 1655 copied = ret2;
1656 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1657 /* if we have allocated more blocks and copied
1658		 * less, we will have blocks allocated outside
1659		 * inode->i_size, so truncate them
1660 */
1661 ext4_orphan_add(handle, inode);
1662
1587 if (ret2 < 0) 1663 if (ret2 < 0)
1588 ret = ret2; 1664 ret = ret2;
1589 1665
@@ -1591,13 +1667,24 @@ static int ext4_writeback_write_end(struct file *file,
1591 if (!ret) 1667 if (!ret)
1592 ret = ret2; 1668 ret = ret2;
1593 1669
1670 if (pos + len > inode->i_size) {
1671 ext4_truncate(inode);
1672 /*
1673 * If truncate failed early the inode might still be
1674 * on the orphan list; we need to make sure the inode
1675 * is removed from the orphan list in that case.
1676 */
1677 if (inode->i_nlink)
1678 ext4_orphan_del(NULL, inode);
1679 }
1680
1594 return ret ? ret : copied; 1681 return ret ? ret : copied;
1595} 1682}
1596 1683
1597static int ext4_journalled_write_end(struct file *file, 1684static int ext4_journalled_write_end(struct file *file,
1598 struct address_space *mapping, 1685 struct address_space *mapping,
1599 loff_t pos, unsigned len, unsigned copied, 1686 loff_t pos, unsigned len, unsigned copied,
1600 struct page *page, void *fsdata) 1687 struct page *page, void *fsdata)
1601{ 1688{
1602 handle_t *handle = ext4_journal_current_handle(); 1689 handle_t *handle = ext4_journal_current_handle();
1603 struct inode *inode = mapping->host; 1690 struct inode *inode = mapping->host;
@@ -1606,10 +1693,7 @@ static int ext4_journalled_write_end(struct file *file,
1606 unsigned from, to; 1693 unsigned from, to;
1607 loff_t new_i_size; 1694 loff_t new_i_size;
1608 1695
1609 trace_mark(ext4_journalled_write_end, 1696 trace_ext4_journalled_write_end(inode, pos, len, copied);
1610 "dev %s ino %lu pos %llu len %u copied %u",
1611 inode->i_sb->s_id, inode->i_ino,
1612 (unsigned long long) pos, len, copied);
1613 from = pos & (PAGE_CACHE_SIZE - 1); 1697 from = pos & (PAGE_CACHE_SIZE - 1);
1614 to = from + len; 1698 to = from + len;
1615 1699
@@ -1635,10 +1719,27 @@ static int ext4_journalled_write_end(struct file *file,
1635 } 1719 }
1636 1720
1637 unlock_page(page); 1721 unlock_page(page);
1722 page_cache_release(page);
1723 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1724 /* if we have allocated more blocks and copied
1725		 * less, we will have blocks allocated outside
1726		 * inode->i_size, so truncate them
1727 */
1728 ext4_orphan_add(handle, inode);
1729
1638 ret2 = ext4_journal_stop(handle); 1730 ret2 = ext4_journal_stop(handle);
1639 if (!ret) 1731 if (!ret)
1640 ret = ret2; 1732 ret = ret2;
1641 page_cache_release(page); 1733 if (pos + len > inode->i_size) {
1734 ext4_truncate(inode);
1735 /*
1736 * If truncate failed early the inode might still be
1737 * on the orphan list; we need to make sure the inode
1738 * is removed from the orphan list in that case.
1739 */
1740 if (inode->i_nlink)
1741 ext4_orphan_del(NULL, inode);
1742 }
1642 1743
1643 return ret ? ret : copied; 1744 return ret ? ret : copied;
1644} 1745}
@@ -1738,7 +1839,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1738} 1839}
1739 1840
1740static void ext4_da_page_release_reservation(struct page *page, 1841static void ext4_da_page_release_reservation(struct page *page,
1741 unsigned long offset) 1842 unsigned long offset)
1742{ 1843{
1743 int to_release = 0; 1844 int to_release = 0;
1744 struct buffer_head *head, *bh; 1845 struct buffer_head *head, *bh;
@@ -1852,7 +1953,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1852 * @logical - first logical block to start assignment with 1953 * @logical - first logical block to start assignment with
1853 * 1954 *
1854 * the function goes through all passed space and put actual disk 1955 * the function goes through all passed space and put actual disk
1855 * block numbers into buffer heads, dropping BH_Delay 1956 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
1856 */ 1957 */
1857static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1958static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1858 struct buffer_head *exbh) 1959 struct buffer_head *exbh)
@@ -1902,16 +2003,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1902 do { 2003 do {
1903 if (cur_logical >= logical + blocks) 2004 if (cur_logical >= logical + blocks)
1904 break; 2005 break;
1905 if (buffer_delay(bh)) { 2006
1906 bh->b_blocknr = pblock; 2007 if (buffer_delay(bh) ||
1907 clear_buffer_delay(bh); 2008 buffer_unwritten(bh)) {
1908 bh->b_bdev = inode->i_sb->s_bdev; 2009
1909 } else if (buffer_unwritten(bh)) { 2010 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
1910 bh->b_blocknr = pblock; 2011
1911 clear_buffer_unwritten(bh); 2012 if (buffer_delay(bh)) {
1912 set_buffer_mapped(bh); 2013 clear_buffer_delay(bh);
1913 set_buffer_new(bh); 2014 bh->b_blocknr = pblock;
1914 bh->b_bdev = inode->i_sb->s_bdev; 2015 } else {
2016 /*
2017				 * an unwritten buffer should already have
2018				 * a blocknr assigned; verify that
2019 */
2020 clear_buffer_unwritten(bh);
2021 BUG_ON(bh->b_blocknr != pblock);
2022 }
2023
1915 } else if (buffer_mapped(bh)) 2024 } else if (buffer_mapped(bh))
1916 BUG_ON(bh->b_blocknr != pblock); 2025 BUG_ON(bh->b_blocknr != pblock);
1917 2026
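All of the BH_Delay/BH_Unwritten handling above is bit manipulation on the buffer's b_state word; the kernel generates helpers like buffer_delay() and clear_buffer_delay() from enum bh_state_bits via the BUFFER_FNS macro. A self-contained model with illustrative bit positions (not the kernel's actual values):

    #include <stdio.h>

    /* Illustrative state bits only; the real ones live in
     * include/linux/buffer_head.h. */
    enum { BH_Mapped = 0, BH_Delay = 1, BH_Unwritten = 2 };

    static int  test_state(unsigned long s, int bit)   { return (s >> bit) & 1; }
    static void set_state(unsigned long *s, int bit)   { *s |=  (1UL << bit); }
    static void clear_state(unsigned long *s, int bit) { *s &= ~(1UL << bit); }

    int main(void)
    {
            unsigned long b_state = 0;

            /* A delayed-allocation buffer: delay set, not yet mapped. */
            set_state(&b_state, BH_Delay);

            /* What mpage_put_bnr_to_bhs() does once a real block exists:
             * drop the delay bit, after which b_blocknr can be trusted. */
            if (test_state(b_state, BH_Delay) ||
                test_state(b_state, BH_Unwritten)) {
                    clear_state(&b_state, BH_Delay);
                    set_state(&b_state, BH_Mapped);
            }
            printf("mapped=%d delay=%d\n",
                   test_state(b_state, BH_Mapped),
                   test_state(b_state, BH_Delay));
            return 0;
    }

The (1 << BH_Delay) tests against mpd->b_state in the surrounding code are the same idiom applied to an accumulated extent rather than a single buffer.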
@@ -1990,51 +2099,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1990 return; 2099 return;
1991} 2100}
1992 2101
1993#define EXT4_DELALLOC_RSVED 1
1994static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1995 struct buffer_head *bh_result, int create)
1996{
1997 int ret;
1998 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1999 loff_t disksize = EXT4_I(inode)->i_disksize;
2000 handle_t *handle = NULL;
2001
2002 handle = ext4_journal_current_handle();
2003 BUG_ON(!handle);
2004 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2005 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2006 if (ret <= 0)
2007 return ret;
2008
2009 bh_result->b_size = (ret << inode->i_blkbits);
2010
2011 if (ext4_should_order_data(inode)) {
2012 int retval;
2013 retval = ext4_jbd2_file_inode(handle, inode);
2014 if (retval)
2015 /*
2016 * Failed to add inode for ordered mode. Don't
2017 * update file size
2018 */
2019 return retval;
2020 }
2021
2022 /*
2023	 * Update on-disk size along with block allocation; we don't
2024	 * use 'extend_disksize' as the size may change within an already
2025	 * allocated block -bzzz
2026 */
2027 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2028 if (disksize > i_size_read(inode))
2029 disksize = i_size_read(inode);
2030 if (disksize > EXT4_I(inode)->i_disksize) {
2031 ext4_update_i_disksize(inode, disksize);
2032 ret = ext4_mark_inode_dirty(handle, inode);
2033 return ret;
2034 }
2035 return 0;
2036}
2037
2038/* 2102/*
2039 * mpage_da_map_blocks - go through given space 2103 * mpage_da_map_blocks - go through given space
2040 * 2104 *
@@ -2045,29 +2109,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2045 */ 2109 */
2046static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2110static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2047{ 2111{
2048 int err = 0; 2112 int err, blks, get_blocks_flags;
2049 struct buffer_head new; 2113 struct buffer_head new;
2050 sector_t next; 2114 sector_t next = mpd->b_blocknr;
2115 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2116 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2117 handle_t *handle = NULL;
2051 2118
2052 /* 2119 /*
2053 * We consider only non-mapped and non-allocated blocks 2120 * We consider only non-mapped and non-allocated blocks
2054 */ 2121 */
2055 if ((mpd->b_state & (1 << BH_Mapped)) && 2122 if ((mpd->b_state & (1 << BH_Mapped)) &&
2056 !(mpd->b_state & (1 << BH_Delay))) 2123 !(mpd->b_state & (1 << BH_Delay)) &&
2124 !(mpd->b_state & (1 << BH_Unwritten)))
2057 return 0; 2125 return 0;
2058 new.b_state = mpd->b_state; 2126
2059 new.b_blocknr = 0;
2060 new.b_size = mpd->b_size;
2061 next = mpd->b_blocknr;
2062 /* 2127 /*
2063 * If we didn't accumulate anything 2128 * If we didn't accumulate anything to write simply return
2064 * to write simply return
2065 */ 2129 */
2066 if (!new.b_size) 2130 if (!mpd->b_size)
2067 return 0; 2131 return 0;
2068 2132
2069 err = ext4_da_get_block_write(mpd->inode, next, &new, 1); 2133 handle = ext4_journal_current_handle();
2070 if (err) { 2134 BUG_ON(!handle);
2135
2136 /*
2137 * Call ext4_get_blocks() to allocate any delayed allocation
2138 * blocks, or to convert an uninitialized extent to be
2139 * initialized (in the case where we have written into
2140 * one or more preallocated blocks).
2141 *
2142 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2143 * indicate that we are on the delayed allocation path. This
2144 * affects functions in many different parts of the allocation
2145 * call path. This flag exists primarily because we don't
2146 * want to change *many* call functions, so ext4_get_blocks()
2147 * will set the magic i_delalloc_reserved_flag once the
2148 * inode's allocation semaphore is taken.
2149 *
2150	 * If the blocks in question were delalloc blocks, set
2151 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2152 * variables are updated after the blocks have been allocated.
2153 */
2154 new.b_state = 0;
2155 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
2156 EXT4_GET_BLOCKS_DELALLOC_RESERVE);
2157 if (mpd->b_state & (1 << BH_Delay))
2158 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
2159 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
2160 &new, get_blocks_flags);
2161 if (blks < 0) {
2162 err = blks;
2071 /* 2163 /*
2072 * If get block returns with error we simply 2164 * If get block returns with error we simply
2073 * return. Later writepage will redirty the page and 2165 * return. Later writepage will redirty the page and
@@ -2100,12 +2192,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2100 if (err == -ENOSPC) { 2192 if (err == -ENOSPC) {
2101 ext4_print_free_blocks(mpd->inode); 2193 ext4_print_free_blocks(mpd->inode);
2102 } 2194 }
2103 /* invlaidate all the pages */ 2195 /* invalidate all the pages */
2104 ext4_da_block_invalidatepages(mpd, next, 2196 ext4_da_block_invalidatepages(mpd, next,
2105 mpd->b_size >> mpd->inode->i_blkbits); 2197 mpd->b_size >> mpd->inode->i_blkbits);
2106 return err; 2198 return err;
2107 } 2199 }
2108 BUG_ON(new.b_size == 0); 2200 BUG_ON(blks == 0);
2201
2202 new.b_size = (blks << mpd->inode->i_blkbits);
2109 2203
2110 if (buffer_new(&new)) 2204 if (buffer_new(&new))
2111 __unmap_underlying_blocks(mpd->inode, &new); 2205 __unmap_underlying_blocks(mpd->inode, &new);
@@ -2118,6 +2212,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2118 (mpd->b_state & (1 << BH_Unwritten))) 2212 (mpd->b_state & (1 << BH_Unwritten)))
2119 mpage_put_bnr_to_bhs(mpd, next, &new); 2213 mpage_put_bnr_to_bhs(mpd, next, &new);
2120 2214
2215 if (ext4_should_order_data(mpd->inode)) {
2216 err = ext4_jbd2_file_inode(handle, mpd->inode);
2217 if (err)
2218 return err;
2219 }
2220
2221 /*
2222 * Update on-disk size along with block allocation.
2223 */
2224 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2225 if (disksize > i_size_read(mpd->inode))
2226 disksize = i_size_read(mpd->inode);
2227 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2228 ext4_update_i_disksize(mpd->inode, disksize);
2229 return ext4_mark_inode_dirty(handle, mpd->inode);
2230 }
2231
2121 return 0; 2232 return 0;
2122} 2233}
2123 2234
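mpage_da_map_blocks() moves between bytes and blocks with shifts by i_blkbits throughout. A standalone check of the conversions used above, assuming 4K blocks (blkbits = 12); the values are arbitrary examples:

    #include <stdio.h>

    int main(void)
    {
            unsigned blkbits = 12;               /* 4096-byte blocks */
            unsigned long long b_size = 16384;   /* bytes gathered in extent */

            /* max_blocks = mpd->b_size >> mpd->inode->i_blkbits */
            unsigned max_blocks = b_size >> blkbits;                /* 4 */

            /* after allocation: new.b_size = blks << i_blkbits */
            int blks = 4;
            unsigned long long bytes = (unsigned long long)blks << blkbits;

            /* disksize = ((loff_t)next + blks) << i_blkbits, clamped to
             * i_size before i_disksize is updated */
            unsigned long long next = 10;        /* first logical block */
            unsigned long long disksize = (next + blks) << blkbits;

            printf("%u blocks, %llu bytes, disksize %llu\n",
                   max_blocks, bytes, disksize);
            return 0;
    }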
@@ -2192,6 +2303,11 @@ flush_it:
2192 return; 2303 return;
2193} 2304}
2194 2305
2306static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2307{
2308 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2309}
2310
2195/* 2311/*
2196 * __mpage_da_writepage - finds extent of pages and blocks 2312 * __mpage_da_writepage - finds extent of pages and blocks
2197 * 2313 *
@@ -2274,10 +2390,9 @@ static int __mpage_da_writepage(struct page *page,
2274 * We need to try to allocate 2390 * We need to try to allocate
2275 * unmapped blocks in the same page. 2391 * unmapped blocks in the same page.
2276 * Otherwise we won't make progress 2392 * Otherwise we won't make progress
2277 * with the page in ext4_da_writepage 2393 * with the page in ext4_writepage
2278 */ 2394 */
2279 if (buffer_dirty(bh) && 2395 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2280 (!buffer_mapped(bh) || buffer_delay(bh))) {
2281 mpage_add_bh_to_extent(mpd, logical, 2396 mpage_add_bh_to_extent(mpd, logical,
2282 bh->b_size, 2397 bh->b_size,
2283 bh->b_state); 2398 bh->b_state);
@@ -2303,8 +2418,16 @@ static int __mpage_da_writepage(struct page *page,
2303} 2418}
2304 2419
2305/* 2420/*
2306 * this is a special callback for ->write_begin() only	2421 * This is a special get_block_t callback which is used by
2307 * its intention is to return mapped block or	2422 * ext4_da_write_begin(). It will either return mapped block or
2423 * reserve space for a single block.
2424 *
2425 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2426 * We also have b_blocknr = -1 and b_bdev initialized properly
2427 *
2428 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2429 * We also have b_blocknr = physical block mapping the unwritten extent and b_bdev
2430 * initialized properly.
2308 */ 2431 */
2309static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2432static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2310 struct buffer_head *bh_result, int create) 2433 struct buffer_head *bh_result, int create)
@@ -2323,7 +2446,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2323	 * preallocated blocks are unmapped but should be treated	2446	 * preallocated blocks are unmapped but should be treated
2324 * the same as allocated blocks. 2447 * the same as allocated blocks.
2325 */ 2448 */
2326 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2449 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
2327 if ((ret == 0) && !buffer_delay(bh_result)) { 2450 if ((ret == 0) && !buffer_delay(bh_result)) {
2328 /* the block isn't (pre)allocated yet, let's reserve space */ 2451 /* the block isn't (pre)allocated yet, let's reserve space */
2329 /* 2452 /*
@@ -2340,40 +2463,52 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2340 set_buffer_delay(bh_result); 2463 set_buffer_delay(bh_result);
2341 } else if (ret > 0) { 2464 } else if (ret > 0) {
2342 bh_result->b_size = (ret << inode->i_blkbits); 2465 bh_result->b_size = (ret << inode->i_blkbits);
2343 /* 2466 if (buffer_unwritten(bh_result)) {
2344 * With sub-block writes into unwritten extents 2467 /* A delayed write to unwritten bh should
2345 * we also need to mark the buffer as new so that 2468 * be marked new and mapped. Mapped ensures
2346 * the unwritten parts of the buffer gets correctly zeroed. 2469 * that we don't do get_block multiple times
2347 */ 2470 * when we write to the same offset and new
2348 if (buffer_unwritten(bh_result)) 2471 * ensures that we do proper zero out for
2472 * partial write.
2473 */
2349 set_buffer_new(bh_result); 2474 set_buffer_new(bh_result);
2475 set_buffer_mapped(bh_result);
2476 }
2350 ret = 0; 2477 ret = 0;
2351 } 2478 }
2352 2479
2353 return ret; 2480 return ret;
2354} 2481}
2355 2482
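Condensed, a get_block-style callback like the one above reports one of three outcomes: an error, a hole (which delayed allocation answers by reserving space and setting BH_Delay), or a found extent. A standalone sketch of that control flow; every type and helper here is a stand-in, not ext4 API:

    #include <stdio.h>

    struct mbh {
            unsigned long long b_size;
            int mapped, is_new, delay, unwritten;
    };

    /* lookup() stands in for ext4_get_blocks(..., create = 0):
     * < 0 error, 0 hole, > 0 contiguous blocks found. */
    static int lookup(long long iblock, struct mbh *bh)
    {
            (void)iblock; (void)bh;
            return 0;                     /* pretend we hit a hole */
    }

    static int da_get_block_model(long long iblock, struct mbh *bh,
                                  unsigned blkbits)
    {
            int ret = lookup(iblock, bh);

            if (ret < 0)
                    return ret;           /* propagate the error */
            if (ret == 0 && !bh->delay) {
                    /* hole: reserve space now, allocate at writeout */
                    bh->is_new = 1;
                    bh->delay = 1;        /* set_buffer_delay() */
                    return 0;
            }
            if (ret > 0) {
                    bh->b_size = (unsigned long long)ret << blkbits;
                    if (bh->unwritten) {
                            /* preallocated extent: new + mapped, so repeat
                             * writes skip get_block and partial writes are
                             * zeroed out correctly */
                            bh->is_new = 1;
                            bh->mapped = 1;
                    }
            }
            return 0;
    }

    int main(void)
    {
            struct mbh bh = { 0 };

            printf("ret=%d delay=%d\n",
                   da_get_block_model(5, &bh, 12), bh.delay);
            return 0;
    }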
2356static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2483/*
2357{	2484 * This function is used as a standard get_block_t callback function
2358 /* 2485 * when there is no desire to allocate any blocks. It is used as a
2359 * unmapped buffer is possible for holes. 2486 * callback function for block_prepare_write(), nobh_writepage(), and
2360 * delay buffer is possible with delayed allocation 2487 * block_write_full_page(). These functions should only try to map a
2361 */ 2488 * single block at a time.
2362 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2489 *
2363} 2490 * Since this function doesn't do block allocations even if the caller
2364 2491 * requests it by passing in create=1, it is critically important that
2365static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,	2492 * any caller checks to make sure that any buffer heads returned
2493 * by this function are either all already mapped or marked for
2494 * delayed allocation before calling nobh_writepage() or
2495 * block_write_full_page(). Otherwise, b_blocknr could be left
2496 * uninitialized, and the page write functions will be taken by
2497 * surprise.
2498 */
2499static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2366 struct buffer_head *bh_result, int create) 2500 struct buffer_head *bh_result, int create)
2367{ 2501{
2368 int ret = 0; 2502 int ret = 0;
2369 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2503 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2370 2504
2505 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2506
2371 /* 2507 /*
2372 * we don't want to do block allocation in writepage 2508 * we don't want to do block allocation in writepage
2373	 * so call get_block_wrap with create = 0	2509	 * so call ext4_get_blocks() with create = 0
2374 */ 2510 */
2375 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2511 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2376 bh_result, 0, 0, 0);
2377 if (ret > 0) { 2512 if (ret > 0) {
2378 bh_result->b_size = (ret << inode->i_blkbits); 2513 bh_result->b_size = (ret << inode->i_blkbits);
2379 ret = 0; 2514 ret = 0;
@@ -2381,14 +2516,102 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2381 return ret; 2516 return ret;
2382} 2517}
2383 2518
2519static int bget_one(handle_t *handle, struct buffer_head *bh)
2520{
2521 get_bh(bh);
2522 return 0;
2523}
2524
2525static int bput_one(handle_t *handle, struct buffer_head *bh)
2526{
2527 put_bh(bh);
2528 return 0;
2529}
2530
2531static int __ext4_journalled_writepage(struct page *page,
2532 struct writeback_control *wbc,
2533 unsigned int len)
2534{
2535 struct address_space *mapping = page->mapping;
2536 struct inode *inode = mapping->host;
2537 struct buffer_head *page_bufs;
2538 handle_t *handle = NULL;
2539 int ret = 0;
2540 int err;
2541
2542 page_bufs = page_buffers(page);
2543 BUG_ON(!page_bufs);
2544 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2545 /* As soon as we unlock the page, it can go away, but we have
2546 * references to buffers so we are safe */
2547 unlock_page(page);
2548
2549 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2550 if (IS_ERR(handle)) {
2551 ret = PTR_ERR(handle);
2552 goto out;
2553 }
2554
2555 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2556 do_journal_get_write_access);
2557
2558 err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2559 write_end_fn);
2560 if (ret == 0)
2561 ret = err;
2562 err = ext4_journal_stop(handle);
2563 if (!ret)
2564 ret = err;
2565
2566 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2567 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2568out:
2569 return ret;
2570}
2571
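walk_page_buffers(), which __ext4_journalled_writepage() uses four times, simply walks the page's circular b_this_page list and applies a callback to each buffer. A standalone model of the walk together with the bget_one/bput_one pinning bracket used above (types simplified; not the kernel's definitions):

    #include <stdio.h>

    struct bh { int refcount; struct bh *b_this_page; };

    /* Model of walk_page_buffers(): visit each buffer on the page's
     * circular list once, applying fn; stop on the first error. */
    static int walk_bufs(struct bh *head, int (*fn)(struct bh *))
    {
            struct bh *b = head;
            do {
                    int err = fn(b);
                    if (err)
                            return err;
                    b = b->b_this_page;
            } while (b != head);
            return 0;
    }

    static int bget_one(struct bh *b) { b->refcount++; return 0; }
    static int bput_one(struct bh *b) { b->refcount--; return 0; }

    int main(void)
    {
            struct bh a, b, c;

            a.b_this_page = &b; b.b_this_page = &c; c.b_this_page = &a;
            a.refcount = b.refcount = c.refcount = 1;

            walk_bufs(&a, bget_one);   /* pin buffers, then unlock the page */
            /* ...journal the buffers while the page itself may go away... */
            walk_bufs(&a, bput_one);   /* drop the extra references */

            printf("refcount back to %d\n", a.refcount);
            return 0;
    }

Pinning before unlock_page() is what makes it safe for the function above to journal the buffers after the page lock is gone.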
2384/* 2572/*
2385 * get called via ext4_da_writepages after taking page lock (have journal handle)	2573 * Note that we don't need to start a transaction unless we're journaling data
2386 * get called via journal_submit_inode_data_buffers (no journal handle)	2574 * because we should have holes filled from ext4_page_mkwrite(). We don't even
2387 * get called via shrink_page_list via pdflush (no journal handle) 2575 * need to file the inode to the transaction's list in ordered mode because if
2388 * or grab_page_cache when doing write_begin (have journal handle) 2576 * we are writing back data added by write(), the inode is already there and if
2577 * we are writing back data modified via mmap(), no one guarantees in which
2578 * transaction the data will hit the disk. In case we are journaling data, we
2579 * cannot start transaction directly because transaction start ranks above page
2580 * lock so we have to do some magic.
2581 *
2582 * This function can get called via...
2583 * - ext4_da_writepages after taking page lock (have journal handle)
2584 * - journal_submit_inode_data_buffers (no journal handle)
2585 * - shrink_page_list via pdflush (no journal handle)
2586 * - grab_page_cache when doing write_begin (have journal handle)
2587 *
2588 * We don't do any block allocation in this function. If we have a page with
2589 * multiple blocks we need to write those buffer_heads that are mapped. This
2590 * is important for mmap-based writes. So, with a 1K blocksize, if we do
2591 *	truncate(f, 1024);
2592 *	a = mmap(f, 0, 4096);
2593 *	a[0] = 'a';
2594 *	truncate(f, 4096);
2595 * we have the first buffer_head in the page mapped via the page_mkwrite
2596 * callback, but the other buffer_heads would be unmapped yet dirty (dirtied
2597 * via do_wp_page). So writepage should write the first block. If we modify
2598 * the mmap area beyond 1024 we will again get a page fault and the
2599 * page_mkwrite callback will do the block allocation and mark the
2600 * buffer_heads mapped.
2601 *
2602 * We redirty the page if the page has any buffer_heads that are either
2603 * delayed or unwritten.
2604 *
2605 * We can get recursively called as shown below.
2606 *
2607 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2608 * ext4_writepage()
2609 *
2610 * But since we don't do any block allocation we should not deadlock.
2611 * The page also has the dirty flag cleared so we don't get a recursive page_lock.
2389 */ 2612 */
2390static int ext4_da_writepage(struct page *page, 2613static int ext4_writepage(struct page *page,
2391 struct writeback_control *wbc) 2614 struct writeback_control *wbc)
2392{ 2615{
2393 int ret = 0; 2616 int ret = 0;
2394 loff_t size; 2617 loff_t size;
@@ -2396,9 +2619,7 @@ static int ext4_da_writepage(struct page *page,
2396 struct buffer_head *page_bufs; 2619 struct buffer_head *page_bufs;
2397 struct inode *inode = page->mapping->host; 2620 struct inode *inode = page->mapping->host;
2398 2621
2399 trace_mark(ext4_da_writepage, 2622 trace_ext4_writepage(inode, page);
2400 "dev %s ino %lu page_index %lu",
2401 inode->i_sb->s_id, inode->i_ino, page->index);
2402 size = i_size_read(inode); 2623 size = i_size_read(inode);
2403 if (page->index == size >> PAGE_CACHE_SHIFT) 2624 if (page->index == size >> PAGE_CACHE_SHIFT)
2404 len = size & ~PAGE_CACHE_MASK; 2625 len = size & ~PAGE_CACHE_MASK;
@@ -2408,7 +2629,7 @@ static int ext4_da_writepage(struct page *page,
2408 if (page_has_buffers(page)) { 2629 if (page_has_buffers(page)) {
2409 page_bufs = page_buffers(page); 2630 page_bufs = page_buffers(page);
2410 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2631 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2411 ext4_bh_unmapped_or_delay)) { 2632 ext4_bh_delay_or_unwritten)) {
2412 /* 2633 /*
2413 * We don't want to do block allocation 2634 * We don't want to do block allocation
2414 * So redirty the page and return 2635 * So redirty the page and return
@@ -2435,13 +2656,13 @@ static int ext4_da_writepage(struct page *page,
2435 * all are mapped and non delay. We don't want to 2656 * all are mapped and non delay. We don't want to
2436 * do block allocation here. 2657 * do block allocation here.
2437 */ 2658 */
2438 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2659 ret = block_prepare_write(page, 0, len,
2439 ext4_normal_get_block_write); 2660 noalloc_get_block_write);
2440 if (!ret) { 2661 if (!ret) {
2441 page_bufs = page_buffers(page); 2662 page_bufs = page_buffers(page);
2442 /* check whether all are mapped and non delay */ 2663 /* check whether all are mapped and non delay */
2443 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2664 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2444 ext4_bh_unmapped_or_delay)) { 2665 ext4_bh_delay_or_unwritten)) {
2445 redirty_page_for_writepage(wbc, page); 2666 redirty_page_for_writepage(wbc, page);
2446 unlock_page(page); 2667 unlock_page(page);
2447 return 0; 2668 return 0;
@@ -2457,15 +2678,23 @@ static int ext4_da_writepage(struct page *page,
2457 return 0; 2678 return 0;
2458 } 2679 }
2459 /* now mark the buffer_heads as dirty and uptodate */ 2680 /* now mark the buffer_heads as dirty and uptodate */
2460 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2681 block_commit_write(page, 0, len);
2682 }
2683
2684 if (PageChecked(page) && ext4_should_journal_data(inode)) {
2685 /*
2686 * It's mmapped pagecache. Add buffers and journal it. There
2687 * doesn't seem much point in redirtying the page here.
2688 */
2689 ClearPageChecked(page);
2690 return __ext4_journalled_writepage(page, wbc, len);
2461 } 2691 }
2462 2692
2463 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2693 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2464 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2694 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2465 else 2695 else
2466 ret = block_write_full_page(page, 2696 ret = block_write_full_page(page, noalloc_get_block_write,
2467 ext4_normal_get_block_write, 2697 wbc);
2468 wbc);
2469 2698
2470 return ret; 2699 return ret;
2471} 2700}
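The truncate/mmap sequence from the comment above can be reproduced verbatim from userspace. A complete test program, assuming a filesystem with a 1K block size so the page holds four buffer_heads (the file name is arbitrary and error handling is minimal):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
            if (fd < 0) { perror("open"); return 1; }

            ftruncate(fd, 1024);                 /* truncate(f, 1024)    */
            char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_SHARED, fd, 0);   /* a = mmap(f, 0, 4096) */
            if (a == MAP_FAILED) { perror("mmap"); return 1; }

            a[0] = 'a';              /* page_mkwrite maps only block 0     */
            ftruncate(fd, 4096);     /* rest of the page: dirty, unmapped  */

            /* Writing past offset 1024 faults again; page_mkwrite then
             * allocates and maps the remaining blocks of the page. */
            a[2048] = 'b';

            munmap(a, 4096);
            close(fd);
            return 0;
    }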
@@ -2510,19 +2739,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2510 int needed_blocks, ret = 0, nr_to_writebump = 0; 2739 int needed_blocks, ret = 0, nr_to_writebump = 0;
2511 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2512 2741
2513 trace_mark(ext4_da_writepages, 2742 trace_ext4_da_writepages(inode, wbc);
2514 "dev %s ino %lu nr_t_write %ld "
2515 "pages_skipped %ld range_start %llu "
2516 "range_end %llu nonblocking %d "
2517 "for_kupdate %d for_reclaim %d "
2518 "for_writepages %d range_cyclic %d",
2519 inode->i_sb->s_id, inode->i_ino,
2520 wbc->nr_to_write, wbc->pages_skipped,
2521 (unsigned long long) wbc->range_start,
2522 (unsigned long long) wbc->range_end,
2523 wbc->nonblocking, wbc->for_kupdate,
2524 wbc->for_reclaim, wbc->for_writepages,
2525 wbc->range_cyclic);
2526 2743
2527 /* 2744 /*
2528 * No pages to write? This is mainly a kludge to avoid starting 2745 * No pages to write? This is mainly a kludge to avoid starting
@@ -2536,13 +2753,13 @@ static int ext4_da_writepages(struct address_space *mapping,
2536 * If the filesystem has aborted, it is read-only, so return 2753 * If the filesystem has aborted, it is read-only, so return
2537 * right away instead of dumping stack traces later on that 2754 * right away instead of dumping stack traces later on that
2538 * will obscure the real source of the problem. We test 2755 * will obscure the real source of the problem. We test
2539 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because 2756 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2540 * the latter could be true if the filesystem is mounted 2757 * the latter could be true if the filesystem is mounted
2541 * read-only, and in that case, ext4_da_writepages should 2758 * read-only, and in that case, ext4_da_writepages should
2542 * *never* be called, so if that ever happens, we would want 2759 * *never* be called, so if that ever happens, we would want
2543 * the stack trace. 2760 * the stack trace.
2544 */ 2761 */
2545 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) 2762 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2546 return -EROFS; 2763 return -EROFS;
2547 2764
2548 /* 2765 /*
@@ -2688,14 +2905,7 @@ out_writepages:
2688 if (!no_nrwrite_index_update) 2905 if (!no_nrwrite_index_update)
2689 wbc->no_nrwrite_index_update = 0; 2906 wbc->no_nrwrite_index_update = 0;
2690 wbc->nr_to_write -= nr_to_writebump; 2907 wbc->nr_to_write -= nr_to_writebump;
2691 trace_mark(ext4_da_writepage_result, 2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2692 "dev %s ino %lu ret %d pages_written %d "
2693 "pages_skipped %ld congestion %d "
2694 "more_io %d no_nrwrite_index_update %d",
2695 inode->i_sb->s_id, inode->i_ino, ret,
2696 pages_written, wbc->pages_skipped,
2697 wbc->encountered_congestion, wbc->more_io,
2698 wbc->no_nrwrite_index_update);
2699 return ret; 2909 return ret;
2700} 2910}
2701 2911
@@ -2727,8 +2937,8 @@ static int ext4_nonda_switch(struct super_block *sb)
2727} 2937}
2728 2938
2729static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2939static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2730 loff_t pos, unsigned len, unsigned flags, 2940 loff_t pos, unsigned len, unsigned flags,
2731 struct page **pagep, void **fsdata) 2941 struct page **pagep, void **fsdata)
2732{ 2942{
2733 int ret, retries = 0; 2943 int ret, retries = 0;
2734 struct page *page; 2944 struct page *page;
@@ -2747,11 +2957,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2747 len, flags, pagep, fsdata); 2957 len, flags, pagep, fsdata);
2748 } 2958 }
2749 *fsdata = (void *)0; 2959 *fsdata = (void *)0;
2750 2960 trace_ext4_da_write_begin(inode, pos, len, flags);
2751 trace_mark(ext4_da_write_begin,
2752 "dev %s ino %lu pos %llu len %u flags %u",
2753 inode->i_sb->s_id, inode->i_ino,
2754 (unsigned long long) pos, len, flags);
2755retry: 2961retry:
2756 /* 2962 /*
2757 * With delayed allocation, we don't log the i_disksize update 2963 * With delayed allocation, we don't log the i_disksize update
@@ -2777,7 +2983,7 @@ retry:
2777 *pagep = page; 2983 *pagep = page;
2778 2984
2779 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2985 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2780 ext4_da_get_block_prep); 2986 ext4_da_get_block_prep);
2781 if (ret < 0) { 2987 if (ret < 0) {
2782 unlock_page(page); 2988 unlock_page(page);
2783 ext4_journal_stop(handle); 2989 ext4_journal_stop(handle);
@@ -2788,7 +2994,7 @@ retry:
2788 * i_size_read because we hold i_mutex. 2994 * i_size_read because we hold i_mutex.
2789 */ 2995 */
2790 if (pos + len > inode->i_size) 2996 if (pos + len > inode->i_size)
2791 vmtruncate(inode, inode->i_size); 2997 ext4_truncate(inode);
2792 } 2998 }
2793 2999
2794 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3000 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2802,7 +3008,7 @@ out:
2802 * when write to the end of file but not require block allocation 3008 * when write to the end of file but not require block allocation
2803 */ 3009 */
2804static int ext4_da_should_update_i_disksize(struct page *page, 3010static int ext4_da_should_update_i_disksize(struct page *page,
2805 unsigned long offset) 3011 unsigned long offset)
2806{ 3012{
2807 struct buffer_head *bh; 3013 struct buffer_head *bh;
2808 struct inode *inode = page->mapping->host; 3014 struct inode *inode = page->mapping->host;
@@ -2815,15 +3021,15 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2815 for (i = 0; i < idx; i++) 3021 for (i = 0; i < idx; i++)
2816 bh = bh->b_this_page; 3022 bh = bh->b_this_page;
2817 3023
2818 if (!buffer_mapped(bh) || (buffer_delay(bh))) 3024 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2819 return 0; 3025 return 0;
2820 return 1; 3026 return 1;
2821} 3027}
2822 3028
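The buffer lookup in ext4_da_should_update_i_disksize() is plain shift arithmetic: the byte offset inside the page, shifted right by i_blkbits, selects which buffer_head on the page's list covers it. A standalone check, using the 1K block size from the earlier example:

    #include <stdio.h>

    int main(void)
    {
            unsigned page_size = 4096;
            unsigned blkbits = 10;          /* 1K blocks, example value  */
            unsigned long offset = 3000;    /* byte offset within page   */

            /* idx counts how many b_this_page hops the loop above makes */
            unsigned idx = offset >> blkbits;             /* 3000 >> 10 = 2 */
            unsigned bufs_per_page = page_size >> blkbits;            /* 4 */

            printf("buffer %u of %u\n", idx, bufs_per_page);
            return 0;
    }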
2823static int ext4_da_write_end(struct file *file, 3029static int ext4_da_write_end(struct file *file,
2824 struct address_space *mapping, 3030 struct address_space *mapping,
2825 loff_t pos, unsigned len, unsigned copied, 3031 loff_t pos, unsigned len, unsigned copied,
2826 struct page *page, void *fsdata) 3032 struct page *page, void *fsdata)
2827{ 3033{
2828 struct inode *inode = mapping->host; 3034 struct inode *inode = mapping->host;
2829 int ret = 0, ret2; 3035 int ret = 0, ret2;
@@ -2844,10 +3050,7 @@ static int ext4_da_write_end(struct file *file,
2844 } 3050 }
2845 } 3051 }
2846 3052
2847 trace_mark(ext4_da_write_end, 3053 trace_ext4_da_write_end(inode, pos, len, copied);
2848 "dev %s ino %lu pos %llu len %u copied %u",
2849 inode->i_sb->s_id, inode->i_ino,
2850 (unsigned long long) pos, len, copied);
2851 start = pos & (PAGE_CACHE_SIZE - 1); 3054 start = pos & (PAGE_CACHE_SIZE - 1);
2852 end = start + copied - 1; 3055 end = start + copied - 1;
2853 3056
@@ -2924,7 +3127,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2924 * not strictly speaking necessary (and for users of 3127 * not strictly speaking necessary (and for users of
2925 * laptop_mode, not even desirable). However, to do otherwise 3128 * laptop_mode, not even desirable). However, to do otherwise
2926 * would require replicating code paths in: 3129 * would require replicating code paths in:
2927 * 3130 *
2928 * ext4_da_writepages() -> 3131 * ext4_da_writepages() ->
2929 * write_cache_pages() ---> (via passed in callback function) 3132 * write_cache_pages() ---> (via passed in callback function)
2930 * __mpage_da_writepage() --> 3133 * __mpage_da_writepage() -->
@@ -2944,7 +3147,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2944 * write out the pages, but rather only collect contiguous 3147 * write out the pages, but rather only collect contiguous
2945 * logical block extents, call the multi-block allocator, and 3148 * logical block extents, call the multi-block allocator, and
2946 * then update the buffer heads with the block allocations. 3149 * then update the buffer heads with the block allocations.
2947 * 3150 *
2948 * For now, though, we'll cheat by calling filemap_flush(), 3151 * For now, though, we'll cheat by calling filemap_flush(),
2949 * which will map the blocks, and start the I/O, but not 3152 * which will map the blocks, and start the I/O, but not
2950 * actually wait for the I/O to complete. 3153 * actually wait for the I/O to complete.
@@ -3014,229 +3217,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3014 return generic_block_bmap(mapping, block, ext4_get_block); 3217 return generic_block_bmap(mapping, block, ext4_get_block);
3015} 3218}
3016 3219
3017static int bget_one(handle_t *handle, struct buffer_head *bh)
3018{
3019 get_bh(bh);
3020 return 0;
3021}
3022
3023static int bput_one(handle_t *handle, struct buffer_head *bh)
3024{
3025 put_bh(bh);
3026 return 0;
3027}
3028
3029/*
3030 * Note that we don't need to start a transaction unless we're journaling data
3031 * because we should have holes filled from ext4_page_mkwrite(). We don't even
3032 * need to file the inode to the transaction's list in ordered mode because if
3033 * we are writing back data added by write(), the inode is already there and if
3034 * we are writing back data modified via mmap(), no one guarantees in which
3035 * transaction the data will hit the disk. In case we are journaling data, we
3036 * cannot start transaction directly because transaction start ranks above page
3037 * lock so we have to do some magic.
3038 *
3039 * In all journaling modes block_write_full_page() will start the I/O.
3040 *
3041 * Problem:
3042 *
3043 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
3044 * ext4_writepage()
3045 *
3046 * Similar for:
3047 *
3048 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
3049 *
3050 * Same applies to ext4_get_block(). We will deadlock on various things like
3051 * lock_journal and i_data_sem
3052 *
3053 * Setting PF_MEMALLOC here doesn't work - too many internal memory
3054 * allocations fail.
3055 *
3056 * 16May01: If we're reentered then journal_current_handle() will be
3057 * non-zero. We simply *return*.
3058 *
3059 * 1 July 2001: @@@ FIXME:
3060 * In journalled data mode, a data buffer may be metadata against the
3061 * current transaction. But the same file is part of a shared mapping
3062 * and someone does a writepage() on it.
3063 *
3064 * We will move the buffer onto the async_data list, but *after* it has
3065 * been dirtied. So there's a small window where we have dirty data on
3066 * BJ_Metadata.
3067 *
3068 * Note that this only applies to the last partial page in the file. The
3069 * bit which block_write_full_page() uses prepare/commit for. (That's
3070 * broken code anyway: it's wrong for msync()).
3071 *
3072 * It's a rare case: affects the final partial page, for journalled data
3073 * where the file is subject to both write() and writepage() in the same
3074 * transaction. To fix it we'll need a custom block_write_full_page().
3075 * We'll probably need that anyway for journalling writepage() output.
3076 *
3077 * We don't honour synchronous mounts for writepage(). That would be
3078 * disastrous. Any write() or metadata operation will sync the fs for
3079 * us.
3080 *
3081 */
3082static int __ext4_normal_writepage(struct page *page,
3083 struct writeback_control *wbc)
3084{
3085 struct inode *inode = page->mapping->host;
3086
3087 if (test_opt(inode->i_sb, NOBH))
3088 return nobh_writepage(page,
3089 ext4_normal_get_block_write, wbc);
3090 else
3091 return block_write_full_page(page,
3092 ext4_normal_get_block_write,
3093 wbc);
3094}
3095
3096static int ext4_normal_writepage(struct page *page,
3097 struct writeback_control *wbc)
3098{
3099 struct inode *inode = page->mapping->host;
3100 loff_t size = i_size_read(inode);
3101 loff_t len;
3102
3103 trace_mark(ext4_normal_writepage,
3104 "dev %s ino %lu page_index %lu",
3105 inode->i_sb->s_id, inode->i_ino, page->index);
3106 J_ASSERT(PageLocked(page));
3107 if (page->index == size >> PAGE_CACHE_SHIFT)
3108 len = size & ~PAGE_CACHE_MASK;
3109 else
3110 len = PAGE_CACHE_SIZE;
3111
3112 if (page_has_buffers(page)) {
3113		/* if the page has buffers they should all be mapped
3114		 * and allocated. If there are no buffers attached
3115 * to the page we know the page is dirty but it lost
3116 * buffers. That means that at some moment in time
3117 * after write_begin() / write_end() has been called
3118 * all buffers have been clean and thus they must have been
3119 * written at least once. So they are all mapped and we can
3120 * happily proceed with mapping them and writing the page.
3121 */
3122 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3123 ext4_bh_unmapped_or_delay));
3124 }
3125
3126 if (!ext4_journal_current_handle())
3127 return __ext4_normal_writepage(page, wbc);
3128
3129 redirty_page_for_writepage(wbc, page);
3130 unlock_page(page);
3131 return 0;
3132}
3133
3134static int __ext4_journalled_writepage(struct page *page,
3135 struct writeback_control *wbc)
3136{
3137 struct address_space *mapping = page->mapping;
3138 struct inode *inode = mapping->host;
3139 struct buffer_head *page_bufs;
3140 handle_t *handle = NULL;
3141 int ret = 0;
3142 int err;
3143
3144 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3145 ext4_normal_get_block_write);
3146 if (ret != 0)
3147 goto out_unlock;
3148
3149 page_bufs = page_buffers(page);
3150 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
3151 bget_one);
3152 /* As soon as we unlock the page, it can go away, but we have
3153 * references to buffers so we are safe */
3154 unlock_page(page);
3155
3156 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
3157 if (IS_ERR(handle)) {
3158 ret = PTR_ERR(handle);
3159 goto out;
3160 }
3161
3162 ret = walk_page_buffers(handle, page_bufs, 0,
3163 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
3164
3165 err = walk_page_buffers(handle, page_bufs, 0,
3166 PAGE_CACHE_SIZE, NULL, write_end_fn);
3167 if (ret == 0)
3168 ret = err;
3169 err = ext4_journal_stop(handle);
3170 if (!ret)
3171 ret = err;
3172
3173 walk_page_buffers(handle, page_bufs, 0,
3174 PAGE_CACHE_SIZE, NULL, bput_one);
3175 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
3176 goto out;
3177
3178out_unlock:
3179 unlock_page(page);
3180out:
3181 return ret;
3182}
3183
3184static int ext4_journalled_writepage(struct page *page,
3185 struct writeback_control *wbc)
3186{
3187 struct inode *inode = page->mapping->host;
3188 loff_t size = i_size_read(inode);
3189 loff_t len;
3190
3191 trace_mark(ext4_journalled_writepage,
3192 "dev %s ino %lu page_index %lu",
3193 inode->i_sb->s_id, inode->i_ino, page->index);
3194 J_ASSERT(PageLocked(page));
3195 if (page->index == size >> PAGE_CACHE_SHIFT)
3196 len = size & ~PAGE_CACHE_MASK;
3197 else
3198 len = PAGE_CACHE_SIZE;
3199
3200 if (page_has_buffers(page)) {
3201		/* if the page has buffers they should all be mapped
3202		 * and allocated. If there are no buffers attached
3203 * to the page we know the page is dirty but it lost
3204 * buffers. That means that at some moment in time
3205 * after write_begin() / write_end() has been called
3206 * all buffers have been clean and thus they must have been
3207 * written at least once. So they are all mapped and we can
3208 * happily proceed with mapping them and writing the page.
3209 */
3210 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3211 ext4_bh_unmapped_or_delay));
3212 }
3213
3214 if (ext4_journal_current_handle())
3215 goto no_write;
3216
3217 if (PageChecked(page)) {
3218 /*
3219 * It's mmapped pagecache. Add buffers and journal it. There
3220 * doesn't seem much point in redirtying the page here.
3221 */
3222 ClearPageChecked(page);
3223 return __ext4_journalled_writepage(page, wbc);
3224 } else {
3225 /*
3226 * It may be a page full of checkpoint-mode buffers. We don't
3227 * really know unless we go poke around in the buffer_heads.
3228 * But block_write_full_page will do the right thing.
3229 */
3230 return block_write_full_page(page,
3231 ext4_normal_get_block_write,
3232 wbc);
3233 }
3234no_write:
3235 redirty_page_for_writepage(wbc, page);
3236 unlock_page(page);
3237 return 0;
3238}
3239
3240static int ext4_readpage(struct file *file, struct page *page) 3220static int ext4_readpage(struct file *file, struct page *page)
3241{ 3221{
3242 return mpage_readpage(page, ext4_get_block); 3222 return mpage_readpage(page, ext4_get_block);
@@ -3288,8 +3268,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3288 * VFS code falls back into buffered path in that case so we are safe. 3268 * VFS code falls back into buffered path in that case so we are safe.
3289 */ 3269 */
3290static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3270static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 const struct iovec *iov, loff_t offset, 3271 const struct iovec *iov, loff_t offset,
3292 unsigned long nr_segs) 3272 unsigned long nr_segs)
3293{ 3273{
3294 struct file *file = iocb->ki_filp; 3274 struct file *file = iocb->ki_filp;
3295 struct inode *inode = file->f_mapping->host; 3275 struct inode *inode = file->f_mapping->host;
@@ -3383,7 +3363,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
3383static const struct address_space_operations ext4_ordered_aops = { 3363static const struct address_space_operations ext4_ordered_aops = {
3384 .readpage = ext4_readpage, 3364 .readpage = ext4_readpage,
3385 .readpages = ext4_readpages, 3365 .readpages = ext4_readpages,
3386 .writepage = ext4_normal_writepage, 3366 .writepage = ext4_writepage,
3387 .sync_page = block_sync_page, 3367 .sync_page = block_sync_page,
3388 .write_begin = ext4_write_begin, 3368 .write_begin = ext4_write_begin,
3389 .write_end = ext4_ordered_write_end, 3369 .write_end = ext4_ordered_write_end,
@@ -3398,7 +3378,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3398static const struct address_space_operations ext4_writeback_aops = { 3378static const struct address_space_operations ext4_writeback_aops = {
3399 .readpage = ext4_readpage, 3379 .readpage = ext4_readpage,
3400 .readpages = ext4_readpages, 3380 .readpages = ext4_readpages,
3401 .writepage = ext4_normal_writepage, 3381 .writepage = ext4_writepage,
3402 .sync_page = block_sync_page, 3382 .sync_page = block_sync_page,
3403 .write_begin = ext4_write_begin, 3383 .write_begin = ext4_write_begin,
3404 .write_end = ext4_writeback_write_end, 3384 .write_end = ext4_writeback_write_end,
@@ -3413,7 +3393,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3413static const struct address_space_operations ext4_journalled_aops = { 3393static const struct address_space_operations ext4_journalled_aops = {
3414 .readpage = ext4_readpage, 3394 .readpage = ext4_readpage,
3415 .readpages = ext4_readpages, 3395 .readpages = ext4_readpages,
3416 .writepage = ext4_journalled_writepage, 3396 .writepage = ext4_writepage,
3417 .sync_page = block_sync_page, 3397 .sync_page = block_sync_page,
3418 .write_begin = ext4_write_begin, 3398 .write_begin = ext4_write_begin,
3419 .write_end = ext4_journalled_write_end, 3399 .write_end = ext4_journalled_write_end,
@@ -3427,7 +3407,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3427static const struct address_space_operations ext4_da_aops = { 3407static const struct address_space_operations ext4_da_aops = {
3428 .readpage = ext4_readpage, 3408 .readpage = ext4_readpage,
3429 .readpages = ext4_readpages, 3409 .readpages = ext4_readpages,
3430 .writepage = ext4_da_writepage, 3410 .writepage = ext4_writepage,
3431 .writepages = ext4_da_writepages, 3411 .writepages = ext4_da_writepages,
3432 .sync_page = block_sync_page, 3412 .sync_page = block_sync_page,
3433 .write_begin = ext4_da_write_begin, 3413 .write_begin = ext4_da_write_begin,
@@ -3474,7 +3454,8 @@ int ext4_block_truncate_page(handle_t *handle,
3474 struct page *page; 3454 struct page *page;
3475 int err = 0; 3455 int err = 0;
3476 3456
3477 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3457 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3458 mapping_gfp_mask(mapping) & ~__GFP_FS);
3478 if (!page) 3459 if (!page)
3479 return -EINVAL; 3460 return -EINVAL;
3480 3461
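The point of switching from grab_cache_page() to find_or_create_page() above is the gfp mask: masking out __GFP_FS keeps the page allocation from recursing back into filesystem reclaim while a transaction may be held. A sketch of the masking idiom with made-up flag values (the real bits live in include/linux/gfp.h):

    #include <stdio.h>

    /* Stand-ins for the allocation flags, illustrative values only. */
    #define MODEL_GFP_IO (1u << 0)  /* allocation may start low-level I/O */
    #define MODEL_GFP_FS (1u << 1)  /* allocation may re-enter fs code    */

    int main(void)
    {
            /* what mapping_gfp_mask(mapping) might hand back */
            unsigned mask = MODEL_GFP_IO | MODEL_GFP_FS;

            /* find_or_create_page(..., mask & ~__GFP_FS): reclaim for
             * this allocation may still do I/O, but must not call back
             * into the filesystem. */
            unsigned safe = mask & ~MODEL_GFP_FS;

            printf("%#x -> %#x\n", mask, safe);
            return 0;
    }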
@@ -3609,7 +3590,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
3609 * (no partially truncated stuff there). */ 3590 * (no partially truncated stuff there). */
3610 3591
3611static Indirect *ext4_find_shared(struct inode *inode, int depth, 3592static Indirect *ext4_find_shared(struct inode *inode, int depth,
3612 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3593 ext4_lblk_t offsets[4], Indirect chain[4],
3594 __le32 *top)
3613{ 3595{
3614 Indirect *partial, *p; 3596 Indirect *partial, *p;
3615 int k, err; 3597 int k, err;
@@ -3665,8 +3647,10 @@ no_top:
3665 * than `count' because there can be holes in there. 3647 * than `count' because there can be holes in there.
3666 */ 3648 */
3667static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3649static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3668 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3650 struct buffer_head *bh,
3669 unsigned long count, __le32 *first, __le32 *last) 3651 ext4_fsblk_t block_to_free,
3652 unsigned long count, __le32 *first,
3653 __le32 *last)
3670{ 3654{
3671 __le32 *p; 3655 __le32 *p;
3672 if (try_to_extend_transaction(handle, inode)) { 3656 if (try_to_extend_transaction(handle, inode)) {
@@ -3683,10 +3667,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3683 } 3667 }
3684 3668
3685 /* 3669 /*
3686 * Any buffers which are on the journal will be in memory. We find 3670 * Any buffers which are on the journal will be in memory. We
3687 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3671 * find them on the hash table so jbd2_journal_revoke() will
3688 * on them. We've already detached each block from the file, so 3672 * run jbd2_journal_forget() on them. We've already detached
3689 * bforget() in jbd2_journal_forget() should be safe. 3673 * each block from the file, so bforget() in
3674 * jbd2_journal_forget() should be safe.
3690 * 3675 *
3691 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3676 * AKPM: turn on bforget in jbd2_journal_forget()!!!
3692 */ 3677 */
@@ -3973,7 +3958,8 @@ void ext4_truncate(struct inode *inode)
3973 if (!ext4_can_truncate(inode)) 3958 if (!ext4_can_truncate(inode))
3974 return; 3959 return;
3975 3960
3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3961 if (ei->i_disksize && inode->i_size == 0 &&
3962 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 3963 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3978 3964
3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3965 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4057,7 +4043,7 @@ void ext4_truncate(struct inode *inode)
4057 (__le32*)partial->bh->b_data+addr_per_block, 4043 (__le32*)partial->bh->b_data+addr_per_block,
4058 (chain+n-1) - partial); 4044 (chain+n-1) - partial);
4059 BUFFER_TRACE(partial->bh, "call brelse"); 4045 BUFFER_TRACE(partial->bh, "call brelse");
4060 brelse (partial->bh); 4046 brelse(partial->bh);
4061 partial--; 4047 partial--;
4062 } 4048 }
4063do_indirects: 4049do_indirects:
@@ -4298,8 +4284,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
4298 if (flags & S_DIRSYNC) 4284 if (flags & S_DIRSYNC)
4299 ei->i_flags |= EXT4_DIRSYNC_FL; 4285 ei->i_flags |= EXT4_DIRSYNC_FL;
4300} 4286}
4287
4301static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4288static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4302 struct ext4_inode_info *ei) 4289 struct ext4_inode_info *ei)
4303{ 4290{
4304 blkcnt_t i_blocks ; 4291 blkcnt_t i_blocks ;
4305 struct inode *inode = &(ei->vfs_inode); 4292 struct inode *inode = &(ei->vfs_inode);
@@ -4338,10 +4325,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4338 return inode; 4325 return inode;
4339 4326
4340 ei = EXT4_I(inode); 4327 ei = EXT4_I(inode);
4341#ifdef CONFIG_EXT4_FS_POSIX_ACL
4342 ei->i_acl = EXT4_ACL_NOT_CACHED;
4343 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
4344#endif
4345 4328
4346 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4329 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4347 if (ret < 0) 4330 if (ret < 0)
@@ -4414,7 +4397,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4414 EXT4_GOOD_OLD_INODE_SIZE + 4397 EXT4_GOOD_OLD_INODE_SIZE +
4415 ei->i_extra_isize; 4398 ei->i_extra_isize;
4416 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4399 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4417 ei->i_state |= EXT4_STATE_XATTR; 4400 ei->i_state |= EXT4_STATE_XATTR;
4418 } 4401 }
4419 } else 4402 } else
4420 ei->i_extra_isize = 0; 4403 ei->i_extra_isize = 0;
@@ -4433,7 +4416,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4433 4416
4434 ret = 0; 4417 ret = 0;
4435 if (ei->i_file_acl && 4418 if (ei->i_file_acl &&
4436 ((ei->i_file_acl < 4419 ((ei->i_file_acl <
4437 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 4420 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4438 EXT4_SB(sb)->s_gdb_count)) || 4421 EXT4_SB(sb)->s_gdb_count)) ||
4439 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { 4422 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
@@ -4448,15 +4431,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4448 !ext4_inode_is_fast_symlink(inode))) 4431 !ext4_inode_is_fast_symlink(inode)))
4449 /* Validate extent which is part of inode */ 4432 /* Validate extent which is part of inode */
4450 ret = ext4_ext_check_inode(inode); 4433 ret = ext4_ext_check_inode(inode);
4451 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4434 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4452 (S_ISLNK(inode->i_mode) && 4435 (S_ISLNK(inode->i_mode) &&
4453 !ext4_inode_is_fast_symlink(inode))) { 4436 !ext4_inode_is_fast_symlink(inode))) {
4454 /* Validate block references which are part of inode */ 4437 /* Validate block references which are part of inode */
4455 ret = ext4_check_inode_blockref(inode); 4438 ret = ext4_check_inode_blockref(inode);
4456 } 4439 }
4457 if (ret) { 4440 if (ret) {
4458 brelse(bh); 4441 brelse(bh);
4459 goto bad_inode; 4442 goto bad_inode;
4460 } 4443 }
4461 4444
4462 if (S_ISREG(inode->i_mode)) { 4445 if (S_ISREG(inode->i_mode)) {
@@ -4487,7 +4470,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4487 } else { 4470 } else {
4488 brelse(bh); 4471 brelse(bh);
4489 ret = -EIO; 4472 ret = -EIO;
4490 ext4_error(inode->i_sb, __func__, 4473 ext4_error(inode->i_sb, __func__,
4491 "bogus i_mode (%o) for inode=%lu", 4474 "bogus i_mode (%o) for inode=%lu",
4492 inode->i_mode, inode->i_ino); 4475 inode->i_mode, inode->i_ino);
4493 goto bad_inode; 4476 goto bad_inode;
@@ -4640,8 +4623,9 @@ static int ext4_do_update_inode(handle_t *handle,
4640 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4623 cpu_to_le32(new_encode_dev(inode->i_rdev));
4641 raw_inode->i_block[2] = 0; 4624 raw_inode->i_block[2] = 0;
4642 } 4625 }
4643 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 4626 } else
4644 raw_inode->i_block[block] = ei->i_data[block]; 4627 for (block = 0; block < EXT4_N_BLOCKS; block++)
4628 raw_inode->i_block[block] = ei->i_data[block];
4645 4629
4646 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4630 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4647 if (ei->i_extra_isize) { 4631 if (ei->i_extra_isize) {
@@ -4715,25 +4699,6 @@ int ext4_write_inode(struct inode *inode, int wait)
4715 return ext4_force_commit(inode->i_sb); 4699 return ext4_force_commit(inode->i_sb);
4716} 4700}
4717 4701
4718int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4719{
4720 int err = 0;
4721
4722 mark_buffer_dirty(bh);
4723 if (inode && inode_needs_sync(inode)) {
4724 sync_dirty_buffer(bh);
4725 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4726 ext4_error(inode->i_sb, __func__,
4727 "IO error syncing inode, "
4728 "inode=%lu, block=%llu",
4729 inode->i_ino,
4730 (unsigned long long)bh->b_blocknr);
4731 err = -EIO;
4732 }
4733 }
4734 return err;
4735}
4736
4737/* 4702/*
4738 * ext4_setattr() 4703 * ext4_setattr()
4739 * 4704 *
@@ -4930,7 +4895,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4930 */ 4895 */
4931int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4896int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4932{ 4897{
4933 int groups, gdpblocks; 4898 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4899 int gdpblocks;
4934 int idxblocks; 4900 int idxblocks;
4935 int ret = 0; 4901 int ret = 0;
4936 4902
@@ -4957,8 +4923,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4957 groups += nrblocks; 4923 groups += nrblocks;
4958 4924
4959 gdpblocks = groups; 4925 gdpblocks = groups;
4960 if (groups > EXT4_SB(inode->i_sb)->s_groups_count) 4926 if (groups > ngroups)
4961 groups = EXT4_SB(inode->i_sb)->s_groups_count; 4927 groups = ngroups;
4962 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 4928 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4963 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 4929 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4964 4930
@@ -4998,7 +4964,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4998 * Calculate the journal credits for a chunk of data modification. 4964 * Calculate the journal credits for a chunk of data modification.
4999 * 4965 *
5000 * This is called from DIO, fallocate or whoever calling 4966 * This is called from DIO, fallocate or whoever calling
5001 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks. 4967 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
5002 * 4968 *
5003 * journal buffers for data blocks are not included here, as DIO 4969 * journal buffers for data blocks are not included here, as DIO
5004 * and fallocate do not need to journal data buffers. 4970 * and fallocate do not need to journal data buffers.
@@ -5013,7 +4979,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5013 * Given this, we know that the caller already has write access to iloc->bh. 4979 * Given this, we know that the caller already has write access to iloc->bh.
5014 */ 4980 */
5015int ext4_mark_iloc_dirty(handle_t *handle, 4981int ext4_mark_iloc_dirty(handle_t *handle,
5016 struct inode *inode, struct ext4_iloc *iloc) 4982 struct inode *inode, struct ext4_iloc *iloc)
5017{ 4983{
5018 int err = 0; 4984 int err = 0;
5019 4985
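A recurring conversion in the inode.c hunks above replaces direct reads of EXT4_SB(sb)->s_groups_count with ext4_get_groups_count(sb), so each function samples the group count exactly once. A minimal sketch of the accessor this assumes, per the ext4.h side of this series (the read barrier pairs with the write barrier online resize issues after publishing the enlarged count):

static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;

	/* paired with the smp_wmb() in the online-resize path */
	smp_rmb();
	return ngroups;
}

Sampling the count once also keeps the mballoc.c loops converted below self-consistent if a resize lands mid-scan.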
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 91e75f7a9e73..7050a9cd04a4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -12,8 +12,8 @@
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/compat.h> 14#include <linux/compat.h>
15#include <linux/smp_lock.h>
16#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/file.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
@@ -191,7 +191,7 @@ setversion_out:
191 case EXT4_IOC_GROUP_EXTEND: { 191 case EXT4_IOC_GROUP_EXTEND: {
192 ext4_fsblk_t n_blocks_count; 192 ext4_fsblk_t n_blocks_count;
193 struct super_block *sb = inode->i_sb; 193 struct super_block *sb = inode->i_sb;
194 int err, err2; 194 int err, err2=0;
195 195
196 if (!capable(CAP_SYS_RESOURCE)) 196 if (!capable(CAP_SYS_RESOURCE))
197 return -EPERM; 197 return -EPERM;
@@ -204,19 +204,56 @@ setversion_out:
204 return err; 204 return err;
205 205
206 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 206 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
207 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 207 if (EXT4_SB(sb)->s_journal) {
208 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 208 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
209 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 209 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
210 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
211 }
210 if (err == 0) 212 if (err == 0)
211 err = err2; 213 err = err2;
212 mnt_drop_write(filp->f_path.mnt); 214 mnt_drop_write(filp->f_path.mnt);
213 215
214 return err; 216 return err;
215 } 217 }
218
219 case EXT4_IOC_MOVE_EXT: {
220 struct move_extent me;
221 struct file *donor_filp;
222 int err;
223
224 if (copy_from_user(&me,
225 (struct move_extent __user *)arg, sizeof(me)))
226 return -EFAULT;
227
228 donor_filp = fget(me.donor_fd);
229 if (!donor_filp)
230 return -EBADF;
231
232 if (!capable(CAP_DAC_OVERRIDE)) {
233 if ((current->real_cred->fsuid != inode->i_uid) ||
234 !(inode->i_mode & S_IRUSR) ||
235 !(donor_filp->f_dentry->d_inode->i_mode &
236 S_IRUSR)) {
237 fput(donor_filp);
238 return -EACCES;
239 }
240 }
241
242 err = ext4_move_extents(filp, donor_filp, me.orig_start,
243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp);
245
246 if (!err)
247 if (copy_to_user((struct move_extent *)arg,
248 &me, sizeof(me)))
249 return -EFAULT;
250 return err;
251 }
252
216 case EXT4_IOC_GROUP_ADD: { 253 case EXT4_IOC_GROUP_ADD: {
217 struct ext4_new_group_data input; 254 struct ext4_new_group_data input;
218 struct super_block *sb = inode->i_sb; 255 struct super_block *sb = inode->i_sb;
219 int err, err2; 256 int err, err2=0;
220 257
221 if (!capable(CAP_SYS_RESOURCE)) 258 if (!capable(CAP_SYS_RESOURCE))
222 return -EPERM; 259 return -EPERM;
@@ -230,9 +267,11 @@ setversion_out:
230 return err; 267 return err;
231 268
232 err = ext4_group_add(sb, &input); 269 err = ext4_group_add(sb, &input);
233 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 270 if (EXT4_SB(sb)->s_journal) {
234 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 271 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
235 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 272 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
273 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
274 }
236 if (err == 0) 275 if (err == 0)
237 err = err2; 276 err = err2;
238 mnt_drop_write(filp->f_path.mnt); 277 mnt_drop_write(filp->f_path.mnt);
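For context on the EXT4_IOC_MOVE_EXT case added above, a hedged userspace sketch of driving it. The structure below mirrors only the fields the handler dereferences (donor_fd, orig_start, donor_start, len, moved_len) plus an assumed padding word; a real program should copy the authoritative definitions from the kernel's fs/ext4/ext4.h instead:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/types.h>

/* illustrative copy; authoritative layout lives in fs/ext4/ext4.h */
struct move_extent {
	__u32 reserved;		/* assumed alignment pad, should be zero */
	__u32 donor_fd;		/* fd of the donor file */
	__u64 orig_start;	/* logical start block in the original file */
	__u64 donor_start;	/* logical start block in the donor file */
	__u64 len;		/* number of blocks to exchange */
	__u64 moved_len;	/* filled in by the kernel */
};

#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

int main(int argc, char **argv)
{
	struct move_extent me = { 0 };
	int orig_fd, donor_fd;

	if (argc != 3)
		return 1;
	orig_fd = open(argv[1], O_RDWR);
	donor_fd = open(argv[2], O_WRONLY);
	if (orig_fd < 0 || donor_fd < 0)
		return 1;

	me.donor_fd = donor_fd;
	me.orig_start = 0;
	me.donor_start = 0;
	me.len = 1024;		/* hypothetical length, in fs blocks */

	if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
		perror("EXT4_IOC_MOVE_EXT");
	else
		printf("moved %llu blocks\n",
		       (unsigned long long)me.moved_len);
	return 0;
}

Note the permission model visible in the handler: without CAP_DAC_OVERRIDE the caller's fsuid must own the original inode and both files must be owner-readable.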
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a7984..cd258463e2a9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,8 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <trace/events/ext4.h>
26
25/* 27/*
26 * MUSTDO: 28 * MUSTDO:
27 * - test ext4_ext_search_left() and ext4_ext_search_right() 29 * - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
340 ext4_group_t group); 342 ext4_group_t group);
341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 343static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
342 344
343
344
345static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 345static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
346{ 346{
347#if BITS_PER_LONG == 64 347#if BITS_PER_LONG == 64
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
372 ext4_set_bit(bit, addr); 372 ext4_set_bit(bit, addr);
373} 373}
374 374
375static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
376{
377 addr = mb_correct_addr_and_bit(&bit, addr);
378 ext4_set_bit_atomic(lock, bit, addr);
379}
380
381static inline void mb_clear_bit(int bit, void *addr) 375static inline void mb_clear_bit(int bit, void *addr)
382{ 376{
383 addr = mb_correct_addr_and_bit(&bit, addr); 377 addr = mb_correct_addr_and_bit(&bit, addr);
384 ext4_clear_bit(bit, addr); 378 ext4_clear_bit(bit, addr);
385} 379}
386 380
387static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
388{
389 addr = mb_correct_addr_and_bit(&bit, addr);
390 ext4_clear_bit_atomic(lock, bit, addr);
391}
392
393static inline int mb_find_next_zero_bit(void *addr, int max, int start) 381static inline int mb_find_next_zero_bit(void *addr, int max, int start)
394{ 382{
395 int fix = 0, ret, tmpmax; 383 int fix = 0, ret, tmpmax;
@@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
448 436
449 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 437 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
450 return; 438 return;
451 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 439 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
452 for (i = 0; i < count; i++) { 440 for (i = 0; i < count; i++) {
453 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 441 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
454 ext4_fsblk_t blocknr; 442 ext4_fsblk_t blocknr;
@@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
472 460
473 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 461 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
474 return; 462 return;
475 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 463 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
476 for (i = 0; i < count; i++) { 464 for (i = 0; i < count; i++) {
477 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 465 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
478 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 466 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
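The assert_spin_locked() conversions above assume the per-group lock is now reachable as a plain spinlock pointer. A sketch of the helpers this relies on, as defined on the ext4.h side of this series (bgl_lock_ptr() comes from <linux/blockgroup_lock.h>):

static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
					      ext4_group_t group)
{
	/* hashed per-group spinlock shared via the blockgroup_lock */
	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}

static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
	spin_lock(ext4_group_lock_ptr(sb, group));
}

static inline void ext4_unlock_group(struct super_block *sb,
				     ext4_group_t group)
{
	spin_unlock(ext4_group_lock_ptr(sb, group));
}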
@@ -669,7 +657,8 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
669 } 657 }
670} 658}
671 659
672static void ext4_mb_generate_buddy(struct super_block *sb, 660static noinline_for_stack
661void ext4_mb_generate_buddy(struct super_block *sb,
673 void *buddy, void *bitmap, ext4_group_t group) 662 void *buddy, void *bitmap, ext4_group_t group)
674{ 663{
675 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 664 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -739,6 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
739 728
740static int ext4_mb_init_cache(struct page *page, char *incore) 729static int ext4_mb_init_cache(struct page *page, char *incore)
741{ 730{
731 ext4_group_t ngroups;
742 int blocksize; 732 int blocksize;
743 int blocks_per_page; 733 int blocks_per_page;
744 int groups_per_page; 734 int groups_per_page;
@@ -757,6 +747,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
757 747
758 inode = page->mapping->host; 748 inode = page->mapping->host;
759 sb = inode->i_sb; 749 sb = inode->i_sb;
750 ngroups = ext4_get_groups_count(sb);
760 blocksize = 1 << inode->i_blkbits; 751 blocksize = 1 << inode->i_blkbits;
761 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 752 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
762 753
@@ -780,7 +771,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
780 for (i = 0; i < groups_per_page; i++) { 771 for (i = 0; i < groups_per_page; i++) {
781 struct ext4_group_desc *desc; 772 struct ext4_group_desc *desc;
782 773
783 if (first_group + i >= EXT4_SB(sb)->s_groups_count) 774 if (first_group + i >= ngroups)
784 break; 775 break;
785 776
786 err = -EIO; 777 err = -EIO;
@@ -801,17 +792,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
801 unlock_buffer(bh[i]); 792 unlock_buffer(bh[i]);
802 continue; 793 continue;
803 } 794 }
804 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 795 ext4_lock_group(sb, first_group + i);
805 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 796 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
806 ext4_init_block_bitmap(sb, bh[i], 797 ext4_init_block_bitmap(sb, bh[i],
807 first_group + i, desc); 798 first_group + i, desc);
808 set_bitmap_uptodate(bh[i]); 799 set_bitmap_uptodate(bh[i]);
809 set_buffer_uptodate(bh[i]); 800 set_buffer_uptodate(bh[i]);
810 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 801 ext4_unlock_group(sb, first_group + i);
811 unlock_buffer(bh[i]); 802 unlock_buffer(bh[i]);
812 continue; 803 continue;
813 } 804 }
814 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 ext4_unlock_group(sb, first_group + i);
815 if (buffer_uptodate(bh[i])) { 806 if (buffer_uptodate(bh[i])) {
816 /* 807 /*
817 * if the group is not uninit and bh is uptodate, 808 * if the group is not uninit and bh is uptodate,
@@ -852,7 +843,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
852 struct ext4_group_info *grinfo; 843 struct ext4_group_info *grinfo;
853 844
854 group = (first_block + i) >> 1; 845 group = (first_block + i) >> 1;
855 if (group >= EXT4_SB(sb)->s_groups_count) 846 if (group >= ngroups)
856 break; 847 break;
857 848
858 /* 849 /*
@@ -1078,7 +1069,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1078 return 0; 1069 return 0;
1079} 1070}
1080 1071
1081static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) 1072static void mb_clear_bits(void *bm, int cur, int len)
1082{ 1073{
1083 __u32 *addr; 1074 __u32 *addr;
1084 1075
@@ -1091,15 +1082,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1091 cur += 32; 1082 cur += 32;
1092 continue; 1083 continue;
1093 } 1084 }
1094 if (lock) 1085 mb_clear_bit(cur, bm);
1095 mb_clear_bit_atomic(lock, cur, bm);
1096 else
1097 mb_clear_bit(cur, bm);
1098 cur++; 1086 cur++;
1099 } 1087 }
1100} 1088}
1101 1089
1102static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) 1090static void mb_set_bits(void *bm, int cur, int len)
1103{ 1091{
1104 __u32 *addr; 1092 __u32 *addr;
1105 1093
@@ -1112,10 +1100,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1112 cur += 32; 1100 cur += 32;
1113 continue; 1101 continue;
1114 } 1102 }
1115 if (lock) 1103 mb_set_bit(cur, bm);
1116 mb_set_bit_atomic(lock, cur, bm);
1117 else
1118 mb_set_bit(cur, bm);
1119 cur++; 1104 cur++;
1120 } 1105 }
1121} 1106}
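Pieced together from the fragments above, the de-atomicized helper reduces to a plain word-at-a-time loop; a sketch of the resulting shape (the aligned-word fast path sits in context lines the diff elides):

static void mb_set_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: store one whole aligned 32-bit word */
			addr = bm + (cur >> 3);
			*addr = 0xffffffff;
			cur += 32;
			continue;
		}
		/* slow path: unaligned stragglers, one bit at a time */
		mb_set_bit(cur, bm);
		cur++;
	}
}

mb_clear_bits() is the mirror image, storing 0 in the fast path and calling mb_clear_bit() in the slow one. Dropping the lock argument is safe because every remaining caller runs under ext4_lock_group().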
@@ -1131,7 +1116,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1131 struct super_block *sb = e4b->bd_sb; 1116 struct super_block *sb = e4b->bd_sb;
1132 1117
1133 BUG_ON(first + count > (sb->s_blocksize << 3)); 1118 BUG_ON(first + count > (sb->s_blocksize << 3));
1134 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 1119 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1135 mb_check_buddy(e4b); 1120 mb_check_buddy(e4b);
1136 mb_free_blocks_double(inode, e4b, first, count); 1121 mb_free_blocks_double(inode, e4b, first, count);
1137 1122
@@ -1212,7 +1197,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1212 int ord; 1197 int ord;
1213 void *buddy; 1198 void *buddy;
1214 1199
1215 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1200 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1216 BUG_ON(ex == NULL); 1201 BUG_ON(ex == NULL);
1217 1202
1218 buddy = mb_find_buddy(e4b, order, &max); 1203 buddy = mb_find_buddy(e4b, order, &max);
@@ -1276,7 +1261,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1276 1261
1277 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1262 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1278 BUG_ON(e4b->bd_group != ex->fe_group); 1263 BUG_ON(e4b->bd_group != ex->fe_group);
1279 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1264 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1280 mb_check_buddy(e4b); 1265 mb_check_buddy(e4b);
1281 mb_mark_used_double(e4b, start, len); 1266 mb_mark_used_double(e4b, start, len);
1282 1267
@@ -1330,8 +1315,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1330 e4b->bd_info->bb_counters[ord]++; 1315 e4b->bd_info->bb_counters[ord]++;
1331 } 1316 }
1332 1317
1333 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), 1318 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1334 EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1335 mb_check_buddy(e4b); 1319 mb_check_buddy(e4b);
1336 1320
1337 return ret; 1321 return ret;
@@ -1497,7 +1481,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1497 ext4_mb_check_limits(ac, e4b, 0); 1481 ext4_mb_check_limits(ac, e4b, 0);
1498} 1482}
1499 1483
1500static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1484static noinline_for_stack
1485int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1501 struct ext4_buddy *e4b) 1486 struct ext4_buddy *e4b)
1502{ 1487{
1503 struct ext4_free_extent ex = ac->ac_b_ex; 1488 struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1524,7 +1509,8 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1524 return 0; 1509 return 0;
1525} 1510}
1526 1511
1527static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1512static noinline_for_stack
1513int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1528 struct ext4_buddy *e4b) 1514 struct ext4_buddy *e4b)
1529{ 1515{
1530 ext4_group_t group = ac->ac_g_ex.fe_group; 1516 ext4_group_t group = ac->ac_g_ex.fe_group;
@@ -1583,7 +1569,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1583 * The routine scans buddy structures (not bitmap!) from given order 1569 * The routine scans buddy structures (not bitmap!) from given order
1584 * to max order and tries to find big enough chunk to satisfy the req 1570 * to max order and tries to find big enough chunk to satisfy the req
1585 */ 1571 */
1586static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1572static noinline_for_stack
1573void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1587 struct ext4_buddy *e4b) 1574 struct ext4_buddy *e4b)
1588{ 1575{
1589 struct super_block *sb = ac->ac_sb; 1576 struct super_block *sb = ac->ac_sb;
@@ -1626,7 +1613,8 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1626 * In order to optimize scanning, caller must pass number of 1613 * In order to optimize scanning, caller must pass number of
1627 * free blocks in the group, so the routine can know upper limit. 1614 * free blocks in the group, so the routine can know upper limit.
1628 */ 1615 */
1629static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1616static noinline_for_stack
1617void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1630 struct ext4_buddy *e4b) 1618 struct ext4_buddy *e4b)
1631{ 1619{
1632 struct super_block *sb = ac->ac_sb; 1620 struct super_block *sb = ac->ac_sb;
@@ -1685,7 +1673,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1685 * we try to find stripe-aligned chunks for stripe-size requests 1673 * we try to find stripe-aligned chunks for stripe-size requests
1686 * XXX should do so at least for multiples of stripe size as well 1674 * XXX should do so at least for multiples of stripe size as well
1687 */ 1675 */
1688static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1676static noinline_for_stack
1677void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1689 struct ext4_buddy *e4b) 1678 struct ext4_buddy *e4b)
1690{ 1679{
1691 struct super_block *sb = ac->ac_sb; 1680 struct super_block *sb = ac->ac_sb;
@@ -1726,7 +1715,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726 unsigned free, fragments; 1715 unsigned free, fragments;
1727 unsigned i, bits; 1716 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1717 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1718 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1719
1732 BUG_ON(cr < 0 || cr >= 4); 1720 BUG_ON(cr < 0 || cr >= 4);
@@ -1742,10 +1730,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1742 switch (cr) { 1730 switch (cr) {
1743 case 0: 1731 case 0:
1744 BUG_ON(ac->ac_2order == 0); 1732 BUG_ON(ac->ac_2order == 0);
1745 /* If this group is uninitialized, skip it initially */
1746 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0;
1749 1733
1750 /* Avoid using the first bg of a flexgroup for data files */ 1734 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1735 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -1788,6 +1772,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1788 int block, pnum; 1772 int block, pnum;
1789 int blocks_per_page; 1773 int blocks_per_page;
1790 int groups_per_page; 1774 int groups_per_page;
1775 ext4_group_t ngroups = ext4_get_groups_count(sb);
1791 ext4_group_t first_group; 1776 ext4_group_t first_group;
1792 struct ext4_group_info *grp; 1777 struct ext4_group_info *grp;
1793 1778
@@ -1807,7 +1792,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1807 /* read all groups the page covers into the cache */ 1792 /* read all groups the page covers into the cache */
1808 for (i = 0; i < groups_per_page; i++) { 1793 for (i = 0; i < groups_per_page; i++) {
1809 1794
1810 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) 1795 if ((first_group + i) >= ngroups)
1811 break; 1796 break;
1812 grp = ext4_get_group_info(sb, first_group + i); 1797 grp = ext4_get_group_info(sb, first_group + i);
1813 /* take all groups write allocation 1798 /* take all groups write allocation
@@ -1852,7 +1837,8 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1852 1837
1853} 1838}
1854 1839
1855static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1856{ 1842{
1857 1843
1858 int ret; 1844 int ret;
@@ -1945,8 +1931,7 @@ err:
1945static noinline_for_stack int 1931static noinline_for_stack int
1946ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1947{ 1933{
1948 ext4_group_t group; 1934 ext4_group_t ngroups, group, i;
1949 ext4_group_t i;
1950 int cr; 1935 int cr;
1951 int err = 0; 1936 int err = 0;
1952 int bsbits; 1937 int bsbits;
@@ -1957,6 +1942,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1957 1942
1958 sb = ac->ac_sb; 1943 sb = ac->ac_sb;
1959 sbi = EXT4_SB(sb); 1944 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb);
1960 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1961 1947
1962 /* first, try the goal */ 1948 /* first, try the goal */
@@ -2017,11 +2003,11 @@ repeat:
2017 */ 2003 */
2018 group = ac->ac_g_ex.fe_group; 2004 group = ac->ac_g_ex.fe_group;
2019 2005
2020 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 2006 for (i = 0; i < ngroups; group++, i++) {
2021 struct ext4_group_info *grp; 2007 struct ext4_group_info *grp;
2022 struct ext4_group_desc *desc; 2008 struct ext4_group_desc *desc;
2023 2009
2024 if (group == EXT4_SB(sb)->s_groups_count) 2010 if (group == ngroups)
2025 group = 0; 2011 group = 0;
2026 2012
2027 /* quick check to skip empty groups */ 2013 /* quick check to skip empty groups */
@@ -2064,9 +2050,7 @@ repeat:
2064 2050
2065 ac->ac_groups_scanned++; 2051 ac->ac_groups_scanned++;
2066 desc = ext4_get_group_desc(sb, group, NULL); 2052 desc = ext4_get_group_desc(sb, group, NULL);
2067 if (cr == 0 || (desc->bg_flags & 2053 if (cr == 0)
2068 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
2069 ac->ac_2order != 0))
2070 ext4_mb_simple_scan_group(ac, &e4b); 2054 ext4_mb_simple_scan_group(ac, &e4b);
2071 else if (cr == 1 && 2055 else if (cr == 1 &&
2072 ac->ac_g_ex.fe_len == sbi->s_stripe) 2056 ac->ac_g_ex.fe_len == sbi->s_stripe)
@@ -2315,12 +2299,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
2315static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2299static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2316{ 2300{
2317 struct super_block *sb = seq->private; 2301 struct super_block *sb = seq->private;
2318 struct ext4_sb_info *sbi = EXT4_SB(sb);
2319 ext4_group_t group; 2302 ext4_group_t group;
2320 2303
2321 if (*pos < 0 || *pos >= sbi->s_groups_count) 2304 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2322 return NULL; 2305 return NULL;
2323
2324 group = *pos + 1; 2306 group = *pos + 1;
2325 return (void *) ((unsigned long) group); 2307 return (void *) ((unsigned long) group);
2326} 2308}
@@ -2328,11 +2310,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2328static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2310static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2329{ 2311{
2330 struct super_block *sb = seq->private; 2312 struct super_block *sb = seq->private;
2331 struct ext4_sb_info *sbi = EXT4_SB(sb);
2332 ext4_group_t group; 2313 ext4_group_t group;
2333 2314
2334 ++*pos; 2315 ++*pos;
2335 if (*pos < 0 || *pos >= sbi->s_groups_count) 2316 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2336 return NULL; 2317 return NULL;
2337 group = *pos + 1; 2318 group = *pos + 1;
2338 return (void *) ((unsigned long) group); 2319 return (void *) ((unsigned long) group);
@@ -2420,7 +2401,8 @@ static void ext4_mb_history_release(struct super_block *sb)
2420 2401
2421 if (sbi->s_proc != NULL) { 2402 if (sbi->s_proc != NULL) {
2422 remove_proc_entry("mb_groups", sbi->s_proc); 2403 remove_proc_entry("mb_groups", sbi->s_proc);
2423 remove_proc_entry("mb_history", sbi->s_proc); 2404 if (sbi->s_mb_history_max)
2405 remove_proc_entry("mb_history", sbi->s_proc);
2424 } 2406 }
2425 kfree(sbi->s_mb_history); 2407 kfree(sbi->s_mb_history);
2426} 2408}
@@ -2431,17 +2413,17 @@ static void ext4_mb_history_init(struct super_block *sb)
2431 int i; 2413 int i;
2432 2414
2433 if (sbi->s_proc != NULL) { 2415 if (sbi->s_proc != NULL) {
2434 proc_create_data("mb_history", S_IRUGO, sbi->s_proc, 2416 if (sbi->s_mb_history_max)
2435 &ext4_mb_seq_history_fops, sb); 2417 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2418 &ext4_mb_seq_history_fops, sb);
2436 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2419 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2437 &ext4_mb_seq_groups_fops, sb); 2420 &ext4_mb_seq_groups_fops, sb);
2438 } 2421 }
2439 2422
2440 sbi->s_mb_history_max = 1000;
2441 sbi->s_mb_history_cur = 0; 2423 sbi->s_mb_history_cur = 0;
2442 spin_lock_init(&sbi->s_mb_history_lock); 2424 spin_lock_init(&sbi->s_mb_history_lock);
2443 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2425 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2444 sbi->s_mb_history = kzalloc(i, GFP_KERNEL); 2426 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2445 /* if we can't allocate history, then we simply won't use it */ 2427 /* if we can't allocate history, then we simply won't use it */
2446} 2428}
2447 2429
@@ -2451,7 +2433,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
2451 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2433 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2452 struct ext4_mb_history h; 2434 struct ext4_mb_history h;
2453 2435
2454 if (unlikely(sbi->s_mb_history == NULL)) 2436 if (sbi->s_mb_history == NULL)
2455 return; 2437 return;
2456 2438
2457 if (!(ac->ac_op & sbi->s_mb_history_filter)) 2439 if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2587,6 +2569,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2587 2569
2588static int ext4_mb_init_backend(struct super_block *sb) 2570static int ext4_mb_init_backend(struct super_block *sb)
2589{ 2571{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb);
2590 ext4_group_t i; 2573 ext4_group_t i;
2591 int metalen; 2574 int metalen;
2592 struct ext4_sb_info *sbi = EXT4_SB(sb); 2575 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2581,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2598 struct ext4_group_desc *desc; 2581 struct ext4_group_desc *desc;
2599 2582
2600 /* This is the number of blocks used by GDT */ 2583 /* This is the number of blocks used by GDT */
2601 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 2584 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
2602 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2585 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2603 2586
2604 /* 2587 /*
@@ -2644,7 +2627,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2644 for (i = 0; i < num_meta_group_infos; i++) { 2627 for (i = 0; i < num_meta_group_infos; i++) {
2645 if ((i + 1) == num_meta_group_infos) 2628 if ((i + 1) == num_meta_group_infos)
2646 metalen = sizeof(*meta_group_info) * 2629 metalen = sizeof(*meta_group_info) *
2647 (sbi->s_groups_count - 2630 (ngroups -
2648 (i << EXT4_DESC_PER_BLOCK_BITS(sb))); 2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2649 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2650 if (meta_group_info == NULL) { 2633 if (meta_group_info == NULL) {
@@ -2655,7 +2638,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2655 sbi->s_group_info[i] = meta_group_info; 2638 sbi->s_group_info[i] = meta_group_info;
2656 } 2639 }
2657 2640
2658 for (i = 0; i < sbi->s_groups_count; i++) { 2641 for (i = 0; i < ngroups; i++) {
2659 desc = ext4_get_group_desc(sb, i, NULL); 2642 desc = ext4_get_group_desc(sb, i, NULL);
2660 if (desc == NULL) { 2643 if (desc == NULL) {
2661 printk(KERN_ERR 2644 printk(KERN_ERR
@@ -2761,7 +2744,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2761 return 0; 2744 return 0;
2762} 2745}
2763 2746
2764/* need to be called with ext4 group lock (ext4_lock_group) */ 2747/* need to be called with the ext4 group lock held */
2765static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2748static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2766{ 2749{
2767 struct ext4_prealloc_space *pa; 2750 struct ext4_prealloc_space *pa;
@@ -2781,13 +2764,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2781 2764
2782int ext4_mb_release(struct super_block *sb) 2765int ext4_mb_release(struct super_block *sb)
2783{ 2766{
2767 ext4_group_t ngroups = ext4_get_groups_count(sb);
2784 ext4_group_t i; 2768 ext4_group_t i;
2785 int num_meta_group_infos; 2769 int num_meta_group_infos;
2786 struct ext4_group_info *grinfo; 2770 struct ext4_group_info *grinfo;
2787 struct ext4_sb_info *sbi = EXT4_SB(sb); 2771 struct ext4_sb_info *sbi = EXT4_SB(sb);
2788 2772
2789 if (sbi->s_group_info) { 2773 if (sbi->s_group_info) {
2790 for (i = 0; i < sbi->s_groups_count; i++) { 2774 for (i = 0; i < ngroups; i++) {
2791 grinfo = ext4_get_group_info(sb, i); 2775 grinfo = ext4_get_group_info(sb, i);
2792#ifdef DOUBLE_CHECK 2776#ifdef DOUBLE_CHECK
2793 kfree(grinfo->bb_bitmap); 2777 kfree(grinfo->bb_bitmap);
@@ -2797,7 +2781,7 @@ int ext4_mb_release(struct super_block *sb)
2797 ext4_unlock_group(sb, i); 2781 ext4_unlock_group(sb, i);
2798 kfree(grinfo); 2782 kfree(grinfo);
2799 } 2783 }
2800 num_meta_group_infos = (sbi->s_groups_count + 2784 num_meta_group_infos = (ngroups +
2801 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2785 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2802 EXT4_DESC_PER_BLOCK_BITS(sb); 2786 EXT4_DESC_PER_BLOCK_BITS(sb);
2803 for (i = 0; i < num_meta_group_infos; i++) 2787 for (i = 0; i < num_meta_group_infos; i++)
@@ -2882,9 +2866,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2882 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2866 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2883 + entry->start_blk 2867 + entry->start_blk
2884 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2868 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2885 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", 2869 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
2886 sb->s_id, (unsigned long long) discard_block, 2870 entry->count);
2887 entry->count);
2888 sb_issue_discard(sb, discard_block, entry->count); 2871 sb_issue_discard(sb, discard_block, entry->count);
2889 2872
2890 kmem_cache_free(ext4_free_ext_cachep, entry); 2873 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2926,7 +2909,11 @@ int __init init_ext4_mballoc(void)
2926 2909
2927void exit_ext4_mballoc(void) 2910void exit_ext4_mballoc(void)
2928{ 2911{
2929 /* XXX: synchronize_rcu(); */ 2912 /*
2913 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2914 * before destroying the slab cache.
2915 */
2916 rcu_barrier();
2930 kmem_cache_destroy(ext4_pspace_cachep); 2917 kmem_cache_destroy(ext4_pspace_cachep);
2931 kmem_cache_destroy(ext4_ac_cachep); 2918 kmem_cache_destroy(ext4_ac_cachep);
2932 kmem_cache_destroy(ext4_free_ext_cachep); 2919 kmem_cache_destroy(ext4_free_ext_cachep);
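The rcu_barrier() conversion above matters because synchronize_rcu() only waits for a grace period to elapse, while the caches being destroyed can still be referenced from call_rcu() callbacks that have not run yet. A minimal standalone sketch of the pattern (hypothetical names, not ext4 code):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct pspace {
	struct rcu_head rcu;
	/* ... payload ... */
};

static struct kmem_cache *pspace_cachep;

static void pspace_free_rcu(struct rcu_head *head)
{
	/* runs from RCU callback context, after the grace period */
	kmem_cache_free(pspace_cachep,
			container_of(head, struct pspace, rcu));
}

static void pspace_release(struct pspace *p)
{
	call_rcu(&p->rcu, pspace_free_rcu);
}

static void pspace_exit(void)
{
	/* wait for every queued pspace_free_rcu() to complete ... */
	rcu_barrier();
	/* ... so the cache is guaranteed empty when destroyed */
	kmem_cache_destroy(pspace_cachep);
}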
@@ -2984,27 +2971,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2984 + le32_to_cpu(es->s_first_data_block); 2971 + le32_to_cpu(es->s_first_data_block);
2985 2972
2986 len = ac->ac_b_ex.fe_len; 2973 len = ac->ac_b_ex.fe_len;
2987 if (in_range(ext4_block_bitmap(sb, gdp), block, len) || 2974 if (!ext4_data_block_valid(sbi, block, len)) {
2988 in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
2989 in_range(block, ext4_inode_table(sb, gdp),
2990 EXT4_SB(sb)->s_itb_per_group) ||
2991 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2992 EXT4_SB(sb)->s_itb_per_group)) {
2993 ext4_error(sb, __func__, 2975 ext4_error(sb, __func__,
2994 "Allocating block %llu in system zone of %d group\n", 2976 "Allocating blocks %llu-%llu which overlap "
2995 block, ac->ac_b_ex.fe_group); 2977 "fs metadata\n", block, block+len);
2996 /* File system mounted not to panic on error 2978 /* File system mounted not to panic on error
2997 * Fix the bitmap and repeat the block allocation 2979 * Fix the bitmap and repeat the block allocation
2998 * We leak some of the blocks here. 2980 * We leak some of the blocks here.
2999 */ 2981 */
3000 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 2982 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3001 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2983 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3002 ac->ac_b_ex.fe_len); 2984 ac->ac_b_ex.fe_len);
2985 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3003 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2986 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3004 if (!err) 2987 if (!err)
3005 err = -EAGAIN; 2988 err = -EAGAIN;
3006 goto out_err; 2989 goto out_err;
3007 } 2990 }
2991
2992 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3008#ifdef AGGRESSIVE_CHECK 2993#ifdef AGGRESSIVE_CHECK
3009 { 2994 {
3010 int i; 2995 int i;
@@ -3014,9 +2999,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3014 } 2999 }
3015 } 3000 }
3016#endif 3001#endif
3017 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3002 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
3018 mb_set_bits(NULL, bitmap_bh->b_data,
3019 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3020 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3003 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3021 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3004 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3022 ext4_free_blks_set(sb, gdp, 3005 ext4_free_blks_set(sb, gdp,
@@ -3026,7 +3009,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3026 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 3009 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3027 ext4_free_blks_set(sb, gdp, len); 3010 ext4_free_blks_set(sb, gdp, len);
3028 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3011 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3029 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3012
3013 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3030 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3014 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3031 /* 3015 /*
3032 * Now reduce the dirty block count also. Should not go negative 3016 * Now reduce the dirty block count also. Should not go negative
@@ -3459,7 +3443,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3459 * the function goes through all blocks freed in the group 3443 * the function goes through all blocks freed in the group
3460 * but not yet committed and marks them used in the in-core bitmap. 3444 * but not yet committed and marks them used in the in-core bitmap.
3461 * buddy must be generated from this bitmap 3445 * buddy must be generated from this bitmap
3462 * Need to be called with ext4 group lock (ext4_lock_group) 3446 * Need to be called with the ext4 group lock held
3463 */ 3447 */
3464static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3448static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3465 ext4_group_t group) 3449 ext4_group_t group)
@@ -3473,9 +3457,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3473 3457
3474 while (n) { 3458 while (n) {
3475 entry = rb_entry(n, struct ext4_free_data, node); 3459 entry = rb_entry(n, struct ext4_free_data, node);
3476 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3460 mb_set_bits(bitmap, entry->start_blk, entry->count);
3477 bitmap, entry->start_blk,
3478 entry->count);
3479 n = rb_next(n); 3461 n = rb_next(n);
3480 } 3462 }
3481 return; 3463 return;
@@ -3484,9 +3466,10 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3484/* 3466/*
3485 * the function goes through all preallocations in this group and marks them 3467 * the function goes through all preallocations in this group and marks them
3486 * used in the in-core bitmap. buddy must be generated from this bitmap 3468 * used in the in-core bitmap. buddy must be generated from this bitmap
3487 * Need to be called with ext4 group lock (ext4_lock_group) 3469 * Need to be called with the ext4 group lock held
3488 */ 3470 */
3489static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3471static noinline_for_stack
3472void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3490 ext4_group_t group) 3473 ext4_group_t group)
3491{ 3474{
3492 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3475 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -3516,8 +3499,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3516 if (unlikely(len == 0)) 3499 if (unlikely(len == 0))
3517 continue; 3500 continue;
3518 BUG_ON(groupnr != group); 3501 BUG_ON(groupnr != group);
3519 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3502 mb_set_bits(bitmap, start, len);
3520 bitmap, start, len);
3521 preallocated += len; 3503 preallocated += len;
3522 count++; 3504 count++;
3523 } 3505 }
@@ -3658,10 +3640,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3658 3640
3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3661 trace_mark(ext4_mb_new_inode_pa, 3643 trace_ext4_mb_new_inode_pa(ac, pa);
3662 "dev %s ino %lu pstart %llu len %u lstart %u",
3663 sb->s_id, ac->ac_inode->i_ino,
3664 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3665 3644
3666 ext4_mb_use_inode_pa(ac, pa); 3645 ext4_mb_use_inode_pa(ac, pa);
3667 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3646 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3720,9 +3699,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3720 pa->pa_type = MB_GROUP_PA; 3699 pa->pa_type = MB_GROUP_PA;
3721 3700
3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3724 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", 3703 trace_ext4_mb_new_group_pa(ac, pa);
3725 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3726 3704
3727 ext4_mb_use_group_pa(ac, pa); 3705 ext4_mb_use_group_pa(ac, pa);
3728 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3706 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3812,10 +3790,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3812 ext4_mb_store_history(ac); 3790 ext4_mb_store_history(ac);
3813 } 3791 }
3814 3792
3815 trace_mark(ext4_mb_release_inode_pa, 3793 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
3816 "dev %s ino %lu block %llu count %u", 3794 next - bit);
3817 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3818 next - bit);
3819 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3795 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3820 bit = next + 1; 3796 bit = next + 1;
3821 } 3797 }
@@ -3849,8 +3825,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3849 if (ac) 3825 if (ac)
3850 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3826 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3851 3827
3852 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", 3828 trace_ext4_mb_release_group_pa(ac, pa);
3853 sb->s_id, pa->pa_pstart, pa->pa_len);
3854 BUG_ON(pa->pa_deleted == 0); 3829 BUG_ON(pa->pa_deleted == 0);
3855 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3830 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3856 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3831 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3918,6 +3893,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3918 3893
3919 INIT_LIST_HEAD(&list); 3894 INIT_LIST_HEAD(&list);
3920 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3895 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3896 if (ac)
3897 ac->ac_sb = sb;
3921repeat: 3898repeat:
3922 ext4_lock_group(sb, group); 3899 ext4_lock_group(sb, group);
3923 list_for_each_entry_safe(pa, tmp, 3900 list_for_each_entry_safe(pa, tmp,
@@ -4016,12 +3993,15 @@ void ext4_discard_preallocations(struct inode *inode)
4016 } 3993 }
4017 3994
4018 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4019 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, 3996 trace_ext4_discard_preallocations(inode);
4020 inode->i_ino);
4021 3997
4022 INIT_LIST_HEAD(&list); 3998 INIT_LIST_HEAD(&list);
4023 3999
4024 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4000 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4001 if (ac) {
4002 ac->ac_sb = sb;
4003 ac->ac_inode = inode;
4004 }
4025repeat: 4005repeat:
4026 /* first, collect all pa's in the inode */ 4006 /* first, collect all pa's in the inode */
4027 spin_lock(&ei->i_prealloc_lock); 4007 spin_lock(&ei->i_prealloc_lock);
@@ -4121,7 +4101,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4121static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4122{ 4102{
4123 struct super_block *sb = ac->ac_sb; 4103 struct super_block *sb = ac->ac_sb;
4124 ext4_group_t i; 4104 ext4_group_t ngroups, i;
4125 4105
4126 printk(KERN_ERR "EXT4-fs: Can't allocate:" 4106 printk(KERN_ERR "EXT4-fs: Can't allocate:"
4127 " Allocation context details:\n"); 4107 " Allocation context details:\n");
@@ -4145,7 +4125,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4145 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 4125 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
4146 ac->ac_found); 4126 ac->ac_found);
4147 printk(KERN_ERR "EXT4-fs: groups: \n"); 4127 printk(KERN_ERR "EXT4-fs: groups: \n");
4148 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 4128 ngroups = ext4_get_groups_count(sb);
4129 for (i = 0; i < ngroups; i++) {
4149 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 4130 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4150 struct ext4_prealloc_space *pa; 4131 struct ext4_prealloc_space *pa;
4151 ext4_grpblk_t start; 4132 ext4_grpblk_t start;
@@ -4246,14 +4227,9 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4227 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4247 4228
4248 /* set up allocation goals */ 4229 /* set up allocation goals */
4230 memset(ac, 0, sizeof(struct ext4_allocation_context));
4249 ac->ac_b_ex.fe_logical = ar->logical; 4231 ac->ac_b_ex.fe_logical = ar->logical;
4250 ac->ac_b_ex.fe_group = 0;
4251 ac->ac_b_ex.fe_start = 0;
4252 ac->ac_b_ex.fe_len = 0;
4253 ac->ac_status = AC_STATUS_CONTINUE; 4232 ac->ac_status = AC_STATUS_CONTINUE;
4254 ac->ac_groups_scanned = 0;
4255 ac->ac_ex_scanned = 0;
4256 ac->ac_found = 0;
4257 ac->ac_sb = sb; 4233 ac->ac_sb = sb;
4258 ac->ac_inode = ar->inode; 4234 ac->ac_inode = ar->inode;
4259 ac->ac_o_ex.fe_logical = ar->logical; 4235 ac->ac_o_ex.fe_logical = ar->logical;
@@ -4264,15 +4240,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4264 ac->ac_g_ex.fe_group = group; 4240 ac->ac_g_ex.fe_group = group;
4265 ac->ac_g_ex.fe_start = block; 4241 ac->ac_g_ex.fe_start = block;
4266 ac->ac_g_ex.fe_len = len; 4242 ac->ac_g_ex.fe_len = len;
4267 ac->ac_f_ex.fe_len = 0;
4268 ac->ac_flags = ar->flags; 4243 ac->ac_flags = ar->flags;
4269 ac->ac_2order = 0;
4270 ac->ac_criteria = 0;
4271 ac->ac_pa = NULL;
4272 ac->ac_bitmap_page = NULL;
4273 ac->ac_buddy_page = NULL;
4274 ac->alloc_semp = NULL;
4275 ac->ac_lg = NULL;
4276 4244
4277 /* we have to define context: will we work with a file or 4245 /* we have to define context: will we work with a file or
4278 * locality group. this is a policy, actually */ 4246 * locality group. this is a policy, actually */
@@ -4304,6 +4272,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4304 4272
4305 INIT_LIST_HEAD(&discard_list); 4273 INIT_LIST_HEAD(&discard_list);
4306 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4275 if (ac)
4276 ac->ac_sb = sb;
4307 4277
4308 spin_lock(&lg->lg_prealloc_lock); 4278 spin_lock(&lg->lg_prealloc_lock);
4309 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4279 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4469,13 +4439,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4469 4439
4470static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4440static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4471{ 4441{
4472 ext4_group_t i; 4442 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4473 int ret; 4443 int ret;
4474 int freed = 0; 4444 int freed = 0;
4475 4445
4476 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", 4446 trace_ext4_mb_discard_preallocations(sb, needed);
4477 sb->s_id, needed); 4447 for (i = 0; i < ngroups && needed > 0; i++) {
4478 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4479 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4448 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4480 freed += ret; 4449 freed += ret;
4481 needed -= ret; 4450 needed -= ret;
@@ -4503,17 +4472,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4503 sb = ar->inode->i_sb; 4472 sb = ar->inode->i_sb;
4504 sbi = EXT4_SB(sb); 4473 sbi = EXT4_SB(sb);
4505 4474
4506 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " 4475 trace_ext4_request_blocks(ar);
4507 "lblk %llu goal %llu lleft %llu lright %llu "
4508 "pleft %llu pright %llu ",
4509 sb->s_id, ar->flags, ar->len,
4510 ar->inode ? ar->inode->i_ino : 0,
4511 (unsigned long long) ar->logical,
4512 (unsigned long long) ar->goal,
4513 (unsigned long long) ar->lleft,
4514 (unsigned long long) ar->lright,
4515 (unsigned long long) ar->pleft,
4516 (unsigned long long) ar->pright);
4517 4476
4518 /* 4477 /*
4519 * For delayed allocation, we could skip the ENOSPC and 4478 * For delayed allocation, we could skip the ENOSPC and
@@ -4622,18 +4581,7 @@ out3:
4622 reserv_blks); 4581 reserv_blks);
4623 } 4582 }
4624 4583
4625 trace_mark(ext4_allocate_blocks, 4584 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4626 "dev %s block %llu flags %u len %u ino %lu "
4627 "logical %llu goal %llu lleft %llu lright %llu "
4628 "pleft %llu pright %llu ",
4629 sb->s_id, (unsigned long long) block,
4630 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4631 (unsigned long long) ar->logical,
4632 (unsigned long long) ar->goal,
4633 (unsigned long long) ar->lleft,
4634 (unsigned long long) ar->lright,
4635 (unsigned long long) ar->pleft,
4636 (unsigned long long) ar->pright);
4637 4585
4638 return block; 4586 return block;
4639} 4587}
@@ -4737,7 +4685,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4737 * Main entry point into mballoc to free blocks 4685 * Main entry point into mballoc to free blocks
4738 */ 4686 */
4739void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4687void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4740 unsigned long block, unsigned long count, 4688 ext4_fsblk_t block, unsigned long count,
4741 int metadata, unsigned long *freed) 4689 int metadata, unsigned long *freed)
4742{ 4690{
4743 struct buffer_head *bitmap_bh = NULL; 4691 struct buffer_head *bitmap_bh = NULL;
@@ -4763,15 +4711,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4763 block + count > ext4_blocks_count(es)) { 4711 block + count > ext4_blocks_count(es)) {
4764 ext4_error(sb, __func__, 4712 ext4_error(sb, __func__,
4765 "Freeing blocks not in datazone - " 4713 "Freeing blocks not in datazone - "
4766 "block = %lu, count = %lu", block, count); 4714 "block = %llu, count = %lu", block, count);
4767 goto error_return; 4715 goto error_return;
4768 } 4716 }
4769 4717
4770 ext4_debug("freeing block %lu\n", block); 4718 ext4_debug("freeing block %llu\n", block);
4771 trace_mark(ext4_free_blocks, 4719 trace_ext4_free_blocks(inode, block, count, metadata);
4772 "dev %s block %llu count %lu metadata %d ino %lu",
4773 sb->s_id, (unsigned long long) block, count, metadata,
4774 inode ? inode->i_ino : 0);
4775 4720
4776 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4721 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4777 if (ac) { 4722 if (ac) {
@@ -4812,7 +4757,7 @@ do_more:
4812 4757
4813 ext4_error(sb, __func__, 4758 ext4_error(sb, __func__,
4814 "Freeing blocks in system zone - " 4759 "Freeing blocks in system zone - "
4815 "Block = %lu, count = %lu", block, count); 4760 "Block = %llu, count = %lu", block, count);
4816 /* err = 0. ext4_std_error should be a no op */ 4761 /* err = 0. ext4_std_error should be a no op */
4817 goto error_return; 4762 goto error_return;
4818 } 4763 }
@@ -4859,29 +4804,25 @@ do_more:
4859 new_entry->group = block_group; 4804 new_entry->group = block_group;
4860 new_entry->count = count; 4805 new_entry->count = count;
4861 new_entry->t_tid = handle->h_transaction->t_tid; 4806 new_entry->t_tid = handle->h_transaction->t_tid;
4807
4862 ext4_lock_group(sb, block_group); 4808 ext4_lock_group(sb, block_group);
4863 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4809 mb_clear_bits(bitmap_bh->b_data, bit, count);
4864 bit, count);
4865 ext4_mb_free_metadata(handle, &e4b, new_entry); 4810 ext4_mb_free_metadata(handle, &e4b, new_entry);
4866 ext4_unlock_group(sb, block_group);
4867 } else { 4811 } else {
4868 ext4_lock_group(sb, block_group);
4869 /* need to update group_info->bb_free and bitmap 4812 /* need to update group_info->bb_free and bitmap
4870 * with group lock held. generate_buddy looks at 4813 * with group lock held. generate_buddy looks at
4871 * them with the group lock held 4814 * them with the group lock held
4872 */ 4815 */
4873 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4816 ext4_lock_group(sb, block_group);
4874 bit, count); 4817 mb_clear_bits(bitmap_bh->b_data, bit, count);
4875 mb_free_blocks(inode, &e4b, bit, count); 4818 mb_free_blocks(inode, &e4b, bit, count);
4876 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4819 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4877 ext4_unlock_group(sb, block_group);
4878 } 4820 }
4879 4821
4880 spin_lock(sb_bgl_lock(sbi, block_group));
4881 ret = ext4_free_blks_count(sb, gdp) + count; 4822 ret = ext4_free_blks_count(sb, gdp) + count;
4882 ext4_free_blks_set(sb, gdp, ret); 4823 ext4_free_blks_set(sb, gdp, ret);
4883 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4824 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4884 spin_unlock(sb_bgl_lock(sbi, block_group)); 4825 ext4_unlock_group(sb, block_group);
4885 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4826 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4886 4827
4887 if (sbi->s_log_groups_per_flex) { 4828 if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd5f6cf..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,11 +19,9 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h>
23#include <linux/mutex.h> 22#include <linux/mutex.h>
24#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
25#include "ext4.h" 24#include "ext4.h"
26#include "group.h"
27 25
28/* 26/*
29 * with AGGRESSIVE_CHECK allocator runs consistency checks over 27 * with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fe64d9f79852..313a50b39741 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode)
458 struct inode *tmp_inode = NULL; 458 struct inode *tmp_inode = NULL;
459 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
460 unsigned long max_entries; 460 unsigned long max_entries;
461 __u32 goal;
461 462
462 /* 463 /*
463 * If the filesystem does not support extents, or the inode 464 * If the filesystem does not support extents, or the inode
@@ -483,9 +484,10 @@ int ext4_ext_migrate(struct inode *inode)
483 retval = PTR_ERR(handle); 484 retval = PTR_ERR(handle);
484 return retval; 485 return retval;
485 } 486 }
486 tmp_inode = ext4_new_inode(handle, 487 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
487 inode->i_sb->s_root->d_inode, 488 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
488 S_IFREG); 489 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
490 S_IFREG, 0, goal);
489 if (IS_ERR(tmp_inode)) { 491 if (IS_ERR(tmp_inode)) {
490 retval = -ENOMEM; 492 retval = -ENOMEM;
491 ext4_journal_stop(handle); 493 ext4_journal_stop(handle);
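A concrete instance of the goal arithmetic above (values assumed for illustration): with EXT4_INODES_PER_GROUP(inode->i_sb) == 8192 and inode->i_ino == 20000, goal = ((20000 - 1) / 8192) * 8192 + 1 = 2 * 8192 + 1 = 16385, the first inode number of block group 2. Passing that goal to ext4_new_inode() steers the temporary inode into the same block group as the inode being migrated.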
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 000000000000..bbf2dd9404dc
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1320 @@
1/*
2 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
3 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
4 * Akira Fujita <a-fujita@rs.jp.nec.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 */
15
16#include <linux/fs.h>
17#include <linux/quotaops.h>
18#include "ext4_jbd2.h"
19#include "ext4_extents.h"
20#include "ext4.h"
21
22#define get_ext_path(path, inode, block, ret) \
23 do { \
24 path = ext4_ext_find_extent(inode, block, path); \
25 if (IS_ERR(path)) { \
26 ret = PTR_ERR(path); \
27 path = NULL; \
28 } \
29 } while (0)
30
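A hedged usage sketch of the macro above (example_lookup is a hypothetical caller): @path doubles as input and output, so callers start it at NULL, and @ret is only meaningful when the lookup fails:

static int example_lookup(struct inode *inode, ext4_lblk_t block)
{
	struct ext4_ext_path *path = NULL;
	int ret = 0;

	get_ext_path(path, inode, block, ret);
	if (path == NULL)
		return ret;	/* ext4_ext_find_extent() returned ERR_PTR */

	/* ... use path[path->p_depth].p_ext here ... */

	ext4_ext_drop_refs(path);
	kfree(path);
	return 0;
}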
31/**
32 * copy_extent_status - Copy the extent's initialization status
33 *
 34 * @src: an extent whose initialization status is read
 35 * @dest: an extent whose status is set
36 */
37static void
38copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
39{
40 if (ext4_ext_is_uninitialized(src))
41 ext4_ext_mark_uninitialized(dest);
42 else
43 dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
44}
45
46/**
47 * mext_next_extent - Search for the next extent and set it to "extent"
48 *
49 * @inode: inode which is searched
50 * @path: this will obtain data for the next extent
 51 * @extent: pointer to the next extent just found
52 *
 53 * Search for the next extent in the array of ext4_ext_path structures (@path)
 54 * and store it in the ext4_extent structure (@extent). In addition, the member
 55 * of @path (->p_ext) also points to the next extent. Return 0 on success, 1 if
 56 * the ext4_ext_path structure refers to the last extent, or a negative error
57 * value on failure.
58 */
59static int
60mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
61 struct ext4_extent **extent)
62{
63 int ppos, leaf_ppos = path->p_depth;
64
65 ppos = leaf_ppos;
66 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
67 /* leaf block */
68 *extent = ++path[ppos].p_ext;
69 return 0;
70 }
71
72 while (--ppos >= 0) {
73 if (EXT_LAST_INDEX(path[ppos].p_hdr) >
74 path[ppos].p_idx) {
75 int cur_ppos = ppos;
76
77 /* index block */
78 path[ppos].p_idx++;
79 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
80 if (path[ppos+1].p_bh)
81 brelse(path[ppos+1].p_bh);
82 path[ppos+1].p_bh =
83 sb_bread(inode->i_sb, path[ppos].p_block);
84 if (!path[ppos+1].p_bh)
85 return -EIO;
86 path[ppos+1].p_hdr =
87 ext_block_hdr(path[ppos+1].p_bh);
88
89 /* Halfway index block */
90 while (++cur_ppos < leaf_ppos) {
91 path[cur_ppos].p_idx =
92 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
93 path[cur_ppos].p_block =
94 idx_pblock(path[cur_ppos].p_idx);
95 if (path[cur_ppos+1].p_bh)
96 brelse(path[cur_ppos+1].p_bh);
97 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
98 path[cur_ppos].p_block);
99 if (!path[cur_ppos+1].p_bh)
100 return -EIO;
101 path[cur_ppos+1].p_hdr =
102 ext_block_hdr(path[cur_ppos+1].p_bh);
103 }
104
105 /* leaf block */
106 path[leaf_ppos].p_ext = *extent =
107 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
108 return 0;
109 }
110 }
111 /* We found the last extent */
112 return 1;
113}
114
115/**
116 * mext_double_down_read - Acquire two inodes' read semaphore
117 *
118 * @orig_inode: original inode structure
119 * @donor_inode: donor inode structure
120 * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
121 */
122static void
123mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
124{
125 struct inode *first = orig_inode, *second = donor_inode;
126
127 BUG_ON(orig_inode == NULL || donor_inode == NULL);
128
129 /*
130 * Use the inode number to provide the stable locking order instead
131 * of its address, because the C language doesn't guarantee you can
132 * compare pointers that don't come from the same array.
133 */
134 if (donor_inode->i_ino < orig_inode->i_ino) {
135 first = donor_inode;
136 second = orig_inode;
137 }
138
139 down_read(&EXT4_I(first)->i_data_sem);
140 down_read(&EXT4_I(second)->i_data_sem);
141}
142
143/**
144 * mext_double_down_write - Acquire two inodes' write semaphore
145 *
146 * @orig_inode: original inode structure
147 * @donor_inode: donor inode structure
148 * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
149 */
150static void
151mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
152{
153 struct inode *first = orig_inode, *second = donor_inode;
154
155 BUG_ON(orig_inode == NULL || donor_inode == NULL);
156
157 /*
158 * Use the inode number to provide the stable locking order instead
159 * of its address, because the C language doesn't guarantee you can
160 * compare pointers that don't come from the same array.
161 */
162 if (donor_inode->i_ino < orig_inode->i_ino) {
163 first = donor_inode;
164 second = orig_inode;
165 }
166
167 down_write(&EXT4_I(first)->i_data_sem);
168 down_write(&EXT4_I(second)->i_data_sem);
169}
170
171/**
172 * mext_double_up_read - Release two inodes' read semaphore
173 *
174 * @orig_inode: original inode structure whose lock is released first
175 * @donor_inode: donor inode structure whose lock is released second
176 * Release read semaphore of two inodes (orig and donor).
177 */
178static void
179mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
180{
181 BUG_ON(orig_inode == NULL || donor_inode == NULL);
182
183 up_read(&EXT4_I(orig_inode)->i_data_sem);
184 up_read(&EXT4_I(donor_inode)->i_data_sem);
185}
186
187/**
188 * mext_double_up_write - Release two inodes' write semaphore
189 *
190 * @orig_inode: original inode structure whose lock is released first
191 * @donor_inode: donor inode structure whose lock is released second
192 * Release write semaphore of two inodes (orig and donor).
193 */
194static void
195mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
196{
197 BUG_ON(orig_inode == NULL || donor_inode == NULL);
198
199 up_write(&EXT4_I(orig_inode)->i_data_sem);
200 up_write(&EXT4_I(donor_inode)->i_data_sem);
201}
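
The four mext_double_* helpers above all rely on the same deadlock-avoidance
rule: when two locks must be taken, take them in a stable order derived from
a key (here i_ino) rather than from pointer values. A minimal userspace
sketch of the discipline, with pthread mutexes standing in for i_data_sem
(hypothetical types and values):

    #include <pthread.h>

    struct obj {
        unsigned long id;         /* stands in for inode->i_ino */
        pthread_mutex_t lock;     /* stands in for i_data_sem */
    };

    static void double_lock(struct obj *a, struct obj *b)
    {
        struct obj *first = a, *second = b;

        /* Order by the stable key, never by pointer comparison. */
        if (b->id < a->id) {
            first = b;
            second = a;
        }
        pthread_mutex_lock(&first->lock);
        pthread_mutex_lock(&second->lock);
    }

    static void double_unlock(struct obj *a, struct obj *b)
    {
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct obj a = { 2, PTHREAD_MUTEX_INITIALIZER };
        struct obj b = { 7, PTHREAD_MUTEX_INITIALIZER };

        /* Locks the pair in the same order as double_lock(&b, &a) would,
         * so two tasks locking (a, b) and (b, a) cannot deadlock. */
        double_lock(&a, &b);
        double_unlock(&a, &b);
        return 0;
    }
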
202
203/**
204 * mext_insert_across_blocks - Insert extents across leaf block
205 *
206 * @handle: journal handle
207 * @orig_inode: original inode
208 * @o_start: first original extent to be changed
209 * @o_end: last original extent to be changed
210 * @start_ext: first new extent to be inserted
211 * @new_ext: middle of new extent to be inserted
212 * @end_ext: last new extent to be inserted
213 *
214 * Allocate a new leaf block and insert extents into it. Return 0 on success,
215 * or a negative error value on failure.
216 */
217static int
218mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
219 struct ext4_extent *o_start, struct ext4_extent *o_end,
220 struct ext4_extent *start_ext, struct ext4_extent *new_ext,
221 struct ext4_extent *end_ext)
222{
223 struct ext4_ext_path *orig_path = NULL;
224 ext4_lblk_t eblock = 0;
225 int new_flag = 0;
226 int end_flag = 0;
227 int err = 0;
228
229 if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
230 if (o_start == o_end) {
231
232 /* start_ext new_ext end_ext
233 * donor |---------|-----------|--------|
234 * orig |------------------------------|
235 */
236 end_flag = 1;
237 } else {
238
239 /* start_ext new_ext end_ext
240 * donor |---------|----------|---------|
241 * orig |---------------|--------------|
242 */
243 o_end->ee_block = end_ext->ee_block;
244 o_end->ee_len = end_ext->ee_len;
245 ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
246 }
247
248 o_start->ee_len = start_ext->ee_len;
249 new_flag = 1;
250
251 } else if (start_ext->ee_len && new_ext->ee_len &&
252 !end_ext->ee_len && o_start == o_end) {
253
254 /* start_ext new_ext
255 * donor |--------------|---------------|
256 * orig |------------------------------|
257 */
258 o_start->ee_len = start_ext->ee_len;
259 new_flag = 1;
260
261 } else if (!start_ext->ee_len && new_ext->ee_len &&
262 end_ext->ee_len && o_start == o_end) {
263
264 /* new_ext end_ext
265 * donor |--------------|---------------|
266 * orig |------------------------------|
267 */
268 o_end->ee_block = end_ext->ee_block;
269 o_end->ee_len = end_ext->ee_len;
270 ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
271
272 /*
273 * Leave eblock at 0 if new_ext starts
274 * at the first block of the file.
275 */
276 if (new_ext->ee_block)
277 eblock = le32_to_cpu(new_ext->ee_block);
278
279 new_flag = 1;
280 } else {
281 ext4_debug("ext4 move extent: Unexpected insert case\n");
282 return -EIO;
283 }
284
285 if (new_flag) {
286 get_ext_path(orig_path, orig_inode, eblock, err);
287 if (orig_path == NULL)
288 goto out;
289
290 if (ext4_ext_insert_extent(handle, orig_inode,
291 orig_path, new_ext))
292 goto out;
293 }
294
295 if (end_flag) {
296 get_ext_path(orig_path, orig_inode,
297 le32_to_cpu(end_ext->ee_block) - 1, err);
298 if (orig_path == NULL)
299 goto out;
300
301 if (ext4_ext_insert_extent(handle, orig_inode,
302 orig_path, end_ext))
303 goto out;
304 }
305out:
306 if (orig_path) {
307 ext4_ext_drop_refs(orig_path);
308 kfree(orig_path);
309 }
310
311 return err;
312
313}
314
315/**
316 * mext_insert_inside_block - Insert new extent to the extent block
317 *
318 * @o_start: first original extent to be moved
319 * @o_end: last original extent to be moved
320 * @start_ext: first new extent to be inserted
321 * @new_ext: middle of new extent to be inserted
322 * @end_ext: last new extent to be inserted
323 * @eh: extent header of target leaf block
324 * @range_to_move: used to decide how to insert extent
325 *
326 * Insert extents into the leaf block. The extent (@o_start) is overwritten
327 * by inserted extents.
328 */
329static void
330mext_insert_inside_block(struct ext4_extent *o_start,
331 struct ext4_extent *o_end,
332 struct ext4_extent *start_ext,
333 struct ext4_extent *new_ext,
334 struct ext4_extent *end_ext,
335 struct ext4_extent_header *eh,
336 int range_to_move)
337{
338 int i = 0;
339 unsigned long len;
340
341 /* Move the existing extents */
342 if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
343 len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
344 (unsigned long)(o_end + 1);
345 memmove(o_end + 1 + range_to_move, o_end + 1, len);
346 }
347
348 /* Insert start entry */
349 if (start_ext->ee_len)
350 o_start[i++].ee_len = start_ext->ee_len;
351
352 /* Insert new entry */
353 if (new_ext->ee_len) {
354 o_start[i] = *new_ext;
355 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
356 }
357
358 /* Insert end entry */
359 if (end_ext->ee_len)
360 o_start[i] = *end_ext;
361
362 /* Increment the total entries counter on the extent block */
363 le16_add_cpu(&eh->eh_entries, range_to_move);
364}
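
mext_insert_inside_block() opens a gap in the in-block extent array with a
single memmove() of the tail, then writes the new entries into the gap. The
same pattern in isolation, on a plain int array with hypothetical values:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        int a[8] = {10, 20, 30, 40};
        int used = 4, pos = 1, need = 2;   /* insert two entries at slot 1 */

        /* Shift a[pos..used-1] right by `need` slots (capacity assumed). */
        memmove(&a[pos + need], &a[pos], (used - pos) * sizeof(a[0]));
        a[pos] = 11;
        a[pos + 1] = 12;
        used += need;

        for (int i = 0; i < used; i++)
            printf("%d ", a[i]);           /* 10 11 12 20 30 40 */
        printf("\n");
        return 0;
    }
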
365
366/**
367 * mext_insert_extents - Insert new extent
368 *
369 * @handle: journal handle
370 * @orig_inode: original inode
371 * @orig_path: path indicates first extent to be changed
372 * @o_start: first original extent to be changed
373 * @o_end: last original extent to be changed
374 * @start_ext: first new extent to be inserted
375 * @new_ext: middle of new extent to be inserted
376 * @end_ext: last new extent to be inserted
377 *
378 * Call the function to insert extents. If we cannot add more extents into
379 * the leaf block, we call mext_insert_across_blocks() to create a
380 * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
381 * on success, or a negative error value on failure.
382 */
383static int
384mext_insert_extents(handle_t *handle, struct inode *orig_inode,
385 struct ext4_ext_path *orig_path,
386 struct ext4_extent *o_start,
387 struct ext4_extent *o_end,
388 struct ext4_extent *start_ext,
389 struct ext4_extent *new_ext,
390 struct ext4_extent *end_ext)
391{
392 struct ext4_extent_header *eh;
393 unsigned long need_slots, slots_range;
394 int range_to_move, depth, ret;
395
396 /*
397 * The extents to be inserted are:
398 * start_extent + new_extent + end_extent.
399 */
400 need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
401 (new_ext->ee_len ? 1 : 0);
402
403 /* The number of slots between start and end */
404 slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
405 / sizeof(struct ext4_extent);
406
407 /* Range to move the end of extent */
408 range_to_move = need_slots - slots_range;
409 depth = orig_path->p_depth;
410 orig_path += depth;
411 eh = orig_path->p_hdr;
412
413 if (depth) {
414 /* Register to journal */
415 ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
416 if (ret)
417 return ret;
418 }
419
420 /* Expansion */
421 if (range_to_move > 0 &&
422 (range_to_move > le16_to_cpu(eh->eh_max)
423 - le16_to_cpu(eh->eh_entries))) {
424
425 ret = mext_insert_across_blocks(handle, orig_inode, o_start,
426 o_end, start_ext, new_ext, end_ext);
427 if (ret < 0)
428 return ret;
429 } else
430 mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
431 end_ext, eh, range_to_move);
432
433 if (depth) {
434 ret = ext4_handle_dirty_metadata(handle, orig_inode,
435 orig_path->p_bh);
436 if (ret)
437 return ret;
438 } else {
439 ret = ext4_mark_inode_dirty(handle, orig_inode);
440 if (ret < 0)
441 return ret;
442 }
443
444 return 0;
445}
446
447/**
448 * mext_leaf_block - Move one leaf extent block into the inode.
449 *
450 * @handle: journal handle
451 * @orig_inode: original inode
452 * @orig_path: path indicates first extent to be changed
453 * @dext: donor extent
454 * @from: start offset on the target file
455 *
456 * In order to insert extents into the leaf block, we must divide the extent
457 * in the leaf block into three extents: one covers the range where the new
458 * extents are inserted, and the other two lie on either side of it.
459 *
460 * Therefore, this function creates structures to save extents of the leaf
461 * block, and inserts extents by calling mext_insert_extents() with
462 * created extents. Return 0 on success, or a negative error value on failure.
463 */
464static int
465mext_leaf_block(handle_t *handle, struct inode *orig_inode,
466 struct ext4_ext_path *orig_path, struct ext4_extent *dext,
467 ext4_lblk_t *from)
468{
469 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
470 struct ext4_extent new_ext, start_ext, end_ext;
471 ext4_lblk_t new_ext_end;
472 ext4_fsblk_t new_phys_end;
473 int oext_alen, new_ext_alen, end_ext_alen;
474 int depth = ext_depth(orig_inode);
475 int ret;
476
477 o_start = o_end = oext = orig_path[depth].p_ext;
478 oext_alen = ext4_ext_get_actual_len(oext);
479 start_ext.ee_len = end_ext.ee_len = 0;
480
481 new_ext.ee_block = cpu_to_le32(*from);
482 ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
483 new_ext.ee_len = dext->ee_len;
484 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
485 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
486 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
487
488 /*
489 * Case: original extent is first
490 * oext |--------|
491 * new_ext |--|
492 * start_ext |--|
493 */
494 if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
495 le32_to_cpu(new_ext.ee_block) <
496 le32_to_cpu(oext->ee_block) + oext_alen) {
497 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
498 le32_to_cpu(oext->ee_block));
499 copy_extent_status(oext, &start_ext);
500 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
501 prev_ext = oext - 1;
502 /*
503 * We can merge new_ext into previous extent,
504 * if these are contiguous and same extent type.
505 */
506 if (ext4_can_extents_be_merged(orig_inode, prev_ext,
507 &new_ext)) {
508 o_start = prev_ext;
509 start_ext.ee_len = cpu_to_le16(
510 ext4_ext_get_actual_len(prev_ext) +
511 new_ext_alen);
512 copy_extent_status(prev_ext, &start_ext);
513 new_ext.ee_len = 0;
514 }
515 }
516
517 /*
518 * Case: new_ext_end must be less than oext
519 * oext |-----------|
520 * new_ext |-------|
521 */
522 BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
523
524 /*
525 * Case: new_ext is smaller than original extent
526 * oext |---------------|
527 * new_ext |-----------|
528 * end_ext |---|
529 */
530 if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
531 new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
532 end_ext.ee_len =
533 cpu_to_le16(le32_to_cpu(oext->ee_block) +
534 oext_alen - 1 - new_ext_end);
535 copy_extent_status(oext, &end_ext);
536 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
537 ext4_ext_store_pblock(&end_ext,
538 (ext_pblock(o_end) + oext_alen - end_ext_alen));
539 end_ext.ee_block =
540 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
541 oext_alen - end_ext_alen);
542 }
543
544 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
545 o_end, &start_ext, &new_ext, &end_ext);
546 return ret;
547}
548
549/**
550 * mext_calc_swap_extents - Calculate extents for extent swapping.
551 *
552 * @tmp_dext: the extent that will belong to the original inode
553 * @tmp_oext: the extent that will belong to the donor inode
554 * @orig_off: block offset of original inode
555 * @donor_off: block offset of donor inode
556 * @max_count: the maximum length of the extents
557 */
558static void
559mext_calc_swap_extents(struct ext4_extent *tmp_dext,
560 struct ext4_extent *tmp_oext,
561 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
562 ext4_lblk_t max_count)
563{
564 ext4_lblk_t diff, orig_diff;
565 struct ext4_extent dext_old, oext_old;
566
567 dext_old = *tmp_dext;
568 oext_old = *tmp_oext;
569
570 /* When tmp_dext is too large, pick up the target range. */
571 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
572
573 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
574 tmp_dext->ee_block =
575 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
576 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
577
578 if (max_count < ext4_ext_get_actual_len(tmp_dext))
579 tmp_dext->ee_len = cpu_to_le16(max_count);
580
581 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
582 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
583
584 /* Adjust extent length if donor extent is larger than orig */
585 if (ext4_ext_get_actual_len(tmp_dext) >
586 ext4_ext_get_actual_len(tmp_oext) - orig_diff)
587 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
588 orig_diff);
589
590 tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
591
592 copy_extent_status(&oext_old, tmp_dext);
593 copy_extent_status(&dext_old, tmp_oext);
594}
595
596/**
597 * mext_replace_branches - Replace original extents with new extents
598 *
599 * @handle: journal handle
600 * @orig_inode: original inode
601 * @donor_inode: donor inode
602 * @from: block offset of orig_inode
603 * @count: block count to be replaced
604 *
605 * Replace original inode extents and donor inode extents page by page.
606 * We implement this replacement in the following three steps:
607 * 1. Save the block information of original and donor inodes into
608 * dummy extents.
609 * 2. Change the block information of original inode to point at the
610 * donor inode blocks.
611 * 3. Change the block information of donor inode to point at the saved
612 * original inode blocks in the dummy extents.
613 *
614 * Return 0 on success, or a negative error value on failure.
615 */
616static int
617mext_replace_branches(handle_t *handle, struct inode *orig_inode,
618 struct inode *donor_inode, ext4_lblk_t from,
619 ext4_lblk_t count)
620{
621 struct ext4_ext_path *orig_path = NULL;
622 struct ext4_ext_path *donor_path = NULL;
623 struct ext4_extent *oext, *dext;
624 struct ext4_extent tmp_dext, tmp_oext;
625 ext4_lblk_t orig_off = from, donor_off = from;
626 int err = 0;
627 int depth;
628 int replaced_count = 0;
629 int dext_alen;
630
631 mext_double_down_write(orig_inode, donor_inode);
632
633 /* Get the original extent for the block "orig_off" */
634 get_ext_path(orig_path, orig_inode, orig_off, err);
635 if (orig_path == NULL)
636 goto out;
637
638 /* Get the donor extent for the head */
639 get_ext_path(donor_path, donor_inode, donor_off, err);
640 if (donor_path == NULL)
641 goto out;
642 depth = ext_depth(orig_inode);
643 oext = orig_path[depth].p_ext;
644 tmp_oext = *oext;
645
646 depth = ext_depth(donor_inode);
647 dext = donor_path[depth].p_ext;
648 tmp_dext = *dext;
649
650 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
651 donor_off, count);
652
653 /* Loop for the donor extents */
654 while (1) {
655 /* The extent for donor must be found. */
656 BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
657
658 /* Set donor extent to orig extent */
659 err = mext_leaf_block(handle, orig_inode,
660 orig_path, &tmp_dext, &orig_off);
661 if (err < 0)
662 goto out;
663
664 /* Set orig extent to donor extent */
665 err = mext_leaf_block(handle, donor_inode,
666 donor_path, &tmp_oext, &donor_off);
667 if (err < 0)
668 goto out;
669
670 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
671 replaced_count += dext_alen;
672 donor_off += dext_alen;
673 orig_off += dext_alen;
674
675 /* Already moved the expected blocks */
676 if (replaced_count >= count)
677 break;
678
679 if (orig_path)
680 ext4_ext_drop_refs(orig_path);
681 get_ext_path(orig_path, orig_inode, orig_off, err);
682 if (orig_path == NULL)
683 goto out;
684 depth = ext_depth(orig_inode);
685 oext = orig_path[depth].p_ext;
686 if (le32_to_cpu(oext->ee_block) +
687 ext4_ext_get_actual_len(oext) <= orig_off) {
688 err = 0;
689 goto out;
690 }
691 tmp_oext = *oext;
692
693 if (donor_path)
694 ext4_ext_drop_refs(donor_path);
695 get_ext_path(donor_path, donor_inode,
696 donor_off, err);
697 if (donor_path == NULL)
698 goto out;
699 depth = ext_depth(donor_inode);
700 dext = donor_path[depth].p_ext;
701 if (le32_to_cpu(dext->ee_block) +
702 ext4_ext_get_actual_len(dext) <= donor_off) {
703 err = 0;
704 goto out;
705 }
706 tmp_dext = *dext;
707
708 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
709 donor_off,
710 count - replaced_count);
711 }
712
713out:
714 if (orig_path) {
715 ext4_ext_drop_refs(orig_path);
716 kfree(orig_path);
717 }
718 if (donor_path) {
719 ext4_ext_drop_refs(donor_path);
720 kfree(donor_path);
721 }
722
723 mext_double_up_write(orig_inode, donor_inode);
724 return err;
725}
726
727/**
728 * move_extent_per_page - Move extent data per page
729 *
730 * @o_filp: file structure of original file
731 * @donor_inode: donor inode
732 * @orig_page_offset: page index on original file
733 * @data_offset_in_page: block index where data swapping starts
734 * @block_len_in_page: the number of blocks to be swapped
735 * @uninit: whether the orig extent is uninitialized
736 *
737 * Save the data in original inode blocks and replace original inode extents
738 * with donor inode extents by calling mext_replace_branches().
739 * Finally, write out the saved data in new original inode blocks. Return 0
740 * on success, or a negative error value on failure.
741 */
742static int
743move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
744 pgoff_t orig_page_offset, int data_offset_in_page,
745 int block_len_in_page, int uninit)
746{
747 struct inode *orig_inode = o_filp->f_dentry->d_inode;
748 struct address_space *mapping = orig_inode->i_mapping;
749 struct buffer_head *bh;
750 struct page *page = NULL;
751 const struct address_space_operations *a_ops = mapping->a_ops;
752 handle_t *handle;
753 ext4_lblk_t orig_blk_offset;
754 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
755 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
756 unsigned int w_flags = 0;
757 unsigned int tmp_data_len, data_len;
758 void *fsdata;
759 int ret, i, jblocks;
760 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
761
762 /*
763 * We need twice the number of ordinary journal buffers because
764 * orig_inode and donor_inode may each modify different metadata blocks.
765 */
766 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
767 handle = ext4_journal_start(orig_inode, jblocks);
768 if (IS_ERR(handle)) {
769 ret = PTR_ERR(handle);
770 return ret;
771 }
772
773 if (segment_eq(get_fs(), KERNEL_DS))
774 w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
775
776 orig_blk_offset = orig_page_offset * blocks_per_page +
777 data_offset_in_page;
778
779 /*
780 * If the orig extent is uninitialized,
781 * there is no need to force the page into memory
782 * and then force it to be written out again.
783 * Just swap data blocks between orig and donor.
784 */
785 if (uninit) {
786 ret = mext_replace_branches(handle, orig_inode,
787 donor_inode, orig_blk_offset,
788 block_len_in_page);
789
790 /* Clear the inode cache not to refer to the old data */
791 ext4_ext_invalidate_cache(orig_inode);
792 ext4_ext_invalidate_cache(donor_inode);
793 goto out2;
794 }
795
796 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
797
798 /* Calculate data_len */
799 if ((orig_blk_offset + block_len_in_page - 1) ==
800 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
801 /* Replace the last block */
802 tmp_data_len = orig_inode->i_size & (blocksize - 1);
803 /*
804 * If tmp_data_len is zero, the file size is a multiple of the
805 * blocksize, so use one full block instead.
806 */
807 if (tmp_data_len == 0)
808 tmp_data_len = blocksize;
809
810 data_len = tmp_data_len +
811 ((block_len_in_page - 1) << orig_inode->i_blkbits);
812 } else {
813 data_len = block_len_in_page << orig_inode->i_blkbits;
814 }
815
816 ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
817 &page, &fsdata);
818 if (unlikely(ret < 0))
819 goto out;
820
821 if (!PageUptodate(page)) {
822 mapping->a_ops->readpage(o_filp, page);
823 lock_page(page);
824 }
825
826 /*
827 * try_to_release_page() doesn't call releasepage in writeback mode.
828 * We also need to preserve the write ordering when multiple
829 * move-extent processes write to the same file, so call
830 * wait_on_page_writeback() to wait until the page's
831 * writeback completes.
832 */
833 if (PageWriteback(page))
834 wait_on_page_writeback(page);
835
836 /* Release old bh and drop refs */
837 try_to_release_page(page, 0);
838
839 ret = mext_replace_branches(handle, orig_inode, donor_inode,
840 orig_blk_offset, block_len_in_page);
841 if (ret < 0)
842 goto out;
843
844 /* Clear the inode cache not to refer to the old data */
845 ext4_ext_invalidate_cache(orig_inode);
846 ext4_ext_invalidate_cache(donor_inode);
847
848 if (!page_has_buffers(page))
849 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
850
851 bh = page_buffers(page);
852 for (i = 0; i < data_offset_in_page; i++)
853 bh = bh->b_this_page;
854
855 for (i = 0; i < block_len_in_page; i++) {
856 ret = ext4_get_block(orig_inode,
857 (sector_t)(orig_blk_offset + i), bh, 0);
858 if (ret < 0)
859 goto out;
860
861 if (bh->b_this_page != NULL)
862 bh = bh->b_this_page;
863 }
864
865 ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
866 page, fsdata);
867 page = NULL;
868
869out:
870 if (unlikely(page)) {
871 if (PageLocked(page))
872 unlock_page(page);
873 page_cache_release(page);
874 }
875out2:
876 ext4_journal_stop(handle);
877
878 return ret < 0 ? ret : 0;
879}
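
The page/block arithmetic used by move_extent_per_page() converts a page
index plus an in-page block index into a file-relative block number and byte
offset. A standalone sketch with hypothetical sizes (4 KiB pages, 1 KiB
blocks):

    #include <stdio.h>

    int main(void)
    {
        unsigned int page_shift = 12;   /* PAGE_CACHE_SHIFT */
        unsigned int blkbits = 10;      /* inode->i_blkbits */
        unsigned int blocks_per_page = 1u << (page_shift - blkbits); /* 4 */

        unsigned long orig_page_offset = 3;  /* fourth page of the file */
        int data_offset_in_page = 2;         /* third block in that page */

        unsigned long blk = orig_page_offset * blocks_per_page +
                            data_offset_in_page;        /* block 14 */
        long long offs = (long long)blk << blkbits;     /* byte 14336 */

        printf("block %lu, byte offset %lld\n", blk, offs);
        return 0;
    }
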
880
881/**
882 * mext_check_arguments - Check whether move extent can be done
883 *
884 * @orig_inode: original inode
885 * @donor_inode: donor inode
886 * @orig_start: logical start offset in block for orig
887 * @donor_start: logical start offset in block for donor
888 * @len: the number of blocks to be moved
889 * @moved_len: moved block length
890 *
891 * Check the arguments of ext4_move_extents() to see whether the files
892 * can be exchanged with each other.
893 * Return 0 on success, or a negative error value on failure.
894 */
895static int
896mext_check_arguments(struct inode *orig_inode,
897 struct inode *donor_inode, __u64 orig_start,
898 __u64 donor_start, __u64 *len, __u64 moved_len)
899{
900 /* Regular file check */
901 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
902 ext4_debug("ext4 move extent: The argument files should be "
903 "regular file [ino:orig %lu, donor %lu]\n",
904 orig_inode->i_ino, donor_inode->i_ino);
905 return -EINVAL;
906 }
907
908 /* Ext4 move extent does not support swapfile */
909 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
910 ext4_debug("ext4 move extent: The argument files should "
911 "not be swapfile [ino:orig %lu, donor %lu]\n",
912 orig_inode->i_ino, donor_inode->i_ino);
913 return -EINVAL;
914 }
915
916 /* Files should be in the same ext4 FS */
917 if (orig_inode->i_sb != donor_inode->i_sb) {
918 ext4_debug("ext4 move extent: The argument files "
919 "should be in same FS [ino:orig %lu, donor %lu]\n",
920 orig_inode->i_ino, donor_inode->i_ino);
921 return -EINVAL;
922 }
923
924 /* orig and donor should be different files */
925 if (orig_inode->i_ino == donor_inode->i_ino) {
926 ext4_debug("ext4 move extent: The argument files should not "
927 "be same file [ino:orig %lu, donor %lu]\n",
928 orig_inode->i_ino, donor_inode->i_ino);
929 return -EINVAL;
930 }
931
932 /* Ext4 move extent supports only extent based file */
933 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
934 ext4_debug("ext4 move extent: orig file is not extents "
935 "based file [ino:orig %lu]\n", orig_inode->i_ino);
936 return -EOPNOTSUPP;
937 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
938 ext4_debug("ext4 move extent: donor file is not extents "
939 "based file [ino:donor %lu]\n", donor_inode->i_ino);
940 return -EOPNOTSUPP;
941 }
942
943 if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
944 ext4_debug("ext4 move extent: File size is 0 byte\n");
945 return -EINVAL;
946 }
947
948 /* Start offsets should be the same */
949 if (orig_start != donor_start) {
950 ext4_debug("ext4 move extent: orig and donor's start "
951 "offset are not same [ino:orig %lu, donor %lu]\n",
952 orig_inode->i_ino, donor_inode->i_ino);
953 return -EINVAL;
954 }
955
956 if (moved_len) {
957 ext4_debug("ext4 move extent: moved_len should be 0 "
958 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
959 donor_inode->i_ino);
960 return -EINVAL;
961 }
962
963 if ((orig_start > MAX_DEFRAG_SIZE) ||
964 (donor_start > MAX_DEFRAG_SIZE) ||
965 (*len > MAX_DEFRAG_SIZE) ||
966 (orig_start + *len > MAX_DEFRAG_SIZE)) {
967 ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
968 "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
969 orig_inode->i_ino, donor_inode->i_ino);
970 return -EINVAL;
971 }
972
973 if (orig_inode->i_size > donor_inode->i_size) {
974 if (orig_start >= donor_inode->i_size) {
975 ext4_debug("ext4 move extent: orig start offset "
976 "[%llu] should be less than donor file size "
977 "[%lld] [ino:orig %lu, donor_inode %lu]\n",
978 orig_start, donor_inode->i_size,
979 orig_inode->i_ino, donor_inode->i_ino);
980 return -EINVAL;
981 }
982
983 if (orig_start + *len > donor_inode->i_size) {
984 ext4_debug("ext4 move extent: End offset [%llu] should "
985 "be less than donor file size [%lld]."
986 "So adjust length from %llu to %lld "
987 "[ino:orig %lu, donor %lu]\n",
988 orig_start + *len, donor_inode->i_size,
989 *len, donor_inode->i_size - orig_start,
990 orig_inode->i_ino, donor_inode->i_ino);
991 *len = donor_inode->i_size - orig_start;
992 }
993 } else {
994 if (orig_start >= orig_inode->i_size) {
995 ext4_debug("ext4 move extent: start offset [%llu] "
996 "should be less than original file size "
997 "[%lld] [inode:orig %lu, donor %lu]\n",
998 orig_start, orig_inode->i_size,
999 orig_inode->i_ino, donor_inode->i_ino);
1000 return -EINVAL;
1001 }
1002
1003 if (orig_start + *len > orig_inode->i_size) {
1004 ext4_debug("ext4 move extent: Adjust length "
1005 "from %llu to %lld. Because it should be "
1006 "less than original file size "
1007 "[ino:orig %lu, donor %lu]\n",
1008 *len, orig_inode->i_size - orig_start,
1009 orig_inode->i_ino, donor_inode->i_ino);
1010 *len = orig_inode->i_size - orig_start;
1011 }
1012 }
1013
1014 if (!*len) {
1015 ext4_debug("ext4 move extent: len shoudld not be 0 "
1016 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1017 donor_inode->i_ino);
1018 return -EINVAL;
1019 }
1020
1021 return 0;
1022}
1023
1024/**
1025 * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
1026 *
1027 * @inode1: the inode structure
1028 * @inode2: the inode structure
1029 *
1030 * Lock two inodes' i_mutex by i_ino order. This function is moved from
1031 * fs/inode.c.
1032 */
1033static void
1034mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1035{
1036 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
1037 if (inode1)
1038 mutex_lock(&inode1->i_mutex);
1039 else if (inode2)
1040 mutex_lock(&inode2->i_mutex);
1041 return;
1042 }
1043
1044 if (inode1->i_ino < inode2->i_ino) {
1045 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
1046 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
1047 } else {
1048 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1049 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1050 }
1051}
1052
1053/**
1054 * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
1055 *
1056 * @inode1: the inode that is released first
1057 * @inode2: the inode that is released second
1058 *
1059 * This function is moved from fs/inode.c.
1060 */
1061
1062static void
1063mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1064{
1065 if (inode1)
1066 mutex_unlock(&inode1->i_mutex);
1067
1068 if (inode2 && inode2 != inode1)
1069 mutex_unlock(&inode2->i_mutex);
1070}
1071
1072/**
1073 * ext4_move_extents - Exchange the specified range of a file
1074 *
1075 * @o_filp: file structure of the original file
1076 * @d_filp: file structure of the donor file
1077 * @orig_start: start offset in block for orig
1078 * @donor_start: start offset in block for donor
1079 * @len: the number of blocks to be moved
1080 * @moved_len: moved block length
1081 *
1082 * This function returns 0 and moved block length is set in moved_len
1083 * if succeed, otherwise returns error value.
1084 *
1085 * Note: ext4_move_extents() proceeds in the following order:
1086 * 1:ext4_move_extents() calculates the last block number of the move
1087 *   from the start block number (orig_start) and the number of blocks
1088 *   to be moved (len) given as arguments.
1089 *   If {orig, donor}_start points into a hole, ext_cur (the current
1090 *   extent), holecheck_path and orig_path are advanced to the extent
1091 *   just behind the hole.
1092 * 2:Repeat steps 3 to 5 until holecheck_path points to the last extent
1093 *   or ext_cur exceeds block_end, the last logical block number.
1094 * 3:To get the length of the contiguous area, call mext_next_extent()
1095 *   repeatedly on ext_cur (initially taken from holecheck_path) until a
1096 *   non-contiguous extent is found, the start logical block number
1097 *   exceeds block_end, or the last extent is reached.
1098 * 4:Exchange the original inode data with the donor inode data
1099 *   from orig_page_offset to seq_end_page.
1100 *   The start indexes of the data are passed as arguments.
1101 *   That of the original inode is orig_page_offset,
1102 *   and that of the donor inode is also orig_page_offset
1103 *   (to easily handle the blocksize != pagesize case, the offset for
1104 *   the donor inode is in block units).
1105 * 5:Update holecheck_path and orig_path to point to the next extent,
1106 *   then return to step 2.
1107 * 6:Release holecheck_path and orig_path, and set moved_len to len,
1108 *   the number of moved blocks.
1109 *   moved_len lets the caller compute the file offset at which to
1110 *   start the next move-extent ioctl.
1111 * 7:Return 0 on success, or a negative error value on failure.
1112 */
1113int
1114ext4_move_extents(struct file *o_filp, struct file *d_filp,
1115 __u64 orig_start, __u64 donor_start, __u64 len,
1116 __u64 *moved_len)
1117{
1118 struct inode *orig_inode = o_filp->f_dentry->d_inode;
1119 struct inode *donor_inode = d_filp->f_dentry->d_inode;
1120 struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
1121 struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
1122 ext4_lblk_t block_start = orig_start;
1123 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1124 ext4_lblk_t rest_blocks;
1125 pgoff_t orig_page_offset = 0, seq_end_page;
1126 int ret, depth, last_extent = 0;
1127 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1128 int data_offset_in_page;
1129 int block_len_in_page;
1130 int uninit;
1131
1132 /* protect orig and donor against a truncate */
1133 mext_inode_double_lock(orig_inode, donor_inode);
1134
1135 mext_double_down_read(orig_inode, donor_inode);
1136 /* Check whether move_extent can be done in this filesystem environment */
1137 ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
1138 donor_start, &len, *moved_len);
1139 mext_double_up_read(orig_inode, donor_inode);
1140 if (ret)
1141 goto out2;
1142
1143 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
1144 block_end = block_start + len - 1;
1145 if (file_end < block_end)
1146 len -= block_end - file_end;
1147
1148 get_ext_path(orig_path, orig_inode, block_start, ret);
1149 if (orig_path == NULL)
1150 goto out2;
1151
1152 /* Get path structure to check the hole */
1153 get_ext_path(holecheck_path, orig_inode, block_start, ret);
1154 if (holecheck_path == NULL)
1155 goto out;
1156
1157 depth = ext_depth(orig_inode);
1158 ext_cur = holecheck_path[depth].p_ext;
1159 if (ext_cur == NULL) {
1160 ret = -EINVAL;
1161 goto out;
1162 }
1163
1164 /*
1165 * Get the proper extent whose ee_block is beyond block_start
1166 * if block_start falls within a hole.
1167 */
1168 if (le32_to_cpu(ext_cur->ee_block) +
1169 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1170 last_extent = mext_next_extent(orig_inode,
1171 holecheck_path, &ext_cur);
1172 if (last_extent < 0) {
1173 ret = last_extent;
1174 goto out;
1175 }
1176 last_extent = mext_next_extent(orig_inode, orig_path,
1177 &ext_dummy);
1178 if (last_extent < 0) {
1179 ret = last_extent;
1180 goto out;
1181 }
1182 }
1183 seq_start = block_start;
1184
1185 /* No blocks within the specified range. */
1186 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1187 ext4_debug("ext4 move extent: The specified range of file "
1188 "may be the hole\n");
1189 ret = -EINVAL;
1190 goto out;
1191 }
1192
1193 /* Adjust start blocks */
1194 add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
1195 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
1196 max(le32_to_cpu(ext_cur->ee_block), block_start);
1197
1198 while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
1199 seq_blocks += add_blocks;
1200
1201 /* Adjust tail blocks */
1202 if (seq_start + seq_blocks - 1 > block_end)
1203 seq_blocks = block_end - seq_start + 1;
1204
1205 ext_prev = ext_cur;
1206 last_extent = mext_next_extent(orig_inode, holecheck_path,
1207 &ext_cur);
1208 if (last_extent < 0) {
1209 ret = last_extent;
1210 break;
1211 }
1212 add_blocks = ext4_ext_get_actual_len(ext_cur);
1213
1214 /*
1215 * Extend the length of contiguous block (seq_blocks)
1216 * if extents are contiguous.
1217 */
1218 if (ext4_can_extents_be_merged(orig_inode,
1219 ext_prev, ext_cur) &&
1220 block_end >= le32_to_cpu(ext_cur->ee_block) &&
1221 !last_extent)
1222 continue;
1223
1224 /* Is the original extent uninitialized? */
1225 uninit = ext4_ext_is_uninitialized(ext_prev);
1226
1227 data_offset_in_page = seq_start % blocks_per_page;
1228
1229 /*
1230 * Calculate data blocks count that should be swapped
1231 * at the first page.
1232 */
1233 if (data_offset_in_page + seq_blocks > blocks_per_page) {
1234 /* Swapped blocks are across pages */
1235 block_len_in_page =
1236 blocks_per_page - data_offset_in_page;
1237 } else {
1238 /* Swapped blocks are in a page */
1239 block_len_in_page = seq_blocks;
1240 }
1241
1242 orig_page_offset = seq_start >>
1243 (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
1244 seq_end_page = (seq_start + seq_blocks - 1) >>
1245 (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
1246 seq_start = le32_to_cpu(ext_cur->ee_block);
1247 rest_blocks = seq_blocks;
1248
1249 /* Discard preallocations of two inodes */
1250 down_write(&EXT4_I(orig_inode)->i_data_sem);
1251 ext4_discard_preallocations(orig_inode);
1252 up_write(&EXT4_I(orig_inode)->i_data_sem);
1253
1254 down_write(&EXT4_I(donor_inode)->i_data_sem);
1255 ext4_discard_preallocations(donor_inode);
1256 up_write(&EXT4_I(donor_inode)->i_data_sem);
1257
1258 while (orig_page_offset <= seq_end_page) {
1259
1260 /* Swap original branches with new branches */
1261 ret = move_extent_per_page(o_filp, donor_inode,
1262 orig_page_offset,
1263 data_offset_in_page,
1264 block_len_in_page, uninit);
1265 if (ret < 0)
1266 goto out;
1267 orig_page_offset++;
1268 /* Count how many blocks we have exchanged */
1269 *moved_len += block_len_in_page;
1270 BUG_ON(*moved_len > len);
1271
1272 data_offset_in_page = 0;
1273 rest_blocks -= block_len_in_page;
1274 if (rest_blocks > blocks_per_page)
1275 block_len_in_page = blocks_per_page;
1276 else
1277 block_len_in_page = rest_blocks;
1278 }
1279
1280 /* Decrease buffer counter */
1281 if (holecheck_path)
1282 ext4_ext_drop_refs(holecheck_path);
1283 get_ext_path(holecheck_path, orig_inode,
1284 seq_start, ret);
1285 if (holecheck_path == NULL)
1286 break;
1287 depth = holecheck_path->p_depth;
1288
1289 /* Decrease buffer counter */
1290 if (orig_path)
1291 ext4_ext_drop_refs(orig_path);
1292 get_ext_path(orig_path, orig_inode, seq_start, ret);
1293 if (orig_path == NULL)
1294 break;
1295
1296 ext_cur = holecheck_path[depth].p_ext;
1297 add_blocks = ext4_ext_get_actual_len(ext_cur);
1298 seq_blocks = 0;
1299
1300 }
1301out:
1302 if (orig_path) {
1303 ext4_ext_drop_refs(orig_path);
1304 kfree(orig_path);
1305 }
1306 if (holecheck_path) {
1307 ext4_ext_drop_refs(holecheck_path);
1308 kfree(holecheck_path);
1309 }
1310out2:
1311 mext_inode_double_unlock(orig_inode, donor_inode);
1312
1313 if (ret)
1314 return ret;
1315
1316 /* On success, all of the specified blocks must have been exchanged */
1317 BUG_ON(*moved_len != len);
1318
1319 return 0;
1320}
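
ext4_move_extents() is reached from userspace through the EXT4_IOC_MOVE_EXT
ioctl, which is what a defragmenter such as e4defrag drives. The sketch
below shows the expected calling convention; the struct layout and ioctl
number are assumptions for illustration, and the authoritative definitions
live in the kernel's ext4 headers:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>

    /* Assumed layout; take the real one from the ext4 headers. */
    struct move_extent {
        __u32 reserved;     /* should be zero */
        __u32 donor_fd;     /* donor file descriptor */
        __u64 orig_start;   /* logical start block in the original file */
        __u64 donor_start;  /* logical start block in the donor file */
        __u64 len;          /* number of blocks to move */
        __u64 moved_len;    /* filled in by the kernel */
    };

    #ifndef EXT4_IOC_MOVE_EXT
    #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
    #endif

    int main(int argc, char **argv)
    {
        struct move_extent me;
        int orig_fd, donor_fd;

        if (argc != 3)
            return 1;
        orig_fd = open(argv[1], O_RDWR);
        donor_fd = open(argv[2], O_WRONLY);
        if (orig_fd < 0 || donor_fd < 0)
            return 1;

        memset(&me, 0, sizeof(me));  /* moved_len must start at 0 */
        me.donor_fd = donor_fd;
        me.orig_start = 0;
        me.donor_start = 0;          /* must equal orig_start */
        me.len = 16;                 /* blocks to exchange */

        if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0) {
            perror("EXT4_IOC_MOVE_EXT");
            return 1;
        }
        printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
        return 0;
    }
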
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1cd085..de04013d16ff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
-#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
 
@@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 		ext4fs_dirhash(de->name, de->name_len, &h);
 		map_tail--;
 		map_tail->hash = h.hash;
-		map_tail->offs = (u16) ((char *) de - base);
+		map_tail->offs = ((char *) de - base)>>2;
 		map_tail->size = le16_to_cpu(de->rec_len);
 		count++;
 		cond_resched();
@@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
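
The two hunks above change dx_map_entry.offs from a byte offset to a
4-byte-unit offset: directory entries are 4-byte aligned, so the low two
bits are always zero, and storing offs >> 2 lets a 16-bit field address a
256 KiB block instead of 64 KiB. The pack/unpack round trip in isolation:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t offs = 70000;          /* offset of an entry in the block */
        uint16_t packed;

        assert((offs & 3) == 0);        /* 4-byte aligned */
        assert(offs < (1u << 18));      /* fits after the >> 2 packing */

        packed = (uint16_t)(offs >> 2); /* store: map_tail->offs */
        assert(((uint32_t)packed << 2) == offs); /* load: offs << 2 */

        printf("packed %u -> %u\n", offs, packed);
        return 0;
    }
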
@@ -1782,7 +1782,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+			       &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1997,7 +1998,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
 	if (!list_empty(&EXT4_I(inode)->i_orphan))
 		goto out_unlock;
 
@@ -2006,9 +2007,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 
 	/* @@@ FIXME: Observation from aviro:
 	 * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
-	 * here (on lock_super()), so race with ext4_link() which might bump
+	 * here (on s_orphan_lock), so race with ext4_link() which might bump
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
+	 *
+	 * tytso, 4/25/2009: I'm not sure how that could happen;
+	 * shouldn't the fs core protect us from these sort of
+	 * unlink()/link() races?
 	 */
 	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2050,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -2066,11 +2071,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(inode->i_sb);
-	if (list_empty(&ei->i_orphan)) {
-		unlock_super(inode->i_sb);
-		return 0;
-	}
+	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+	if (list_empty(&ei->i_orphan))
+		goto out;
 
 	ino_next = NEXT_ORPHAN(inode);
 	prev = ei->i_orphan.prev;
@@ -2120,7 +2123,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
 	ext4_std_error(inode->i_sb, err);
 out:
-	unlock_super(inode->i_sb);
+	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
 	return err;
 
 out_brelse:
@@ -2262,7 +2265,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+			       &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2533,6 +2537,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext4_permission,
+	.fiemap		= ext4_fiemap,
 };
 
 const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a00..000000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd869e1..68b0351fc647 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 
 #include "ext4_jbd2.h"
-#include "group.h"
 
 #define outside(b, first, last)	((b) < (first) || (b) >= (last))
 #define inside(b, first, last)	((b) >= (first) && (b) < (last))
@@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		err = -EBUSY;
 		goto exit_journal;
@@ -302,7 +301,7 @@ exit_bh:
 	brelse(bh);
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 
@@ -643,11 +642,12 @@ exit_free:
  * important part is that the new block and inode counts are in the backup
  * superblocks, and the location of the new group metadata in the GDT backups.
  *
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted.  We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time.  The resize which changed s_groups_count will backup again.
+ * We do not need to take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
  */
 static void update_backups(struct super_block *sb,
 			   int blk_off, char *data, int size)
@@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
@@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	/*
 	 * OK, now we've set up the new group.  Time to make it active.
 	 *
-	 * Current kernels don't lock all allocations via lock_super(),
+	 * We do not lock all allocations via s_resize_lock
 	 * so we have to be safe wrt. concurrent accesses the group
 	 * data.  So we need to be careful to set all of the relevant
 	 * group descriptor data etc. *before* we enable the group.
@@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
  *
  * The precise rules we use are:
  *
- * * Writers of s_groups_count *must* hold lock_super
+ * * Writers of s_groups_count *must* hold s_resize_lock
  * AND
 * * Writers must perform a smp_wmb() after updating all dependent
 *   data and before modifying the groups count
 *
- * * Readers must hold lock_super() over the access
+ * * Readers must hold s_resize_lock over the access
 * OR
 * * Readers must perform an smp_rmb() after reading the groups count
 *   and before reading any dependent data.
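
The comment hunk above spells out a publication protocol: a writer makes all
dependent group data visible before it bumps s_groups_count (smp_wmb()), and
a lockless reader pairs that with smp_rmb() after loading the count. A
userspace analogue of the same pattern, using C11 release/acquire in place
of the kernel barriers (hypothetical data layout):

    #include <stdatomic.h>
    #include <stdio.h>

    struct group { long blocks; };

    static struct group groups[128];
    static atomic_int groups_count;

    static void add_group(int idx, long blocks)
    {
        groups[idx].blocks = blocks;      /* dependent data first */
        atomic_store_explicit(&groups_count, idx + 1,
                              memory_order_release); /* then publish */
    }

    static long read_last_group(void)
    {
        int n = atomic_load_explicit(&groups_count,
                                     memory_order_acquire);
        /* A reader that sees the new count also sees the group data. */
        return n ? groups[n - 1].blocks : -1;
    }

    int main(void)
    {
        add_group(0, 32768);
        printf("%ld\n", read_last_group());
        return 0;
    }
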
@@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	sb->s_dirt = 1;
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 	if (!err) {
@@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
-	 * taking lock_super() below. */
+	 * taking the s_resize_lock below. */
 	o_blocks_count = ext4_blocks_count(es);
 	o_groups_count = EXT4_SB(sb)->s_groups_count;
 
@@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
+			ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
@@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
 	if (o_blocks_count != ext4_blocks_count(es)) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		err = -EBUSY;
 		goto exit_put;
@@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_SB(sb)->s_sbh))) {
 		ext4_warning(sb, __func__,
 			     "error %d on journal write access", err);
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
 	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	/* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222..8f4f079e6b9a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include <linux/jbd2.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -36,7 +37,6 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/ctype.h>
-#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,16 +45,23 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "namei.h"
-#include "group.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+		   int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+		 "Default number of entries saved for mb_history");
 
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
-			     struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
 					struct ext4_super_block *es);
 static void ext4_clear_journal_err(struct super_block *sb,
@@ -74,7 +81,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -82,7 +89,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -90,7 +97,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_table_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
 __u32 ext4_free_blks_count(struct super_block *sb,
@@ -98,7 +105,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 }
 
 __u32 ext4_free_inodes_count(struct super_block *sb,
@@ -106,7 +113,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 }
 
 __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -114,7 +121,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 }
 
 __u32 ext4_itable_unused_count(struct super_block *sb,
@@ -122,7 +129,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_itable_unused_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 }
 
 void ext4_block_bitmap_set(struct super_block *sb,
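
The accessors touched above all follow one pattern: splice a wide value out of two on-disk little-endian halves, folding the high half in only when the block group descriptor is large enough to carry it. A self-contained illustration (the struct here is a simplified stand-in, not the real ext4 descriptor):

	#include <stdint.h>

	struct demo_desc {
		uint32_t lo;	/* low half, always present */
		uint32_t hi;	/* high half, valid only with 64-bit descriptors */
	};

	static uint64_t demo_read(const struct demo_desc *d, int has_64bit_desc)
	{
		/* splice lo and hi; hi contributes nothing on small descriptors */
		return (uint64_t)d->lo |
		       (has_64bit_desc ? (uint64_t)d->hi << 32 : 0);
	}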
@@ -202,8 +209,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
 		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, __func__,
-				   "Detected aborted journal");
+			ext4_abort(sb, __func__, "Detected aborted journal");
 			return ERR_PTR(-EROFS);
 		}
 		return jbd2_journal_start(journal, nblocks);
@@ -297,15 +303,15 @@ static void ext4_handle_error(struct super_block *sb)
 	if (!test_opt(sb, ERRORS_CONT)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
-		EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
 	}
 	if (test_opt(sb, ERRORS_RO)) {
-		printk(KERN_CRIT "Remounting filesystem read-only\n");
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
@@ -395,8 +401,6 @@ void ext4_abort(struct super_block *sb, const char *function,
 {
 	va_list args;
 
-	printk(KERN_CRIT "ext4_abort called.\n");
-
 	va_start(args, fmt);
 	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
 	vprintk(fmt, args);
@@ -409,14 +413,26 @@ void ext4_abort(struct super_block *sb, const char *function,
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	printk(KERN_CRIT "Remounting filesystem read-only\n");
+	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
-	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 	if (EXT4_SB(sb)->s_journal)
 		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
+void ext4_msg (struct super_block * sb, const char *prefix,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
 void ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
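
ext4_msg(), added above, centralizes the "EXT4-fs (<device>):" prefix that the many converted call sites below previously open-coded, and it supplies the trailing newline itself. A hypothetical call site for illustration (the message text is invented):

	/* For sb->s_id == "sda1" this emits, at KERN_INFO level:
	 *   EXT4-fs (sda1): mounted with ordered data mode
	 * Callers pass neither the prefix nor a "\n". */
	ext4_msg(sb, KERN_INFO, "mounted with %s data mode", "ordered");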
@@ -431,7 +447,7 @@ void ext4_warning(struct super_block *sb, const char *function,
 }
 
 void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-			    const char *function, const char *fmt, ...)
+			   const char *function, const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
@@ -447,7 +463,7 @@ __acquires(bitlock)
 	if (test_opt(sb, ERRORS_CONT)) {
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super(sb, es, 0);
+		ext4_commit_super(sb, 0);
 		return;
 	}
 	ext4_unlock_group(sb, grp);
@@ -467,7 +483,6 @@ __acquires(bitlock)
 	return;
 }
 
-
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -496,7 +511,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 /*
  * Open the external journal device
  */
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 {
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
@@ -507,7 +522,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
 			__bdevname(dev, b), PTR_ERR(bdev));
 	return NULL;
 }
@@ -543,8 +558,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 {
 	struct list_head *l;
 
-	printk(KERN_ERR "sb orphan head is %d\n",
-	       le32_to_cpu(sbi->s_es->s_last_orphan));
+	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+		 le32_to_cpu(sbi->s_es->s_last_orphan));
 
 	printk(KERN_ERR "sb_info orphan list:\n");
 	list_for_each(l, &sbi->s_orphan) {
@@ -563,6 +578,12 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	lock_super(sb);
+	lock_kernel();
+	if (sb->s_dirt)
+		ext4_commit_super(sb, 1);
+
+	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
@@ -576,7 +597,7 @@ static void ext4_put_super(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 	if (sbi->s_proc) {
 		remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -586,7 +607,10 @@ static void ext4_put_super(struct super_block *sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_flex_groups);
+	if (is_vmalloc_addr(sbi->s_flex_groups))
+		vfree(sbi->s_flex_groups);
+	else
+		kfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -625,11 +649,8 @@ static void ext4_put_super(struct super_block *sb)
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
-	lock_super(sb);
-	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
 }
 
 static struct kmem_cache *ext4_inode_cachep;
@@ -644,10 +665,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	ei->i_acl = EXT4_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
+
 	ei->vfs_inode.i_version = 1;
 	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -664,14 +682,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_allocated_meta_blocks = 0;
 	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
+
 	return &ei->vfs_inode;
 }
 
 static void ext4_destroy_inode(struct inode *inode)
 {
 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
-		printk("EXT4 Inode %p: orphan list check failed!\n",
-			EXT4_I(inode));
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): orphan list check failed!",
+			 inode->i_ino, EXT4_I(inode));
 		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
 				EXT4_I(inode), sizeof(struct ext4_inode_info),
 				true);
@@ -711,18 +731,6 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	if (EXT4_I(inode)->i_acl &&
-	    EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
-		posix_acl_release(EXT4_I(inode)->i_acl);
-		EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
-	}
-	if (EXT4_I(inode)->i_default_acl &&
-	    EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
-		posix_acl_release(EXT4_I(inode)->i_default_acl);
-		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
-	}
-#endif
 	ext4_discard_preallocations(inode);
 	if (EXT4_JOURNAL(inode))
 		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -870,12 +878,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noauto_da_alloc");
 
 	ext4_show_quota_options(seq, sb);
+
 	return 0;
 }
 
-
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 					u64 ino, u32 generation)
 {
 	struct inode *inode;
 
@@ -904,14 +912,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 }
 
 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
 }
 
 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
@@ -923,7 +931,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
  * which would prevent try_to_free_buffers() from freeing them, we must use
  * jbd2 layer's try_to_free_buffers() function to release them.
  */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
@@ -992,7 +1001,6 @@ static const struct super_operations ext4_sops = {
 	.dirty_inode = ext4_dirty_inode,
 	.delete_inode = ext4_delete_inode,
 	.put_super = ext4_put_super,
-	.write_super = ext4_write_super,
 	.sync_fs = ext4_sync_fs,
 	.freeze_fs = ext4_freeze,
 	.unfreeze_fs = ext4_unfreeze,
@@ -1007,6 +1015,25 @@ static const struct super_operations ext4_sops = {
 	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
+static const struct super_operations ext4_nojournal_sops = {
+	.alloc_inode = ext4_alloc_inode,
+	.destroy_inode = ext4_destroy_inode,
+	.write_inode = ext4_write_inode,
+	.dirty_inode = ext4_dirty_inode,
+	.delete_inode = ext4_delete_inode,
+	.write_super = ext4_write_super,
+	.put_super = ext4_put_super,
+	.statfs = ext4_statfs,
+	.remount_fs = ext4_remount,
+	.clear_inode = ext4_clear_inode,
+	.show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read = ext4_quota_read,
+	.quota_write = ext4_quota_write,
+#endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
 static const struct export_operations ext4_export_ops = {
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
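
The new ext4_nojournal_sops table keeps .write_super, which the journaled ext4_sops table drops above: without a journal there is no jbd2 commit to push superblock updates to disk. Presumably the mount path then picks a table based on whether a journal was loaded; a sketch of that selection, which is an assumption and not shown in this hunk:

	/* Assumed selection logic; the actual assignment site is elsewhere. */
	if (EXT4_SB(sb)->s_journal)
		sb->s_op = &ext4_sops;			/* journal flushes the sb */
	else
		sb->s_op = &ext4_nojournal_sops;	/* rely on ->write_super */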
@@ -1023,12 +1050,13 @@ enum {
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_data_err_abort, Opt_data_err_ignore,
+	Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
 	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
@@ -1069,6 +1097,7 @@ static const match_table_t tokens = {
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_err_abort, "data_err=abort"},
 	{Opt_data_err_ignore, "data_err=ignore"},
+	{Opt_mb_history_length, "mb_history_length=%u"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1087,6 +1116,8 @@ static const match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_block_validity, "block_validity"},
+	{Opt_noblock_validity, "noblock_validity"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
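
New mount options enter through this match_table_t; match_token() from <linux/parser.h> returns the enum tag and captures any %u argument into the substring array. A condensed sketch of how the new mb_history_length token is consumed, mirroring the parse_options() case added further down:

	substring_t args[MAX_OPT_ARGS];
	int token, option;

	token = match_token(p, tokens, args);	/* p is one "name=value" chunk */
	switch (token) {
	case Opt_mb_history_length:
		if (match_int(&args[0], &option) || option < 0)
			return 0;		/* reject malformed or negative */
		sbi->s_mb_history_max = option;
		break;
	}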
@@ -1102,8 +1133,9 @@ static ext4_fsblk_t get_sb_block(void **data)
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
+
 	options += 3;
-	/*todo: use simple_strtoll with >32bit ext4 */
+	/* TODO: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1113,6 +1145,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	if (*options == ',')
 		options++;
 	*data = (void *) options;
+
 	return sb_block;
 }
 
@@ -1206,8 +1239,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk(KERN_ERR "EXT4 (no)user_xattr options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
 			break;
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1220,8 +1252,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk(KERN_ERR "EXT4 (no)acl options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
 			break;
 #endif
 		case Opt_journal_update:
@@ -1231,16 +1262,16 @@ static int parse_options(char *options, struct super_block *sb,
 			   user to specify an existing inode to be the
 			   journal file. */
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot specify journal on remount");
 				return 0;
 			}
 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_dev:
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
					 "Cannot specify journal on remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option))
@@ -1294,9 +1325,8 @@ static int parse_options(char *options, struct super_block *sb,
 			if (is_remount) {
 				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
 						!= data_opt) {
-					printk(KERN_ERR
-						"EXT4-fs: cannot change data "
-						"mode on remount\n");
+					ext4_msg(sb, KERN_ERR,
						 "Cannot change data mode on remount");
 					return 0;
 				}
 			} else {
@@ -1310,6 +1340,13 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_data_err_ignore:
 			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
 			break;
+		case Opt_mb_history_length:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_mb_history_max = option;
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1319,31 +1356,31 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR
-				       "EXT4-fs: Cannot change journaled "
-				       "quota options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot change journaled "
+					 "quota options when quota turned on");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
 			if (!qname) {
-				printk(KERN_ERR
-				       "EXT4-fs: not enough memory for "
-				       "storing quotafile name.\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Not enough memory for "
+					 "storing quotafile name");
 				return 0;
 			}
 			if (sbi->s_qf_names[qtype] &&
 			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				printk(KERN_ERR
-				       "EXT4-fs: %s quota file already "
-				       "specified.\n", QTYPE2NAME(qtype));
+				ext4_msg(sb, KERN_ERR,
+					 "%s quota file already "
+					 "specified", QTYPE2NAME(qtype));
 				kfree(qname);
 				return 0;
 			}
 			sbi->s_qf_names[qtype] = qname;
 			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				printk(KERN_ERR
-				       "EXT4-fs: quotafile must be on "
-				       "filesystem root.\n");
+				ext4_msg(sb, KERN_ERR,
+					 "quotafile must be on "
+					 "filesystem root");
 				kfree(sbi->s_qf_names[qtype]);
 				sbi->s_qf_names[qtype] = NULL;
 				return 0;
@@ -1358,9 +1395,9 @@ set_qf_name:
 clear_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			/*
@@ -1377,9 +1414,9 @@ clear_qf_name:
 set_qf_format:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			sbi->s_jquota_fmt = qfmt;
@@ -1395,8 +1432,8 @@ set_qf_format:
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_loaded(sb)) {
-				printk(KERN_ERR "EXT4-fs: Cannot change quota "
-				       "options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR, "Cannot change quota "
+					 "options when quota turned on");
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1407,8 +1444,8 @@ set_qf_format:
 		case Opt_quota:
 		case Opt_usrquota:
 		case Opt_grpquota:
-			printk(KERN_ERR
-			       "EXT4-fs: quota options not supported.\n");
+			ext4_msg(sb, KERN_ERR,
+				 "quota options not supported");
 			break;
 		case Opt_usrjquota:
 		case Opt_grpjquota:
@@ -1416,15 +1453,14 @@ set_qf_format:
 		case Opt_offgrpjquota:
 		case Opt_jqfmt_vfsold:
 		case Opt_jqfmt_vfsv0:
-			printk(KERN_ERR
-			       "EXT4-fs: journaled quota options not "
-			       "supported.\n");
+			ext4_msg(sb, KERN_ERR,
				 "journaled quota options not supported");
 			break;
 		case Opt_noquota:
 			break;
 #endif
 		case Opt_abort:
-			set_opt(sbi->s_mount_opt, ABORT);
+			sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
 			break;
 		case Opt_nobarrier:
 			clear_opt(sbi->s_mount_opt, BARRIER);
@@ -1443,8 +1479,9 @@ set_qf_format:
 			break;
 		case Opt_resize:
 			if (!is_remount) {
-				printk("EXT4-fs: resize option only available "
-				       "for remount\n");
+				ext4_msg(sb, KERN_ERR,
+					 "resize option only available "
+					 "for remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option) != 0)
@@ -1474,14 +1511,21 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_block_validity:
+			set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
+		case Opt_noblock_validity:
+			clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
 		case Opt_inode_readahead_blks:
 			if (match_int(&args[0], &option))
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
-			if (option & (option - 1)) {
-				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
-				       " must be a power of 2\n");
+			if (!is_power_of_2(option)) {
+				ext4_msg(sb, KERN_ERR,
+					 "EXT4-fs: inode_readahead_blks"
+					 " must be a power of 2");
 				return 0;
 			}
 			sbi->s_inode_readahead_blks = option;
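
Both the old open-coded test and is_power_of_2() rest on the same identity: a positive power of two has exactly one set bit, so clearing the lowest set bit with n & (n - 1) yields zero. A standalone restatement:

	/* e.g. n = 8 (1000b): 8 & 7 (0111b) == 0  ->  power of two
	 *      n = 6 (0110b): 6 & 5 (0101b) == 4  ->  not a power of two */
	static inline int demo_is_power_of_2(unsigned long n)
	{
		return n != 0 && (n & (n - 1)) == 0;
	}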
@@ -1508,9 +1552,9 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		default:
-			printk(KERN_ERR
-			       "EXT4-fs: Unrecognized mount option \"%s\" "
-			       "or missing value\n", p);
+			ext4_msg(sb, KERN_ERR,
+				 "Unrecognized mount option \"%s\" "
+				 "or missing value", p);
 			return 0;
 		}
 	}
@@ -1528,21 +1572,21 @@ set_qf_format:
 		     (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
 		    (sbi->s_qf_names[GRPQUOTA] &&
 		     (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
-			printk(KERN_ERR "EXT4-fs: old and new quota "
-			       "format mixing.\n");
+			ext4_msg(sb, KERN_ERR, "old and new quota "
+				 "format mixing");
 			return 0;
 		}
 
 		if (!sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
-			       "not specified.\n");
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
+				 "not specified");
 			return 0;
 		}
 	} else {
 		if (sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
 				"specified with no journaling "
-				"enabled.\n");
+				"enabled");
 			return 0;
 		}
 	}
@@ -1557,32 +1601,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	int res = 0;
 
 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-		printk(KERN_ERR "EXT4-fs warning: revision level too high, "
-		       "forcing read-only mode\n");
+		ext4_msg(sb, KERN_ERR, "revision level too high, "
+			 "forcing read-only mode");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
-		printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+			 "running e2fsck is recommended");
 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: mounting fs with errors, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: mounting fs with errors, "
+			 "running e2fsck is recommended");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: maximal mount count reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: maximal mount count reached, "
+			 "running e2fsck is recommended");
 	else if (le32_to_cpu(es->s_checkinterval) &&
 		(le32_to_cpu(es->s_lastcheck) +
 		 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: checktime reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: checktime reached, "
+			 "running e2fsck is recommended");
 	if (!sbi->s_journal)
 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1592,10 +1636,10 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	if (sbi->s_journal)
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+				"bpg=%lu, ipg=%lu, mo=%04x]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
 			EXT4_BLOCKS_PER_GROUP(sb),
@@ -1603,11 +1647,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			sbi->s_mount_opt);
 
 	if (EXT4_SB(sb)->s_journal) {
-		printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-		       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
-		       "external", EXT4_SB(sb)->s_journal->j_devname);
+		ext4_msg(sb, KERN_INFO, "%s journal on %s",
+			 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+			 "external", EXT4_SB(sb)->s_journal->j_devname);
 	} else {
-		printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+		ext4_msg(sb, KERN_INFO, "no journal");
 	}
 	return res;
 }
@@ -1616,10 +1660,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
-	struct buffer_head *bh;
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
+	size_t size;
 	int i;
 
 	if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,16 +1678,21 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
 			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
 			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-	sbi->s_flex_groups = kzalloc(flex_group_count *
-				     sizeof(struct flex_groups), GFP_KERNEL);
+	size = flex_group_count * sizeof(struct flex_groups);
+	sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		sbi->s_flex_groups = vmalloc(size);
+		if (sbi->s_flex_groups)
+			memset(sbi->s_flex_groups, 0, size);
+	}
 	if (sbi->s_flex_groups == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory for "
-		       "%u flex groups\n", flex_group_count);
+		ext4_msg(sb, KERN_ERR, "not enough memory for "
+			 "%u flex groups", flex_group_count);
 		goto failed;
 	}
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
-		gdp = ext4_get_group_desc(sb, i, &bh);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
 		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
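
This allocation pairs with the is_vmalloc_addr()-guarded free added to ext4_put_super() earlier in the patch: kzalloc() is tried first for a physically contiguous buffer, and vmalloc(), which does not zero, is the fallback when the flex-group array grows too large for the slab path. The combined pattern, condensed:

	void *buf = kzalloc(size, GFP_KERNEL);	/* fast path */
	if (buf == NULL) {
		buf = vmalloc(size);		/* large-allocation fallback */
		if (buf)
			memset(buf, 0, size);	/* vmalloc() does not zero */
	}
	/* ... use buf ... */
	if (is_vmalloc_addr(buf))	/* free with the matching routine */
		vfree(buf);
	else
		kfree(buf);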
@@ -1724,44 +1773,44 @@ static int ext4_check_descriptors(struct super_block *sb)
 
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Block bitmap for group %u not in group "
-			       "(block %llu)!\n", i, block_bitmap);
+			       "(block %llu)!", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode bitmap for group %u not in group "
-			       "(block %llu)!\n", i, inode_bitmap);
+			       "(block %llu)!", i, inode_bitmap);
 			return 0;
 		}
 		inode_table = ext4_inode_table(sb, gdp);
 		if (inode_table < first_block ||
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode table for group %u not in group "
-			       "(block %llu)!\n", i, inode_table);
+			       "(block %llu)!", i, inode_table);
 			return 0;
 		}
-		spin_lock(sb_bgl_lock(sbi, i));
+		ext4_lock_group(sb, i);
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Checksum for group %u failed (%u!=%u)\n",
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Checksum for group %u failed (%u!=%u)",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			            gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
-				spin_unlock(sb_bgl_lock(sbi, i));
+				ext4_unlock_group(sb, i);
 				return 0;
 			}
 		}
-		spin_unlock(sb_bgl_lock(sbi, i));
+		ext4_unlock_group(sb, i);
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
 	return 1;
 }
 
@@ -1796,8 +1845,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (bdev_read_only(sb->s_bdev)) {
-		printk(KERN_ERR "EXT4-fs: write access "
-		       "unavailable, skipping orphan cleanup.\n");
+		ext4_msg(sb, KERN_ERR, "write access "
+			 "unavailable, skipping orphan cleanup");
 		return;
 	}
 
@@ -1811,8 +1860,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (s_flags & MS_RDONLY) {
-		printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
-		       sb->s_id);
+		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
 		sb->s_flags &= ~MS_RDONLY;
 	}
 #ifdef CONFIG_QUOTA
@@ -1823,9 +1871,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
 			if (ret < 0)
-				printk(KERN_ERR
-				       "EXT4-fs: Cannot turn on journaled "
-				       "quota: error %d\n", ret);
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot turn on journaled "
+					 "quota: error %d", ret);
 		}
 	}
 #endif
@@ -1842,16 +1890,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		vfs_dq_init(inode);
 		if (inode->i_nlink) {
-			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %lld bytes\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: truncating inode %lu to %lld bytes",
 				__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
 		} else {
-			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %lu\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: deleting unreferenced inode %lu",
 				__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
@@ -1863,11 +1911,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
 
 	if (nr_orphans)
-		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
-		       sb->s_id, PLURAL(nr_orphans));
+		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+			 PLURAL(nr_orphans));
 	if (nr_truncates)
-		printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
-		       sb->s_id, PLURAL(nr_truncates));
+		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+			 PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
@@ -1877,6 +1925,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+
 /*
  * Maximal extent format file size.
  * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1894,7 +1943,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LBD is not enabled implies the inode
+		 * CONFIG_LBDAF is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -1927,19 +1976,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
 	loff_t upper_limit;
-	/* This is calculated to be the largest file size for a
-	 * dense, bitmapped file such that the total number of
-	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^48 -1
-	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
-	 * total number of 512 bytes blocks of the file
+	/* This is calculated to be the largest file size for a dense, block
+	 * mapped file such that the file's total number of 512-byte sectors,
+	 * including data and all indirect blocks, does not exceed (2^48 - 1).
+	 *
+	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+	 * number of 512-byte sectors of the file.
	 */
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LBD is not enabled
-		 * implies the inode i_block represent total blocks in
-		 * 512 bytes 32 == size of vfs inode i_blocks * 8
+		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
+		 * the inode i_block field represents total file blocks in
+		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
		 */
 		upper_limit = (1LL << 32) - 1;
 
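
The rewritten comment pins down the limit the code below enforces: i_blocks counts 512-byte sectors in 48 bits (or 32 bits without huge-file support). The implied byte ceilings, worked out for reference:

	/* 48-bit sector count: (2^48 - 1) * 512 ~= 2^57 bytes ~= 128 PiB
	 * 32-bit sector count: (2^32 - 1) * 512 ~= 2^41 bytes ~=   2 TiB */
	unsigned long long huge_limit  = ((1ULL << 48) - 1) * 512;
	unsigned long long small_limit = ((1ULL << 32) - 1) * 512;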
@@ -1981,7 +2030,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 }
 
 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-				ext4_fsblk_t logical_sb_block, int nr)
+				   ext4_fsblk_t logical_sb_block, int nr)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t bg, first_meta_bg;
@@ -1995,6 +2044,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext4_bg_has_super(sb, bg))
 		has_super = 1;
+
 	return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
@@ -2091,8 +2141,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	if (parse_strtoul(buf, 0x40000000, &t))
 		return -EINVAL;
 
-	/* inode_readahead_blks must be a power of 2 */
-	if (t & (t-1))
+	if (!is_power_of_2(t))
 		return -EINVAL;
 
 	sbi->s_inode_readahead_blks = t;
@@ -2100,7 +2149,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 }
 
 static ssize_t sbi_ui_show(struct ext4_attr *a,
-				struct ext4_sb_info *sbi, char *buf)
+			   struct ext4_sb_info *sbi, char *buf)
 {
 	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
 
@@ -2141,6 +2190,7 @@ EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2153,6 +2203,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
 	ATTR_LIST(mb_max_to_scan),
 	ATTR_LIST(mb_min_to_scan),
@@ -2205,7 +2256,6 @@ static struct kobj_type ext4_ktype = {
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		__releases(kernel_lock)
 		__acquires(kernel_lock)
-
 {
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
@@ -2256,7 +2306,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
-		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
 		goto out_fail;
 	}
 
@@ -2272,7 +2322,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logical_sb_block))) {
-		printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+		ext4_msg(sb, KERN_ERR, "unable to read superblock");
 		goto out_fail;
 	}
 	/*
@@ -2321,6 +2371,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+	sbi->s_mb_history_max = default_mb_history_length;
 
 	set_opt(sbi->s_mount_opt, BARRIER);
 
@@ -2330,7 +2381,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
-
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
@@ -2342,9 +2392,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: feature flags set on rev 0 fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "feature flags set on rev 0 fs, "
+			 "running e2fsck is recommended");
 
 	/*
 	 * Check feature flags regardless of the revision level, since we
@@ -2353,16 +2403,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
 	if (features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			 "Couldn't mount because of "
+			 "unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
 			~EXT4_FEATURE_INCOMPAT_SUPP));
 		goto failed_mount;
 	}
 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			 "Couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
 			~EXT4_FEATURE_RO_COMPAT_SUPP));
 		goto failed_mount;
@@ -2372,13 +2424,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBD
+		 * mount if kernel is build with CONFIG_LBDAF
		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
-			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
-			       "files cannot be mounted read-write "
-			       "without CONFIG_LBD.\n", sb->s_id);
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
+				 "files cannot be mounted read-write "
+				 "without CONFIG_LBDAF");
 			goto failed_mount;
 		}
 	}
@@ -2386,17 +2438,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
-		printk(KERN_ERR
-		       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
-		       blocksize, sb->s_id);
+		ext4_msg(sb, KERN_ERR,
			 "Unsupported filesystem blocksize %d", blocksize);
 		goto failed_mount;
 	}
 
 	if (sb->s_blocksize != blocksize) {
-
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+			ext4_msg(sb, KERN_ERR, "bad block size %d",
 				blocksize);
 			goto failed_mount;
 		}
@@ -2406,15 +2456,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		offset = do_div(logical_sb_block, blocksize);
 		bh = sb_bread(sb, logical_sb_block);
 		if (!bh) {
-			printk(KERN_ERR
-			       "EXT4-fs: Can't read superblock on 2nd try.\n");
+			ext4_msg(sb, KERN_ERR,
+				 "Can't read superblock on 2nd try");
 			goto failed_mount;
 		}
 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-			printk(KERN_ERR
-			       "EXT4-fs: Magic mismatch, very weird !\n");
+			ext4_msg(sb, KERN_ERR,
+				 "Magic mismatch, very weird!");
 			goto failed_mount;
 		}
 	}
@@ -2432,30 +2482,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unsupported inode size: %d\n",
+			ext4_msg(sb, KERN_ERR,
+				 "unsupported inode size: %d",
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
 			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
 	}
+
 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
2445 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 2496 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
2446 sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 2497 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
2447 !is_power_of_2(sbi->s_desc_size)) { 2498 !is_power_of_2(sbi->s_desc_size)) {
2448 printk(KERN_ERR 2499 ext4_msg(sb, KERN_ERR,
2449 "EXT4-fs: unsupported descriptor size %lu\n", 2500 "unsupported descriptor size %lu",
2450 sbi->s_desc_size); 2501 sbi->s_desc_size);
2451 goto failed_mount; 2502 goto failed_mount;
2452 } 2503 }
2453 } else 2504 } else
2454 sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 2505 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
2506
2455 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 2507 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
2456 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 2508 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
2457 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) 2509 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
2458 goto cantfind_ext4; 2510 goto cantfind_ext4;
2511
2459 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); 2512 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
2460 if (sbi->s_inodes_per_block == 0) 2513 if (sbi->s_inodes_per_block == 0)
2461 goto cantfind_ext4; 2514 goto cantfind_ext4;
@@ -2466,6 +2519,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2466 sbi->s_mount_state = le16_to_cpu(es->s_state); 2519 sbi->s_mount_state = le16_to_cpu(es->s_state);
2467 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 2520 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
2468 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 2521 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
2522
2469 for (i = 0; i < 4; i++) 2523 for (i = 0; i < 4; i++)
2470 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2524 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2471 sbi->s_def_hash_version = es->s_def_hash_version; 2525 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2483,25 +2537,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2483 } 2537 }
2484 2538
2485 if (sbi->s_blocks_per_group > blocksize * 8) { 2539 if (sbi->s_blocks_per_group > blocksize * 8) {
2486 printk(KERN_ERR 2540 ext4_msg(sb, KERN_ERR,
2487 "EXT4-fs: #blocks per group too big: %lu\n", 2541 "#blocks per group too big: %lu",
2488 sbi->s_blocks_per_group); 2542 sbi->s_blocks_per_group);
2489 goto failed_mount; 2543 goto failed_mount;
2490 } 2544 }
2491 if (sbi->s_inodes_per_group > blocksize * 8) { 2545 if (sbi->s_inodes_per_group > blocksize * 8) {
2492 printk(KERN_ERR 2546 ext4_msg(sb, KERN_ERR,
2493 "EXT4-fs: #inodes per group too big: %lu\n", 2547 "#inodes per group too big: %lu",
2494 sbi->s_inodes_per_group); 2548 sbi->s_inodes_per_group);
2495 goto failed_mount; 2549 goto failed_mount;
2496 } 2550 }
2497 2551
2498 if (ext4_blocks_count(es) > 2552 if (ext4_blocks_count(es) >
2499 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2553 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
2500 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 2554 ext4_msg(sb, KERN_ERR, "filesystem"
2501 " too large to mount safely\n", sb->s_id); 2555 " too large to mount safely");
2502 if (sizeof(sector_t) < 8) 2556 if (sizeof(sector_t) < 8)
2503 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " 2557 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2504 "enabled\n");
2505 goto failed_mount; 2558 goto failed_mount;
2506 } 2559 }
2507 2560
@@ -2511,21 +2564,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2511 /* check blocks count against device size */ 2564 /* check blocks count against device size */
2512 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 2565 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2513 if (blocks_count && ext4_blocks_count(es) > blocks_count) { 2566 if (blocks_count && ext4_blocks_count(es) > blocks_count) {
2514 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " 2567 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
2515 "exceeds size of device (%llu blocks)\n", 2568 "exceeds size of device (%llu blocks)",
2516 ext4_blocks_count(es), blocks_count); 2569 ext4_blocks_count(es), blocks_count);
2517 goto failed_mount; 2570 goto failed_mount;
2518 } 2571 }
2519 2572
2520 /* 2573 /*
2521 * It makes no sense for the first data block to be beyond the end 2574 * It makes no sense for the first data block to be beyond the end
2522 * of the filesystem. 2575 * of the filesystem.
2523 */ 2576 */
2524 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 2577 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2525 printk(KERN_WARNING "EXT4-fs: bad geometry: first data " 2578 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
2526 "block %u is beyond end of filesystem (%llu)\n", 2579 "block %u is beyond end of filesystem (%llu)",
2527 le32_to_cpu(es->s_first_data_block), 2580 le32_to_cpu(es->s_first_data_block),
2528 ext4_blocks_count(es)); 2581 ext4_blocks_count(es));
2529 goto failed_mount; 2582 goto failed_mount;
2530 } 2583 }
2531 blocks_count = (ext4_blocks_count(es) - 2584 blocks_count = (ext4_blocks_count(es) -
@@ -2533,9 +2586,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2533 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2586 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2534 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2587 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2535 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { 2588 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2536 printk(KERN_WARNING "EXT4-fs: groups count too large: %u " 2589 ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
2537 "(block count %llu, first data block %u, " 2590 "(block count %llu, first data block %u, "
2538 "blocks per group %lu)\n", sbi->s_groups_count, 2591 "blocks per group %lu)", sbi->s_groups_count,
2539 ext4_blocks_count(es), 2592 ext4_blocks_count(es),
2540 le32_to_cpu(es->s_first_data_block), 2593 le32_to_cpu(es->s_first_data_block),
2541 EXT4_BLOCKS_PER_GROUP(sb)); 2594 EXT4_BLOCKS_PER_GROUP(sb));
@@ -2547,7 +2600,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2547 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
2548 GFP_KERNEL); 2601 GFP_KERNEL);
2549 if (sbi->s_group_desc == NULL) { 2602 if (sbi->s_group_desc == NULL) {
2550 printk(KERN_ERR "EXT4-fs: not enough memory\n"); 2603 ext4_msg(sb, KERN_ERR, "not enough memory");
2551 goto failed_mount; 2604 goto failed_mount;
2552 } 2605 }
2553 2606
@@ -2562,21 +2615,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2562 block = descriptor_loc(sb, logical_sb_block, i); 2615 block = descriptor_loc(sb, logical_sb_block, i);
2563 sbi->s_group_desc[i] = sb_bread(sb, block); 2616 sbi->s_group_desc[i] = sb_bread(sb, block);
2564 if (!sbi->s_group_desc[i]) { 2617 if (!sbi->s_group_desc[i]) {
2565 printk(KERN_ERR "EXT4-fs: " 2618 ext4_msg(sb, KERN_ERR,
2566 "can't read group descriptor %d\n", i); 2619 "can't read group descriptor %d", i);
2567 db_count = i; 2620 db_count = i;
2568 goto failed_mount2; 2621 goto failed_mount2;
2569 } 2622 }
2570 } 2623 }
2571 if (!ext4_check_descriptors(sb)) { 2624 if (!ext4_check_descriptors(sb)) {
2572 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2625 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2573 goto failed_mount2; 2626 goto failed_mount2;
2574 } 2627 }
2575 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2628 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2576 if (!ext4_fill_flex_info(sb)) { 2629 if (!ext4_fill_flex_info(sb)) {
2577 printk(KERN_ERR 2630 ext4_msg(sb, KERN_ERR,
2578 "EXT4-fs: unable to initialize " 2631 "unable to initialize "
2579 "flex_bg meta info!\n"); 2632 "flex_bg meta info!");
2580 goto failed_mount2; 2633 goto failed_mount2;
2581 } 2634 }
2582 2635
@@ -2598,7 +2651,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2598 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 2651 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2599 } 2652 }
2600 if (err) { 2653 if (err) {
2601 printk(KERN_ERR "EXT4-fs: insufficient memory\n"); 2654 ext4_msg(sb, KERN_ERR, "insufficient memory");
2602 goto failed_mount3; 2655 goto failed_mount3;
2603 } 2656 }
2604 2657
@@ -2607,7 +2660,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2607 /* 2660 /*
2608 * set up enough so that it can read an inode 2661 * set up enough so that it can read an inode
2609 */ 2662 */
2610 sb->s_op = &ext4_sops; 2663 if (!test_opt(sb, NOLOAD) &&
2664 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
2665 sb->s_op = &ext4_sops;
2666 else
2667 sb->s_op = &ext4_nojournal_sops;
2611 sb->s_export_op = &ext4_export_ops; 2668 sb->s_export_op = &ext4_export_ops;
2612 sb->s_xattr = ext4_xattr_handlers; 2669 sb->s_xattr = ext4_xattr_handlers;
2613#ifdef CONFIG_QUOTA 2670#ifdef CONFIG_QUOTA
@@ -2615,6 +2672,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2615 sb->dq_op = &ext4_quota_operations; 2672 sb->dq_op = &ext4_quota_operations;
2616#endif 2673#endif
2617 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 2674 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2675 mutex_init(&sbi->s_orphan_lock);
2676 mutex_init(&sbi->s_resize_lock);
2618 2677
2619 sb->s_root = NULL; 2678 sb->s_root = NULL;
2620 2679
@@ -2632,13 +2691,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2632 goto failed_mount3; 2691 goto failed_mount3;
2633 if (!(sb->s_flags & MS_RDONLY) && 2692 if (!(sb->s_flags & MS_RDONLY) &&
2634 EXT4_SB(sb)->s_journal->j_failed_commit) { 2693 EXT4_SB(sb)->s_journal->j_failed_commit) {
2635 printk(KERN_CRIT "EXT4-fs error (device %s): " 2694 ext4_msg(sb, KERN_CRIT, "error: "
2636 "ext4_fill_super: Journal transaction " 2695 "ext4_fill_super: Journal transaction "
2637 "%u is corrupt\n", sb->s_id, 2696 "%u is corrupt",
2638 EXT4_SB(sb)->s_journal->j_failed_commit); 2697 EXT4_SB(sb)->s_journal->j_failed_commit);
2639 if (test_opt(sb, ERRORS_RO)) { 2698 if (test_opt(sb, ERRORS_RO)) {
2640 printk(KERN_CRIT 2699 ext4_msg(sb, KERN_CRIT,
2641 "Mounting filesystem read-only\n"); 2700 "Mounting filesystem read-only");
2642 sb->s_flags |= MS_RDONLY; 2701 sb->s_flags |= MS_RDONLY;
2643 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2702 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2644 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2703 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2646,14 +2705,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2646 if (test_opt(sb, ERRORS_PANIC)) { 2705 if (test_opt(sb, ERRORS_PANIC)) {
2647 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2706 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2648 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2707 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2649 ext4_commit_super(sb, es, 1); 2708 ext4_commit_super(sb, 1);
2650 goto failed_mount4; 2709 goto failed_mount4;
2651 } 2710 }
2652 } 2711 }
2653 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2712 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2654 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2713 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2655 printk(KERN_ERR "EXT4-fs: required journal recovery " 2714 ext4_msg(sb, KERN_ERR, "required journal recovery "
2656 "suppressed and not mounted read-only\n"); 2715 "suppressed and not mounted read-only");
2657 goto failed_mount4; 2716 goto failed_mount4;
2658 } else { 2717 } else {
2659 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2718 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2666,7 +2725,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2666 if (ext4_blocks_count(es) > 0xffffffffULL && 2725 if (ext4_blocks_count(es) > 0xffffffffULL &&
2667 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2726 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2668 JBD2_FEATURE_INCOMPAT_64BIT)) { 2727 JBD2_FEATURE_INCOMPAT_64BIT)) {
2669 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); 2728 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2670 goto failed_mount4; 2729 goto failed_mount4;
2671 } 2730 }
2672 2731
@@ -2704,8 +2763,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2704 case EXT4_MOUNT_WRITEBACK_DATA: 2763 case EXT4_MOUNT_WRITEBACK_DATA:
2705 if (!jbd2_journal_check_available_features 2764 if (!jbd2_journal_check_available_features
2706 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2765 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2707 printk(KERN_ERR "EXT4-fs: Journal does not support " 2766 ext4_msg(sb, KERN_ERR, "Journal does not support "
2708 "requested data journaling mode\n"); 2767 "requested data journaling mode");
2709 goto failed_mount4; 2768 goto failed_mount4;
2710 } 2769 }
2711 default: 2770 default:
@@ -2717,8 +2776,8 @@ no_journal:
2717 2776
2718 if (test_opt(sb, NOBH)) { 2777 if (test_opt(sb, NOBH)) {
2719 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2778 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2720 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " 2779 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2721 "its supported only with writeback mode\n"); 2780 "its supported only with writeback mode");
2722 clear_opt(sbi->s_mount_opt, NOBH); 2781 clear_opt(sbi->s_mount_opt, NOBH);
2723 } 2782 }
2724 } 2783 }
@@ -2729,18 +2788,18 @@ no_journal:
2729 2788
2730 root = ext4_iget(sb, EXT4_ROOT_INO); 2789 root = ext4_iget(sb, EXT4_ROOT_INO);
2731 if (IS_ERR(root)) { 2790 if (IS_ERR(root)) {
2732 printk(KERN_ERR "EXT4-fs: get root inode failed\n"); 2791 ext4_msg(sb, KERN_ERR, "get root inode failed");
2733 ret = PTR_ERR(root); 2792 ret = PTR_ERR(root);
2734 goto failed_mount4; 2793 goto failed_mount4;
2735 } 2794 }
2736 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2795 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2737 iput(root); 2796 iput(root);
2738 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); 2797 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
2739 goto failed_mount4; 2798 goto failed_mount4;
2740 } 2799 }
2741 sb->s_root = d_alloc_root(root); 2800 sb->s_root = d_alloc_root(root);
2742 if (!sb->s_root) { 2801 if (!sb->s_root) {
2743 printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); 2802 ext4_msg(sb, KERN_ERR, "get root dentry failed");
2744 iput(root); 2803 iput(root);
2745 ret = -ENOMEM; 2804 ret = -ENOMEM;
2746 goto failed_mount4; 2805 goto failed_mount4;
@@ -2769,22 +2828,29 @@ no_journal:
2769 sbi->s_inode_size) { 2828 sbi->s_inode_size) {
2770 sbi->s_want_extra_isize = sizeof(struct ext4_inode) - 2829 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
2771 EXT4_GOOD_OLD_INODE_SIZE; 2830 EXT4_GOOD_OLD_INODE_SIZE;
2772 printk(KERN_INFO "EXT4-fs: required extra inode space not " 2831 ext4_msg(sb, KERN_INFO, "required extra inode space not "
2773 "available.\n"); 2832 "available");
2774 } 2833 }
2775 2834
2776 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2835 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2777 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " 2836 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2778 "requested data journaling mode\n"); 2837 "requested data journaling mode");
2779 clear_opt(sbi->s_mount_opt, DELALLOC); 2838 clear_opt(sbi->s_mount_opt, DELALLOC);
2780 } else if (test_opt(sb, DELALLOC)) 2839 } else if (test_opt(sb, DELALLOC))
2781 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2840 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2841
2842 err = ext4_setup_system_zone(sb);
2843 if (err) {
2844 ext4_msg(sb, KERN_ERR, "failed to initialize system "
2845 "zone (%d)\n", err);
2846 goto failed_mount4;
2847 }
2782 2848
2783 ext4_ext_init(sb); 2849 ext4_ext_init(sb);
2784 err = ext4_mb_init(sb, needs_recovery); 2850 err = ext4_mb_init(sb, needs_recovery);
2785 if (err) { 2851 if (err) {
2786 printk(KERN_ERR "EXT4-fs: failed to initialize mballoc (%d)\n", 2852 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
2787 err); 2853 err);
2788 goto failed_mount4; 2854 goto failed_mount4;
2789 } 2855 }
2790 2856
@@ -2798,19 +2864,11 @@ no_journal:
2798 goto failed_mount4; 2864 goto failed_mount4;
2799 }; 2865 };
2800 2866
2801 /*
2802 * akpm: core read_super() calls in here with the superblock locked.
2803 * That deadlocks, because orphan cleanup needs to lock the superblock
2804 * in numerous places. Here we just pop the lock - it's relatively
2805 * harmless, because we are now ready to accept write_super() requests,
2806 * and aviro says that's the only reason for hanging onto the
2807 * superblock lock.
2808 */
2809 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2867 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2810 ext4_orphan_cleanup(sb, es); 2868 ext4_orphan_cleanup(sb, es);
2811 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2869 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2812 if (needs_recovery) { 2870 if (needs_recovery) {
2813 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2871 ext4_msg(sb, KERN_INFO, "recovery complete");
2814 ext4_mark_recovery_complete(sb, es); 2872 ext4_mark_recovery_complete(sb, es);
2815 } 2873 }
2816 if (EXT4_SB(sb)->s_journal) { 2874 if (EXT4_SB(sb)->s_journal) {
@@ -2823,25 +2881,30 @@ no_journal:
2823 } else 2881 } else
2824 descr = "out journal"; 2882 descr = "out journal";
2825 2883
2826 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", 2884 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
2827 sb->s_id, descr);
2828 2885
2829 lock_kernel(); 2886 lock_kernel();
2830 return 0; 2887 return 0;
2831 2888
2832cantfind_ext4: 2889cantfind_ext4:
2833 if (!silent) 2890 if (!silent)
2834 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", 2891 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
2835 sb->s_id);
2836 goto failed_mount; 2892 goto failed_mount;
2837 2893
2838failed_mount4: 2894failed_mount4:
2839 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); 2895 ext4_msg(sb, KERN_ERR, "mount failed");
2896 ext4_release_system_zone(sb);
2840 if (sbi->s_journal) { 2897 if (sbi->s_journal) {
2841 jbd2_journal_destroy(sbi->s_journal); 2898 jbd2_journal_destroy(sbi->s_journal);
2842 sbi->s_journal = NULL; 2899 sbi->s_journal = NULL;
2843 } 2900 }
2844failed_mount3: 2901failed_mount3:
2902 if (sbi->s_flex_groups) {
2903 if (is_vmalloc_addr(sbi->s_flex_groups))
2904 vfree(sbi->s_flex_groups);
2905 else
2906 kfree(sbi->s_flex_groups);
2907 }
2845 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2908 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2846 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2909 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2847 percpu_counter_destroy(&sbi->s_dirs_counter); 2910 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2862,6 +2925,7 @@ failed_mount:
2862 brelse(bh); 2925 brelse(bh);
2863out_fail: 2926out_fail:
2864 sb->s_fs_info = NULL; 2927 sb->s_fs_info = NULL;
2928 kfree(sbi->s_blockgroup_lock);
2865 kfree(sbi); 2929 kfree(sbi);
2866 lock_kernel(); 2930 lock_kernel();
2867 return ret; 2931 return ret;
@@ -2906,27 +2970,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2906 2970
2907 journal_inode = ext4_iget(sb, journal_inum); 2971 journal_inode = ext4_iget(sb, journal_inum);
2908 if (IS_ERR(journal_inode)) { 2972 if (IS_ERR(journal_inode)) {
2909 printk(KERN_ERR "EXT4-fs: no journal found.\n"); 2973 ext4_msg(sb, KERN_ERR, "no journal found");
2910 return NULL; 2974 return NULL;
2911 } 2975 }
2912 if (!journal_inode->i_nlink) { 2976 if (!journal_inode->i_nlink) {
2913 make_bad_inode(journal_inode); 2977 make_bad_inode(journal_inode);
2914 iput(journal_inode); 2978 iput(journal_inode);
2915 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); 2979 ext4_msg(sb, KERN_ERR, "journal inode is deleted");
2916 return NULL; 2980 return NULL;
2917 } 2981 }
2918 2982
2919 jbd_debug(2, "Journal inode found at %p: %lld bytes\n", 2983 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2920 journal_inode, journal_inode->i_size); 2984 journal_inode, journal_inode->i_size);
2921 if (!S_ISREG(journal_inode->i_mode)) { 2985 if (!S_ISREG(journal_inode->i_mode)) {
2922 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2986 ext4_msg(sb, KERN_ERR, "invalid journal inode");
2923 iput(journal_inode); 2987 iput(journal_inode);
2924 return NULL; 2988 return NULL;
2925 } 2989 }
2926 2990
2927 journal = jbd2_journal_init_inode(journal_inode); 2991 journal = jbd2_journal_init_inode(journal_inode);
2928 if (!journal) { 2992 if (!journal) {
2929 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); 2993 ext4_msg(sb, KERN_ERR, "Could not load journal inode");
2930 iput(journal_inode); 2994 iput(journal_inode);
2931 return NULL; 2995 return NULL;
2932 } 2996 }
@@ -2950,22 +3014,22 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2950 3014
2951 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); 3015 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2952 3016
2953 bdev = ext4_blkdev_get(j_dev); 3017 bdev = ext4_blkdev_get(j_dev, sb);
2954 if (bdev == NULL) 3018 if (bdev == NULL)
2955 return NULL; 3019 return NULL;
2956 3020
2957 if (bd_claim(bdev, sb)) { 3021 if (bd_claim(bdev, sb)) {
2958 printk(KERN_ERR 3022 ext4_msg(sb, KERN_ERR,
2959 "EXT4-fs: failed to claim external journal device.\n"); 3023 "failed to claim external journal device");
2960 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 3024 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2961 return NULL; 3025 return NULL;
2962 } 3026 }
2963 3027
2964 blocksize = sb->s_blocksize; 3028 blocksize = sb->s_blocksize;
2965 hblock = bdev_hardsect_size(bdev); 3029 hblock = bdev_logical_block_size(bdev);
2966 if (blocksize < hblock) { 3030 if (blocksize < hblock) {
2967 printk(KERN_ERR 3031 ext4_msg(sb, KERN_ERR,
2968 "EXT4-fs: blocksize too small for journal device.\n"); 3032 "blocksize too small for journal device");
2969 goto out_bdev; 3033 goto out_bdev;
2970 } 3034 }
2971 3035
@@ -2973,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2973 offset = EXT4_MIN_BLOCK_SIZE % blocksize; 3037 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
2974 set_blocksize(bdev, blocksize); 3038 set_blocksize(bdev, blocksize);
2975 if (!(bh = __bread(bdev, sb_block, blocksize))) { 3039 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2976 printk(KERN_ERR "EXT4-fs: couldn't read superblock of " 3040 ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
2977 "external journal\n"); 3041 "external journal");
2978 goto out_bdev; 3042 goto out_bdev;
2979 } 3043 }
2980 3044
@@ -2982,14 +3046,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2982 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || 3046 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2983 !(le32_to_cpu(es->s_feature_incompat) & 3047 !(le32_to_cpu(es->s_feature_incompat) &
2984 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 3048 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2985 printk(KERN_ERR "EXT4-fs: external journal has " 3049 ext4_msg(sb, KERN_ERR, "external journal has "
2986 "bad superblock\n"); 3050 "bad superblock");
2987 brelse(bh); 3051 brelse(bh);
2988 goto out_bdev; 3052 goto out_bdev;
2989 } 3053 }
2990 3054
2991 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 3055 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2992 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); 3056 ext4_msg(sb, KERN_ERR, "journal UUID does not match");
2993 brelse(bh); 3057 brelse(bh);
2994 goto out_bdev; 3058 goto out_bdev;
2995 } 3059 }
@@ -3001,25 +3065,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3001 journal = jbd2_journal_init_dev(bdev, sb->s_bdev, 3065 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
3002 start, len, blocksize); 3066 start, len, blocksize);
3003 if (!journal) { 3067 if (!journal) {
3004 printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); 3068 ext4_msg(sb, KERN_ERR, "failed to create device journal");
3005 goto out_bdev; 3069 goto out_bdev;
3006 } 3070 }
3007 journal->j_private = sb; 3071 journal->j_private = sb;
3008 ll_rw_block(READ, 1, &journal->j_sb_buffer); 3072 ll_rw_block(READ, 1, &journal->j_sb_buffer);
3009 wait_on_buffer(journal->j_sb_buffer); 3073 wait_on_buffer(journal->j_sb_buffer);
3010 if (!buffer_uptodate(journal->j_sb_buffer)) { 3074 if (!buffer_uptodate(journal->j_sb_buffer)) {
3011 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); 3075 ext4_msg(sb, KERN_ERR, "I/O error on journal device");
3012 goto out_journal; 3076 goto out_journal;
3013 } 3077 }
3014 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 3078 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
3015 printk(KERN_ERR "EXT4-fs: External journal has more than one " 3079 ext4_msg(sb, KERN_ERR, "External journal has more than one "
3016 "user (unsupported) - %d\n", 3080 "user (unsupported) - %d",
3017 be32_to_cpu(journal->j_superblock->s_nr_users)); 3081 be32_to_cpu(journal->j_superblock->s_nr_users));
3018 goto out_journal; 3082 goto out_journal;
3019 } 3083 }
3020 EXT4_SB(sb)->journal_bdev = bdev; 3084 EXT4_SB(sb)->journal_bdev = bdev;
3021 ext4_init_journal_params(sb, journal); 3085 ext4_init_journal_params(sb, journal);
3022 return journal; 3086 return journal;
3087
3023out_journal: 3088out_journal:
3024 jbd2_journal_destroy(journal); 3089 jbd2_journal_destroy(journal);
3025out_bdev: 3090out_bdev:
@@ -3041,8 +3106,8 @@ static int ext4_load_journal(struct super_block *sb,
3041 3106
3042 if (journal_devnum && 3107 if (journal_devnum &&
3043 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3108 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3044 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 3109 ext4_msg(sb, KERN_INFO, "external journal device major/minor "
3045 "numbers have changed\n"); 3110 "numbers have changed");
3046 journal_dev = new_decode_dev(journal_devnum); 3111 journal_dev = new_decode_dev(journal_devnum);
3047 } else 3112 } else
3048 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 3113 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3054,24 +3119,23 @@ static int ext4_load_journal(struct super_block *sb,
3054 * crash? For recovery, we need to check in advance whether we 3119 * crash? For recovery, we need to check in advance whether we
3055 * can get read-write access to the device. 3120 * can get read-write access to the device.
3056 */ 3121 */
3057
3058 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 3122 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3059 if (sb->s_flags & MS_RDONLY) { 3123 if (sb->s_flags & MS_RDONLY) {
3060 printk(KERN_INFO "EXT4-fs: INFO: recovery " 3124 ext4_msg(sb, KERN_INFO, "INFO: recovery "
3061 "required on readonly filesystem.\n"); 3125 "required on readonly filesystem");
3062 if (really_read_only) { 3126 if (really_read_only) {
3063 printk(KERN_ERR "EXT4-fs: write access " 3127 ext4_msg(sb, KERN_ERR, "write access "
3064 "unavailable, cannot proceed.\n"); 3128 "unavailable, cannot proceed");
3065 return -EROFS; 3129 return -EROFS;
3066 } 3130 }
3067 printk(KERN_INFO "EXT4-fs: write access will " 3131 ext4_msg(sb, KERN_INFO, "write access will "
3068 "be enabled during recovery.\n"); 3132 "be enabled during recovery");
3069 } 3133 }
3070 } 3134 }
3071 3135
3072 if (journal_inum && journal_dev) { 3136 if (journal_inum && journal_dev) {
3073 printk(KERN_ERR "EXT4-fs: filesystem has both journal " 3137 ext4_msg(sb, KERN_ERR, "filesystem has both journal "
3074 "and inode journals!\n"); 3138 "and inode journals!");
3075 return -EINVAL; 3139 return -EINVAL;
3076 } 3140 }
3077 3141
@@ -3084,14 +3148,14 @@ static int ext4_load_journal(struct super_block *sb,
3084 } 3148 }
3085 3149
3086 if (journal->j_flags & JBD2_BARRIER) 3150 if (journal->j_flags & JBD2_BARRIER)
3087 printk(KERN_INFO "EXT4-fs: barriers enabled\n"); 3151 ext4_msg(sb, KERN_INFO, "barriers enabled");
3088 else 3152 else
3089 printk(KERN_INFO "EXT4-fs: barriers disabled\n"); 3153 ext4_msg(sb, KERN_INFO, "barriers disabled");
3090 3154
3091 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3155 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
3092 err = jbd2_journal_update_format(journal); 3156 err = jbd2_journal_update_format(journal);
3093 if (err) { 3157 if (err) {
3094 printk(KERN_ERR "EXT4-fs: error updating journal.\n"); 3158 ext4_msg(sb, KERN_ERR, "error updating journal");
3095 jbd2_journal_destroy(journal); 3159 jbd2_journal_destroy(journal);
3096 return err; 3160 return err;
3097 } 3161 }
@@ -3103,7 +3167,7 @@ static int ext4_load_journal(struct super_block *sb,
3103 err = jbd2_journal_load(journal); 3167 err = jbd2_journal_load(journal);
3104 3168
3105 if (err) { 3169 if (err) {
3106 printk(KERN_ERR "EXT4-fs: error loading journal.\n"); 3170 ext4_msg(sb, KERN_ERR, "error loading journal");
3107 jbd2_journal_destroy(journal); 3171 jbd2_journal_destroy(journal);
3108 return err; 3172 return err;
3109 } 3173 }
@@ -3114,18 +3178,17 @@ static int ext4_load_journal(struct super_block *sb,
3114 if (journal_devnum && 3178 if (journal_devnum &&
3115 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3179 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3116 es->s_journal_dev = cpu_to_le32(journal_devnum); 3180 es->s_journal_dev = cpu_to_le32(journal_devnum);
3117 sb->s_dirt = 1;
3118 3181
3119 /* Make sure we flush the recovery flag to disk. */ 3182 /* Make sure we flush the recovery flag to disk. */
3120 ext4_commit_super(sb, es, 1); 3183 ext4_commit_super(sb, 1);
3121 } 3184 }
3122 3185
3123 return 0; 3186 return 0;
3124} 3187}
3125 3188
3126static int ext4_commit_super(struct super_block *sb, 3189static int ext4_commit_super(struct super_block *sb, int sync)
3127 struct ext4_super_block *es, int sync)
3128{ 3190{
3191 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
3129 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 3192 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
3130 int error = 0; 3193 int error = 0;
3131 3194
@@ -3140,8 +3203,8 @@ static int ext4_commit_super(struct super_block *sb,
3140 * be remapped. Nothing we can do but to retry the 3203 * be remapped. Nothing we can do but to retry the
3141 * write and hope for the best. 3204 * write and hope for the best.
3142 */ 3205 */
3143 printk(KERN_ERR "EXT4-fs: previous I/O error to " 3206 ext4_msg(sb, KERN_ERR, "previous I/O error to "
3144 "superblock detected for %s.\n", sb->s_id); 3207 "superblock detected");
3145 clear_buffer_write_io_error(sbh); 3208 clear_buffer_write_io_error(sbh);
3146 set_buffer_uptodate(sbh); 3209 set_buffer_uptodate(sbh);
3147 } 3210 }
@@ -3154,7 +3217,7 @@ static int ext4_commit_super(struct super_block *sb,
3154 &EXT4_SB(sb)->s_freeblocks_counter)); 3217 &EXT4_SB(sb)->s_freeblocks_counter));
3155 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3218 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
3156 &EXT4_SB(sb)->s_freeinodes_counter)); 3219 &EXT4_SB(sb)->s_freeinodes_counter));
3157 3220 sb->s_dirt = 0;
3158 BUFFER_TRACE(sbh, "marking dirty"); 3221 BUFFER_TRACE(sbh, "marking dirty");
3159 mark_buffer_dirty(sbh); 3222 mark_buffer_dirty(sbh);
3160 if (sync) { 3223 if (sync) {
@@ -3164,8 +3227,8 @@ static int ext4_commit_super(struct super_block *sb,
3164 3227
3165 error = buffer_write_io_error(sbh); 3228 error = buffer_write_io_error(sbh);
3166 if (error) { 3229 if (error) {
3167 printk(KERN_ERR "EXT4-fs: I/O error while writing " 3230 ext4_msg(sb, KERN_ERR, "I/O error while writing "
3168 "superblock for %s.\n", sb->s_id); 3231 "superblock");
3169 clear_buffer_write_io_error(sbh); 3232 clear_buffer_write_io_error(sbh);
3170 set_buffer_uptodate(sbh); 3233 set_buffer_uptodate(sbh);
3171 } 3234 }
@@ -3173,7 +3236,6 @@ static int ext4_commit_super(struct super_block *sb,
3173 return error; 3236 return error;
3174} 3237}
3175 3238
3176
3177/* 3239/*
3178 * Have we just finished recovery? If so, and if we are mounting (or 3240 * Have we just finished recovery? If so, and if we are mounting (or
3179 * remounting) the filesystem readonly, then we will end up with a 3241 * remounting) the filesystem readonly, then we will end up with a
@@ -3192,14 +3254,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
3192 if (jbd2_journal_flush(journal) < 0) 3254 if (jbd2_journal_flush(journal) < 0)
3193 goto out; 3255 goto out;
3194 3256
3195 lock_super(sb);
3196 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 3257 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
3197 sb->s_flags & MS_RDONLY) { 3258 sb->s_flags & MS_RDONLY) {
3198 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3259 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3199 sb->s_dirt = 0; 3260 ext4_commit_super(sb, 1);
3200 ext4_commit_super(sb, es, 1);
3201 } 3261 }
3202 unlock_super(sb);
3203 3262
3204out: 3263out:
3205 jbd2_journal_unlock_updates(journal); 3264 jbd2_journal_unlock_updates(journal);
@@ -3238,7 +3297,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
3238 3297
3239 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3298 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3240 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3299 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
3241 ext4_commit_super(sb, es, 1); 3300 ext4_commit_super(sb, 1);
3242 3301
3243 jbd2_journal_clear_err(journal); 3302 jbd2_journal_clear_err(journal);
3244 } 3303 }
@@ -3257,29 +3316,17 @@ int ext4_force_commit(struct super_block *sb)
3257 return 0; 3316 return 0;
3258 3317
3259 journal = EXT4_SB(sb)->s_journal; 3318 journal = EXT4_SB(sb)->s_journal;
3260 if (journal) { 3319 if (journal)
3261 sb->s_dirt = 0;
3262 ret = ext4_journal_force_commit(journal); 3320 ret = ext4_journal_force_commit(journal);
3263 }
3264 3321
3265 return ret; 3322 return ret;
3266} 3323}
3267 3324
3268/*
3269 * Ext4 always journals updates to the superblock itself, so we don't
3270 * have to propagate any other updates to the superblock on disk at this
3271 * point. (We can probably nuke this function altogether, and remove
3272 * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
3273 */
3274static void ext4_write_super(struct super_block *sb) 3325static void ext4_write_super(struct super_block *sb)
3275{ 3326{
3276 if (EXT4_SB(sb)->s_journal) { 3327 lock_super(sb);
3277 if (mutex_trylock(&sb->s_lock) != 0) 3328 ext4_commit_super(sb, 1);
3278 BUG(); 3329 unlock_super(sb);
3279 sb->s_dirt = 0;
3280 } else {
3281 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3282 }
3283} 3330}
3284 3331
3285static int ext4_sync_fs(struct super_block *sb, int wait) 3332static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3287,17 +3334,10 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3287 int ret = 0; 3334 int ret = 0;
3288 tid_t target; 3335 tid_t target;
3289 3336
3290 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3337 trace_ext4_sync_fs(sb, wait);
3291 sb->s_dirt = 0; 3338 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
3292 if (EXT4_SB(sb)->s_journal) { 3339 if (wait)
3293 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, 3340 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
3294 &target)) {
3295 if (wait)
3296 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
3297 target);
3298 }
3299 } else {
3300 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3301 } 3341 }
3302 return ret; 3342 return ret;
3303} 3343}
@@ -3310,34 +3350,32 @@ static int ext4_freeze(struct super_block *sb)
3310{ 3350{
3311 int error = 0; 3351 int error = 0;
3312 journal_t *journal; 3352 journal_t *journal;
3313 sb->s_dirt = 0;
3314 3353
3315 if (!(sb->s_flags & MS_RDONLY)) { 3354 if (sb->s_flags & MS_RDONLY)
3316 journal = EXT4_SB(sb)->s_journal; 3355 return 0;
3317 3356
3318 if (journal) { 3357 journal = EXT4_SB(sb)->s_journal;
3319 /* Now we set up the journal barrier. */
3320 jbd2_journal_lock_updates(journal);
3321 3358
3322 /* 3359 /* Now we set up the journal barrier. */
3323 * We don't want to clear needs_recovery flag when we 3360 jbd2_journal_lock_updates(journal);
3324 * failed to flush the journal.
3325 */
3326 error = jbd2_journal_flush(journal);
3327 if (error < 0)
3328 goto out;
3329 }
3330 3361
3331 /* Journal blocked and flushed, clear needs_recovery flag. */ 3362 /*
3332 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3363 * Don't clear the needs_recovery flag if we failed to flush
3333 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3364 * the journal.
3334 if (error) 3365 */
3335 goto out; 3366 error = jbd2_journal_flush(journal);
3367 if (error < 0) {
3368 out:
3369 jbd2_journal_unlock_updates(journal);
3370 return error;
3336 } 3371 }
3372
3373 /* Journal blocked and flushed, clear needs_recovery flag. */
3374 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3375 error = ext4_commit_super(sb, 1);
3376 if (error)
3377 goto out;
3337 return 0; 3378 return 0;
3338out:
3339 jbd2_journal_unlock_updates(journal);
3340 return error;
3341} 3379}
3342 3380
3343/* 3381/*
@@ -3346,14 +3384,15 @@ out:
3346 */ 3384 */
3347static int ext4_unfreeze(struct super_block *sb) 3385static int ext4_unfreeze(struct super_block *sb)
3348{ 3386{
3349 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { 3387 if (sb->s_flags & MS_RDONLY)
3350 lock_super(sb); 3388 return 0;
3351 /* Reser the needs_recovery flag before the fs is unlocked. */ 3389
3352 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3390 lock_super(sb);
3353 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3391 /* Reset the needs_recovery flag before the fs is unlocked. */
3354 unlock_super(sb); 3392 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3355 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3393 ext4_commit_super(sb, 1);
3356 } 3394 unlock_super(sb);
3395 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3357 return 0; 3396 return 0;
3358} 3397}
3359 3398
@@ -3371,7 +3410,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3371 int i; 3410 int i;
3372#endif 3411#endif
3373 3412
3413 lock_kernel();
3414
3374 /* Store the original options */ 3415 /* Store the original options */
3416 lock_super(sb);
3375 old_sb_flags = sb->s_flags; 3417 old_sb_flags = sb->s_flags;
3376 old_opts.s_mount_opt = sbi->s_mount_opt; 3418 old_opts.s_mount_opt = sbi->s_mount_opt;
3377 old_opts.s_resuid = sbi->s_resuid; 3419 old_opts.s_resuid = sbi->s_resuid;
@@ -3396,7 +3438,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3396 goto restore_opts; 3438 goto restore_opts;
3397 } 3439 }
3398 3440
3399 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) 3441 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
3400 ext4_abort(sb, __func__, "Abort forced by user"); 3442 ext4_abort(sb, __func__, "Abort forced by user");
3401 3443
3402 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3444 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -3411,7 +3453,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3411 3453
3412 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3454 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
3413 n_blocks_count > ext4_blocks_count(es)) { 3455 n_blocks_count > ext4_blocks_count(es)) {
3414 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) { 3456 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
3415 err = -EROFS; 3457 err = -EROFS;
3416 goto restore_opts; 3458 goto restore_opts;
3417 } 3459 }
@@ -3432,22 +3474,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3432 (sbi->s_mount_state & EXT4_VALID_FS)) 3474 (sbi->s_mount_state & EXT4_VALID_FS))
3433 es->s_state = cpu_to_le16(sbi->s_mount_state); 3475 es->s_state = cpu_to_le16(sbi->s_mount_state);
3434 3476
3435 /* 3477 if (sbi->s_journal)
3436 * We have to unlock super so that we can wait for
3437 * transactions.
3438 */
3439 if (sbi->s_journal) {
3440 unlock_super(sb);
3441 ext4_mark_recovery_complete(sb, es); 3478 ext4_mark_recovery_complete(sb, es);
3442 lock_super(sb);
3443 }
3444 } else { 3479 } else {
3445 int ret; 3480 int ret;
3446 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3447 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3448 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3449 "remount RDWR because of unsupported " 3484 "remount RDWR because of unsupported "
3450 "optional features (%x).\n", sb->s_id, 3485 "optional features (%x)",
3451 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & 3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3452 ~EXT4_FEATURE_RO_COMPAT_SUPP)); 3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3453 err = -EROFS; 3488 err = -EROFS;
@@ -3456,17 +3491,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3456 3491
3457 /* 3492 /*
3458 * Make sure the group descriptor checksums 3493 * Make sure the group descriptor checksums
3459 * are sane. If they aren't, refuse to 3494 * are sane. If they aren't, refuse to remount r/w.
3460 * remount r/w.
3461 */ 3495 */
3462 for (g = 0; g < sbi->s_groups_count; g++) { 3496 for (g = 0; g < sbi->s_groups_count; g++) {
3463 struct ext4_group_desc *gdp = 3497 struct ext4_group_desc *gdp =
3464 ext4_get_group_desc(sb, g, NULL); 3498 ext4_get_group_desc(sb, g, NULL);
3465 3499
3466 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3500 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3467 printk(KERN_ERR 3501 ext4_msg(sb, KERN_ERR,
3468 "EXT4-fs: ext4_remount: " 3502 "ext4_remount: Checksum for group %u failed (%u!=%u)",
3469 "Checksum for group %u failed (%u!=%u)\n",
3470 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3503 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3471 le16_to_cpu(gdp->bg_checksum)); 3504 le16_to_cpu(gdp->bg_checksum));
3472 err = -EINVAL; 3505 err = -EINVAL;
@@ -3480,11 +3513,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3480 * require a full umount/remount for now. 3513 * require a full umount/remount for now.
3481 */ 3514 */
3482 if (es->s_last_orphan) { 3515 if (es->s_last_orphan) {
3483 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3516 ext4_msg(sb, KERN_WARNING, "Couldn't "
3484 "remount RDWR because of unprocessed " 3517 "remount RDWR because of unprocessed "
3485 "orphan inode list. Please " 3518 "orphan inode list. Please "
3486 "umount/remount instead.\n", 3519 "umount/remount instead");
3487 sb->s_id);
3488 err = -EINVAL; 3520 err = -EINVAL;
3489 goto restore_opts; 3521 goto restore_opts;
3490 } 3522 }
@@ -3504,8 +3536,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3504 sb->s_flags &= ~MS_RDONLY; 3536 sb->s_flags &= ~MS_RDONLY;
3505 } 3537 }
3506 } 3538 }
3539 ext4_setup_system_zone(sb);
3507 if (sbi->s_journal == NULL) 3540 if (sbi->s_journal == NULL)
3508 ext4_commit_super(sb, es, 1); 3541 ext4_commit_super(sb, 1);
3509 3542
3510#ifdef CONFIG_QUOTA 3543#ifdef CONFIG_QUOTA
3511 /* Release old quota file names */ 3544 /* Release old quota file names */
@@ -3514,7 +3547,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3514 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 3547 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
3515 kfree(old_opts.s_qf_names[i]); 3548 kfree(old_opts.s_qf_names[i]);
3516#endif 3549#endif
3550 unlock_super(sb);
3551 unlock_kernel();
3517 return 0; 3552 return 0;
3553
3518restore_opts: 3554restore_opts:
3519 sb->s_flags = old_sb_flags; 3555 sb->s_flags = old_sb_flags;
3520 sbi->s_mount_opt = old_opts.s_mount_opt; 3556 sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3532,6 +3568,8 @@ restore_opts:
3532 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 3568 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
3533 } 3569 }
3534#endif 3570#endif
3571 unlock_super(sb);
3572 unlock_kernel();
3535 return err; 3573 return err;
3536} 3574}
3537 3575
@@ -3545,9 +3583,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3545 if (test_opt(sb, MINIX_DF)) { 3583 if (test_opt(sb, MINIX_DF)) {
3546 sbi->s_overhead_last = 0; 3584 sbi->s_overhead_last = 0;
3547 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 3585 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
3548 ext4_group_t ngroups = sbi->s_groups_count, i; 3586 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3549 ext4_fsblk_t overhead = 0; 3587 ext4_fsblk_t overhead = 0;
3550 smp_rmb();
3551 3588
3552 /* 3589 /*
3553 * Compute the overhead (FS structures). This is constant 3590 * Compute the overhead (FS structures). This is constant
@@ -3599,11 +3636,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3599 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 3636 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
3600 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 3637 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
3601 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 3638 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
3639
3602 return 0; 3640 return 0;
3603} 3641}
3604 3642
3605/* Helper function for writing quotas on sync - we need to start a transaction before quota file 3643/* Helper function for writing quotas on sync - we need to start a transaction
3606 * is locked for write. Otherwise there are possible deadlocks: 3644 * before quota file is locked for write. Otherwise there are possible deadlocks:
3607 * Process 1 Process 2 3645 * Process 1 Process 2
3608 * ext4_create() quota_sync() 3646 * ext4_create() quota_sync()
3609 * jbd2_journal_start() write_dquot() 3647 * jbd2_journal_start() write_dquot()
@@ -3627,7 +3665,7 @@ static int ext4_write_dquot(struct dquot *dquot)
3627 3665
3628 inode = dquot_to_inode(dquot); 3666 inode = dquot_to_inode(dquot);
3629 handle = ext4_journal_start(inode, 3667 handle = ext4_journal_start(inode,
3630 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 3668 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
3631 if (IS_ERR(handle)) 3669 if (IS_ERR(handle))
3632 return PTR_ERR(handle); 3670 return PTR_ERR(handle);
3633 ret = dquot_commit(dquot); 3671 ret = dquot_commit(dquot);
@@ -3643,7 +3681,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
3643 handle_t *handle; 3681 handle_t *handle;
3644 3682
3645 handle = ext4_journal_start(dquot_to_inode(dquot), 3683 handle = ext4_journal_start(dquot_to_inode(dquot),
3646 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 3684 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
3647 if (IS_ERR(handle)) 3685 if (IS_ERR(handle))
3648 return PTR_ERR(handle); 3686 return PTR_ERR(handle);
3649 ret = dquot_acquire(dquot); 3687 ret = dquot_acquire(dquot);
@@ -3659,7 +3697,7 @@ static int ext4_release_dquot(struct dquot *dquot)
3659 handle_t *handle; 3697 handle_t *handle;
3660 3698
3661 handle = ext4_journal_start(dquot_to_inode(dquot), 3699 handle = ext4_journal_start(dquot_to_inode(dquot),
3662 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 3700 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
3663 if (IS_ERR(handle)) { 3701 if (IS_ERR(handle)) {
3664 /* Release dquot anyway to avoid endless cycle in dqput() */ 3702 /* Release dquot anyway to avoid endless cycle in dqput() */
3665 dquot_release(dquot); 3703 dquot_release(dquot);
@@ -3707,7 +3745,7 @@ static int ext4_write_info(struct super_block *sb, int type)
3707static int ext4_quota_on_mount(struct super_block *sb, int type) 3745static int ext4_quota_on_mount(struct super_block *sb, int type)
3708{ 3746{
3709 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3747 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3710 EXT4_SB(sb)->s_jquota_fmt, type); 3748 EXT4_SB(sb)->s_jquota_fmt, type);
3711} 3749}
3712 3750
3713/* 3751/*
@@ -3738,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3738 if (EXT4_SB(sb)->s_qf_names[type]) { 3776 if (EXT4_SB(sb)->s_qf_names[type]) {
3739 /* Quotafile not in fs root? */ 3777 /* Quotafile not in fs root? */
3740 if (path.dentry->d_parent != sb->s_root) 3778 if (path.dentry->d_parent != sb->s_root)
3741 printk(KERN_WARNING 3779 ext4_msg(sb, KERN_WARNING,
3742 "EXT4-fs: Quota file not on filesystem root. " 3780 "Quota file not on filesystem root. "
3743 "Journaled quota will not work.\n"); 3781 "Journaled quota will not work");
3744 } 3782 }
3745 3783
3746 /* 3784 /*
@@ -3823,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3823 handle_t *handle = journal_current_handle(); 3861 handle_t *handle = journal_current_handle();
3824 3862
3825 if (EXT4_SB(sb)->s_journal && !handle) { 3863 if (EXT4_SB(sb)->s_journal && !handle) {
3826 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3864 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
3827 " cancelled because transaction is not started.\n", 3865 " cancelled because transaction is not started",
3828 (unsigned long long)off, (unsigned long long)len); 3866 (unsigned long long)off, (unsigned long long)len);
3829 return -EIO; 3867 return -EIO;
3830 } 3868 }
@@ -3878,10 +3916,10 @@ out:
3878 3916
3879#endif 3917#endif
3880 3918
3881static int ext4_get_sb(struct file_system_type *fs_type, 3919static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3882 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3920 const char *dev_name, void *data, struct vfsmount *mnt)
3883{ 3921{
3884 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3922 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3885} 3923}
3886 3924
3887static struct file_system_type ext4_fs_type = { 3925static struct file_system_type ext4_fs_type = {
@@ -3893,14 +3931,14 @@ static struct file_system_type ext4_fs_type = {
3893}; 3931};
3894 3932
3895#ifdef CONFIG_EXT4DEV_COMPAT 3933#ifdef CONFIG_EXT4DEV_COMPAT
3896static int ext4dev_get_sb(struct file_system_type *fs_type, 3934static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3897 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3935 const char *dev_name, void *data,struct vfsmount *mnt)
3898{ 3936{
3899 printk(KERN_WARNING "EXT4-fs: Update your userspace programs " 3937 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3900 "to mount using ext4\n"); 3938 "to mount using ext4\n", dev_name);
3901 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " 3939 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3902 "will go away by 2.6.31\n"); 3940 "will go away by 2.6.31\n", dev_name);
3903 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3941 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3904} 3942}
3905 3943
3906static struct file_system_type ext4dev_fs_type = { 3944static struct file_system_type ext4dev_fs_type = {
@@ -3917,13 +3955,16 @@ static int __init init_ext4_fs(void)
3917{ 3955{
3918 int err; 3956 int err;
3919 3957
3958 err = init_ext4_system_zone();
3959 if (err)
3960 return err;
3920 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 3961 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3921 if (!ext4_kset) 3962 if (!ext4_kset)
3922 return -ENOMEM; 3963 goto out4;
3923 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3964 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3924 err = init_ext4_mballoc(); 3965 err = init_ext4_mballoc();
3925 if (err) 3966 if (err)
3926 return err; 3967 goto out3;
3927 3968
3928 err = init_ext4_xattr(); 3969 err = init_ext4_xattr();
3929 if (err) 3970 if (err)
@@ -3948,6 +3989,11 @@ out1:
3948 exit_ext4_xattr(); 3989 exit_ext4_xattr();
3949out2: 3990out2:
3950 exit_ext4_mballoc(); 3991 exit_ext4_mballoc();
3992out3:
3993 remove_proc_entry("fs/ext4", NULL);
3994 kset_unregister(ext4_kset);
3995out4:
3996 exit_ext4_system_zone();
3951 return err; 3997 return err;
3952} 3998}
3953 3999
@@ -3962,6 +4008,7 @@ static void __exit exit_ext4_fs(void)
3962 exit_ext4_mballoc(); 4008 exit_ext4_mballoc();
3963 remove_proc_entry("fs/ext4", NULL); 4009 remove_proc_entry("fs/ext4", NULL);
3964 kset_unregister(ext4_kset); 4010 kset_unregister(ext4_kset);
4011 exit_ext4_system_zone();
3965} 4012}
3966 4013
3967MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4014MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
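
Nearly every ext4 hunk above performs the same conversion: a bare printk() that interpolated sb->s_id by hand becomes an ext4_msg() call, which is why the converted format strings drop both the "EXT4-fs: " prefix and the trailing \n. The helper itself is defined in another part of this patch; a plausible minimal sketch, assuming it only prepends the log level, subsystem, and device name:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <stdarg.h>

/* Sketch only - the real definition lives in another hunk of this
 * patch. KERN_ERR and friends are plain string prefixes, so they can
 * be forwarded verbatim as "prefix".
 */
void ext4_msg(struct super_block *sb, const char *prefix,
	      const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
	vprintk(fmt, args);	/* caller supplies no trailing newline */
	printk("\n");
	va_end(args);
}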
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index b42602298087..923990e4f16e 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -241,7 +241,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
241 while (*fclus < cluster) { 241 while (*fclus < cluster) {
242 /* prevent the infinite loop of cluster chain */ 242 /* prevent the infinite loop of cluster chain */
243 if (*fclus > limit) { 243 if (*fclus > limit) {
244 fat_fs_panic(sb, "%s: detected the cluster chain loop" 244 fat_fs_error(sb, "%s: detected the cluster chain loop"
245 " (i_pos %lld)", __func__, 245 " (i_pos %lld)", __func__,
246 MSDOS_I(inode)->i_pos); 246 MSDOS_I(inode)->i_pos);
247 nr = -EIO; 247 nr = -EIO;
@@ -252,7 +252,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
252 if (nr < 0) 252 if (nr < 0)
253 goto out; 253 goto out;
254 else if (nr == FAT_ENT_FREE) { 254 else if (nr == FAT_ENT_FREE) {
255 fat_fs_panic(sb, "%s: invalid cluster chain" 255 fat_fs_error(sb, "%s: invalid cluster chain"
256 " (i_pos %lld)", __func__, 256 " (i_pos %lld)", __func__,
257 MSDOS_I(inode)->i_pos); 257 MSDOS_I(inode)->i_pos);
258 nr = -EIO; 258 nr = -EIO;
@@ -285,7 +285,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
285 if (ret < 0) 285 if (ret < 0)
286 return ret; 286 return ret;
287 else if (ret == FAT_ENT_EOF) { 287 else if (ret == FAT_ENT_EOF) {
288 fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)", 288 fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)",
289 __func__, MSDOS_I(inode)->i_pos); 289 __func__, MSDOS_I(inode)->i_pos);
290 return -EIO; 290 return -EIO;
291 } 291 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3a7f603b6982..530b4ca01510 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,12 +16,24 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/smp_lock.h>
20#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
21#include <linux/compat.h> 20#include <linux/compat.h>
22#include <asm/uaccess.h> 21#include <asm/uaccess.h>
23#include "fat.h" 22#include "fat.h"
24 23
24/*
25 * Maximum buffer size of short name.
26 * [(MSDOS_NAME + '.') * max one char + nul]
27 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
28 */
29#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
30/*
31 * Maximum buffer size of unicode chars from slots.
32 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
33 */
34#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
35#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
36
25static inline loff_t fat_make_i_pos(struct super_block *sb, 37static inline loff_t fat_make_i_pos(struct super_block *sb,
26 struct buffer_head *bh, 38 struct buffer_head *bh,
27 struct msdos_dir_entry *de) 39 struct msdos_dir_entry *de)
@@ -171,7 +183,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
171 unsigned char *buf, int size) 183 unsigned char *buf, int size)
172{ 184{
173 if (sbi->options.utf8) 185 if (sbi->options.utf8)
174 return utf8_wcstombs(buf, uni, size); 186 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
187 UTF16_HOST_ENDIAN, buf, size);
175 else 188 else
176 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, 189 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
177 sbi->nls_io); 190 sbi->nls_io);
@@ -325,19 +338,6 @@ parse_long:
325} 338}
326 339
327/* 340/*
328 * Maximum buffer size of short name.
329 * [(MSDOS_NAME + '.') * max one char + nul]
330 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
331 */
332#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
333/*
334 * Maximum buffer size of unicode chars from slots.
335 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
336 */
337#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
338#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
339
340/*
341 * Return values: negative -> error, 0 -> not found, positive -> found, 341 * Return values: negative -> error, 0 -> not found, positive -> found,
342 * value is the total amount of slots, including the shortname entry. 342 * value is the total amount of slots, including the shortname entry.
343 */ 343 */
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = {
840#ifdef CONFIG_COMPAT 840#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = fat_file_fsync,
844}; 844};
845 845
846static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
967 de++; 967 de++;
968 nr_slots--; 968 nr_slots--;
969 } 969 }
970 mark_buffer_dirty(bh); 970 mark_buffer_dirty_inode(bh, dir);
971 if (IS_DIRSYNC(dir)) 971 if (IS_DIRSYNC(dir))
972 err = sync_dirty_buffer(bh); 972 err = sync_dirty_buffer(bh);
973 brelse(bh); 973 brelse(bh);
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
1001 de--; 1001 de--;
1002 nr_slots--; 1002 nr_slots--;
1003 } 1003 }
1004 mark_buffer_dirty(bh); 1004 mark_buffer_dirty_inode(bh, dir);
1005 if (IS_DIRSYNC(dir)) 1005 if (IS_DIRSYNC(dir))
1006 err = sync_dirty_buffer(bh); 1006 err = sync_dirty_buffer(bh);
1007 brelse(bh); 1007 brelse(bh);
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
1051 } 1051 }
1052 memset(bhs[n]->b_data, 0, sb->s_blocksize); 1052 memset(bhs[n]->b_data, 0, sb->s_blocksize);
1053 set_buffer_uptodate(bhs[n]); 1053 set_buffer_uptodate(bhs[n]);
1054 mark_buffer_dirty(bhs[n]); 1054 mark_buffer_dirty_inode(bhs[n], dir);
1055 1055
1056 n++; 1056 n++;
1057 blknr++; 1057 blknr++;
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
1131 de[0].size = de[1].size = 0; 1131 de[0].size = de[1].size = 0;
1132 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); 1132 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
1133 set_buffer_uptodate(bhs[0]); 1133 set_buffer_uptodate(bhs[0]);
1134 mark_buffer_dirty(bhs[0]); 1134 mark_buffer_dirty_inode(bhs[0], dir);
1135 1135
1136 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); 1136 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
1137 if (err) 1137 if (err)
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
1193 slots += copy; 1193 slots += copy;
1194 size -= copy; 1194 size -= copy;
1195 set_buffer_uptodate(bhs[n]); 1195 set_buffer_uptodate(bhs[n]);
1196 mark_buffer_dirty(bhs[n]); 1196 mark_buffer_dirty_inode(bhs[n], dir);
1197 if (!size) 1197 if (!size)
1198 break; 1198 break;
1199 n++; 1199 n++;
@@ -1293,7 +1293,7 @@ found:
1293 for (i = 0; i < long_bhs; i++) { 1293 for (i = 0; i < long_bhs; i++) {
1294 int copy = min_t(int, sb->s_blocksize - offset, size); 1294 int copy = min_t(int, sb->s_blocksize - offset, size);
1295 memcpy(bhs[i]->b_data + offset, slots, copy); 1295 memcpy(bhs[i]->b_data + offset, slots, copy);
1296 mark_buffer_dirty(bhs[i]); 1296 mark_buffer_dirty_inode(bhs[i], dir);
1297 offset = 0; 1297 offset = 0;
1298 slots += copy; 1298 slots += copy;
1299 size -= copy; 1299 size -= copy;
@@ -1304,7 +1304,7 @@ found:
1304 /* Fill the short name slot. */ 1304 /* Fill the short name slot. */
1305 int copy = min_t(int, sb->s_blocksize - offset, size); 1305 int copy = min_t(int, sb->s_blocksize - offset, size);
1306 memcpy(bhs[i]->b_data + offset, slots, copy); 1306 memcpy(bhs[i]->b_data + offset, slots, copy);
1307 mark_buffer_dirty(bhs[i]); 1307 mark_buffer_dirty_inode(bhs[i], dir);
1308 if (IS_DIRSYNC(dir)) 1308 if (IS_DIRSYNC(dir))
1309 err = sync_dirty_buffer(bhs[i]); 1309 err = sync_dirty_buffer(bhs[i]);
1310 } 1310 }
@@ -1334,7 +1334,7 @@ found:
1334 goto error_remove; 1334 goto error_remove;
1335 } 1335 }
1336 if (dir->i_size & (sbi->cluster_size - 1)) { 1336 if (dir->i_size & (sbi->cluster_size - 1)) {
1337 fat_fs_panic(sb, "Odd directory size"); 1337 fat_fs_error(sb, "Odd directory size");
1338 dir->i_size = (dir->i_size + sbi->cluster_size - 1) 1338 dir->i_size = (dir->i_size + sbi->cluster_size - 1)
1339 & ~((loff_t)sbi->cluster_size - 1); 1339 & ~((loff_t)sbi->cluster_size - 1);
1340 } 1340 }
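
Hoisting the FAT_MAX_SHORT_SIZE/FAT_MAX_UNI_CHARS definitions above fat_uni_to_x8() lets the new utf16s_to_utf8s() call be bounded by the slot-derived character count. For a concrete feel for the numbers, here is the arithmetic spelled out, assuming the usual values MSDOS_NAME = 11, MSDOS_SLOTS = 21 and NLS_MAX_CHARSET_SIZE = 6 (quoted from memory, so treat them as assumptions):

/* Illustrative only; constant values assumed from the msdos_fs.h and
 * nls.h of this kernel generation. */
#define MSDOS_NAME		11	/* 8.3 name, dot not stored */
#define MSDOS_SLOTS		21	/* long-name slots + short-name entry */
#define NLS_MAX_CHARSET_SIZE	6

#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)

int main(void)
{
	/* (11 + 1) * 6 + 1 = 73 bytes for the encoded short name */
	/* (21 - 1) * 13 + 1 = 261 UTF-16 code units from the slots */
	return !(FAT_MAX_SHORT_SIZE == 73 && FAT_MAX_UNI_CHARS == 261);
}
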
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ea440d65819c..adb0e72a176d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -17,6 +17,10 @@
17#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ 17#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */
18#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ 18#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */
19 19
20#define FAT_ERRORS_CONT 1 /* ignore error and continue */
21#define FAT_ERRORS_PANIC 2 /* panic on error */
22#define FAT_ERRORS_RO 3 /* remount r/o on error */
23
20struct fat_mount_options { 24struct fat_mount_options {
21 uid_t fs_uid; 25 uid_t fs_uid;
22 gid_t fs_gid; 26 gid_t fs_gid;
@@ -26,6 +30,7 @@ struct fat_mount_options {
26 char *iocharset; /* Charset used for filename input/display */ 30 char *iocharset; /* Charset used for filename input/display */
27 unsigned short shortname; /* flags for shortname display/create rule */ 31 unsigned short shortname; /* flags for shortname display/create rule */
28 unsigned char name_check; /* r = relaxed, n = normal, s = strict */ 32 unsigned char name_check; /* r = relaxed, n = normal, s = strict */
33 unsigned char errors; /* On error: continue, panic, remount-ro */
29 unsigned short allow_utime;/* permission for setting the [am]time */ 34 unsigned short allow_utime;/* permission for setting the [am]time */
30 unsigned quiet:1, /* set = fake successful chmods and chowns */ 35 unsigned quiet:1, /* set = fake successful chmods and chowns */
31 showexec:1, /* set = only set x bit for com/exe/bat */ 36 showexec:1, /* set = only set x bit for com/exe/bat */
@@ -74,6 +79,7 @@ struct msdos_sb_info {
74 79
75 int fatent_shift; 80 int fatent_shift;
76 struct fatent_operations *fatent_ops; 81 struct fatent_operations *fatent_ops;
82 struct inode *fat_inode;
77 83
78 spinlock_t inode_hash_lock; 84 spinlock_t inode_hash_lock;
79 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 85 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
@@ -251,6 +257,7 @@ struct fat_entry {
251 } u; 257 } u;
252 int nr_bhs; 258 int nr_bhs;
253 struct buffer_head *bhs[2]; 259 struct buffer_head *bhs[2];
260 struct inode *fat_inode;
254}; 261};
255 262
256static inline void fatent_init(struct fat_entry *fatent) 263static inline void fatent_init(struct fat_entry *fatent)
@@ -259,6 +266,7 @@ static inline void fatent_init(struct fat_entry *fatent)
259 fatent->entry = 0; 266 fatent->entry = 0;
260 fatent->u.ent32_p = NULL; 267 fatent->u.ent32_p = NULL;
261 fatent->bhs[0] = fatent->bhs[1] = NULL; 268 fatent->bhs[0] = fatent->bhs[1] = NULL;
269 fatent->fat_inode = NULL;
262} 270}
263 271
264static inline void fatent_set_entry(struct fat_entry *fatent, int entry) 272static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
@@ -275,6 +283,7 @@ static inline void fatent_brelse(struct fat_entry *fatent)
275 brelse(fatent->bhs[i]); 283 brelse(fatent->bhs[i]);
276 fatent->nr_bhs = 0; 284 fatent->nr_bhs = 0;
277 fatent->bhs[0] = fatent->bhs[1] = NULL; 285 fatent->bhs[0] = fatent->bhs[1] = NULL;
286 fatent->fat_inode = NULL;
278} 287}
279 288
280extern void fat_ent_access_init(struct super_block *sb); 289extern void fat_ent_access_init(struct super_block *sb);
@@ -296,6 +305,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
296extern void fat_truncate(struct inode *inode); 305extern void fat_truncate(struct inode *inode);
297extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 306extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
298 struct kstat *stat); 307 struct kstat *stat);
308extern int fat_file_fsync(struct file *file, struct dentry *dentry,
309 int datasync);
299 310
300/* fat/inode.c */ 311/* fat/inode.c */
301extern void fat_attach(struct inode *inode, loff_t i_pos); 312extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -310,7 +321,7 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
310extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 321extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
311 struct inode *i2); 322 struct inode *i2);
312/* fat/misc.c */ 323/* fat/misc.c */
313extern void fat_fs_panic(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
314 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
315extern void fat_clusters_flush(struct super_block *sb); 326extern void fat_clusters_flush(struct super_block *sb);
316extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index da6eea47872f..a81037721a6f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
73 struct buffer_head **bhs = fatent->bhs; 73 struct buffer_head **bhs = fatent->bhs;
74 74
75 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); 75 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
76 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
77
76 bhs[0] = sb_bread(sb, blocknr); 78 bhs[0] = sb_bread(sb, blocknr);
77 if (!bhs[0]) 79 if (!bhs[0])
78 goto err; 80 goto err;
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
103 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; 105 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
104 106
105 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); 107 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
106 fatent->bhs[0] = sb_bread(sb, blocknr); 109 fatent->bhs[0] = sb_bread(sb, blocknr);
107 if (!fatent->bhs[0]) { 110 if (!fatent->bhs[0]) {
108 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", 111 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
167 } 170 }
168 spin_unlock(&fat12_entry_lock); 171 spin_unlock(&fat12_entry_lock);
169 172
170 mark_buffer_dirty(fatent->bhs[0]); 173 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
171 if (fatent->nr_bhs == 2) 174 if (fatent->nr_bhs == 2)
172 mark_buffer_dirty(fatent->bhs[1]); 175 mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
173} 176}
174 177
175static void fat16_ent_put(struct fat_entry *fatent, int new) 178static void fat16_ent_put(struct fat_entry *fatent, int new)
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
178 new = EOF_FAT16; 181 new = EOF_FAT16;
179 182
180 *fatent->u.ent16_p = cpu_to_le16(new); 183 *fatent->u.ent16_p = cpu_to_le16(new);
181 mark_buffer_dirty(fatent->bhs[0]); 184 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
182} 185}
183 186
184static void fat32_ent_put(struct fat_entry *fatent, int new) 187static void fat32_ent_put(struct fat_entry *fatent, int new)
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new)
189 WARN_ON(new & 0xf0000000); 192 WARN_ON(new & 0xf0000000);
190 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; 193 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
191 *fatent->u.ent32_p = cpu_to_le32(new); 194 *fatent->u.ent32_p = cpu_to_le32(new);
192 mark_buffer_dirty(fatent->bhs[0]); 195 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
193} 196}
194 197
195static int fat12_ent_next(struct fat_entry *fatent) 198static int fat12_ent_next(struct fat_entry *fatent)
@@ -345,7 +348,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
345 348
346 if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { 349 if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
347 fatent_brelse(fatent); 350 fatent_brelse(fatent);
348 fat_fs_panic(sb, "invalid access to FAT (entry 0x%08x)", entry); 351 fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
349 return -EIO; 352 return -EIO;
350 } 353 }
351 354
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
381 } 384 }
382 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); 385 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
383 set_buffer_uptodate(c_bh); 386 set_buffer_uptodate(c_bh);
384 mark_buffer_dirty(c_bh); 387 mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
385 if (sb->s_flags & MS_SYNCHRONOUS) 388 if (sb->s_flags & MS_SYNCHRONOUS)
386 err = sync_dirty_buffer(c_bh); 389 err = sync_dirty_buffer(c_bh);
387 brelse(c_bh); 390 brelse(c_bh);
@@ -557,7 +560,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
557 err = cluster; 560 err = cluster;
558 goto error; 561 goto error;
559 } else if (cluster == FAT_ENT_FREE) { 562 } else if (cluster == FAT_ENT_FREE) {
560 fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF", 563 fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
561 __func__); 564 __func__);
562 err = -EIO; 565 err = -EIO;
563 goto error; 566 goto error;
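
The mark_buffer_dirty() → mark_buffer_dirty_inode() conversions here (and in the dir.c and namei hunks) all chase the same goal: dirty FAT-block buffers are associated with the new per-superblock sbi->fat_inode, carried along in struct fat_entry, so a later fsync can flush exactly those buffers through that inode's mapping instead of syncing the whole block device. A toy model of the association — simplified types and hypothetical names standing in for mark_buffer_dirty_inode()/sync_mapping_buffers():

#include <stddef.h>

/* Toy model: a "mapping" collects the dirty buffers attached to one
 * inode, so flushing that one mapping writes exactly those buffers. */
struct buffer  { struct buffer *next; int dirty; };
struct mapping { struct buffer *dirty_list; };

static void dirty_buffer_for_inode(struct buffer *bh, struct mapping *m)
{
	bh->dirty = 1;
	bh->next = m->dirty_list;	/* attach to the owning inode's mapping */
	m->dirty_list = bh;
}

static void flush_mapping(struct mapping *m)
{
	struct buffer *bh;

	for (bh = m->dirty_list; bh; bh = bh->next)
		bh->dirty = 0;		/* stand-in for real block I/O */
	m->dirty_list = NULL;
}

int main(void)
{
	struct mapping m = { 0 };
	struct buffer  b = { 0, 0 };

	dirty_buffer_for_inode(&b, &m);
	flush_mapping(&m);
	return b.dirty;			/* 0: the buffer was flushed */
}
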
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 0a7f4a9918b3..f042b965c95c 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -18,106 +18,112 @@
18#include <linux/security.h> 18#include <linux/security.h>
19#include "fat.h" 19#include "fat.h"
20 20
21int fat_generic_ioctl(struct inode *inode, struct file *filp, 21static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
22 unsigned int cmd, unsigned long arg) 22{
23 u32 attr;
24
25 mutex_lock(&inode->i_mutex);
26 attr = fat_make_attrs(inode);
27 mutex_unlock(&inode->i_mutex);
28
29 return put_user(attr, user_attr);
30}
31
32static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
23{ 33{
34 struct inode *inode = file->f_path.dentry->d_inode;
24 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 35 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
25 u32 __user *user_attr = (u32 __user *)arg; 36 int is_dir = S_ISDIR(inode->i_mode);
37 u32 attr, oldattr;
38 struct iattr ia;
39 int err;
26 40
27 switch (cmd) { 41 err = get_user(attr, user_attr);
28 case FAT_IOCTL_GET_ATTRIBUTES: 42 if (err)
29 { 43 goto out;
30 u32 attr;
31 44
32 mutex_lock(&inode->i_mutex); 45 mutex_lock(&inode->i_mutex);
33 attr = fat_make_attrs(inode); 46 err = mnt_want_write(file->f_path.mnt);
34 mutex_unlock(&inode->i_mutex); 47 if (err)
48 goto out_unlock_inode;
35 49
36 return put_user(attr, user_attr); 50 /*
51 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
52 * prevents the user from turning us into a VFAT
53 * longname entry. Also, we obviously can't set
54 * any of the NTFS attributes in the high 24 bits.
55 */
56 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
57 /* Merge in ATTR_VOLUME and ATTR_DIR */
58 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
59 (is_dir ? ATTR_DIR : 0);
60 oldattr = fat_make_attrs(inode);
61
62 /* Equivalent to a chmod() */
63 ia.ia_valid = ATTR_MODE | ATTR_CTIME;
64 ia.ia_ctime = current_fs_time(inode->i_sb);
65 if (is_dir)
66 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
67 else {
68 ia.ia_mode = fat_make_mode(sbi, attr,
69 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
37 } 70 }
38 case FAT_IOCTL_SET_ATTRIBUTES:
39 {
40 u32 attr, oldattr;
41 int err, is_dir = S_ISDIR(inode->i_mode);
42 struct iattr ia;
43 71
44 err = get_user(attr, user_attr); 72 /* The root directory has no attributes */
45 if (err) 73 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
46 return err; 74 err = -EINVAL;
75 goto out_drop_write;
76 }
47 77
48 mutex_lock(&inode->i_mutex); 78 if (sbi->options.sys_immutable &&
49 79 ((attr | oldattr) & ATTR_SYS) &&
50 err = mnt_want_write(filp->f_path.mnt); 80 !capable(CAP_LINUX_IMMUTABLE)) {
51 if (err) 81 err = -EPERM;
52 goto up_no_drop_write; 82 goto out_drop_write;
53 83 }
54 /*
55 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
56 * prevents the user from turning us into a VFAT
57 * longname entry. Also, we obviously can't set
58 * any of the NTFS attributes in the high 24 bits.
59 */
60 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
61 /* Merge in ATTR_VOLUME and ATTR_DIR */
62 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
63 (is_dir ? ATTR_DIR : 0);
64 oldattr = fat_make_attrs(inode);
65
66 /* Equivalent to a chmod() */
67 ia.ia_valid = ATTR_MODE | ATTR_CTIME;
68 ia.ia_ctime = current_fs_time(inode->i_sb);
69 if (is_dir)
70 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
71 else {
72 ia.ia_mode = fat_make_mode(sbi, attr,
73 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
74 }
75 84
76 /* The root directory has no attributes */ 85 /*
77 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 86 * The security check is questionable... We single
78 err = -EINVAL; 87 * out the RO attribute for checking by the security
79 goto up; 88 * module, just because it maps to a file mode.
80 } 89 */
90 err = security_inode_setattr(file->f_path.dentry, &ia);
91 if (err)
92 goto out_drop_write;
81 93
82 if (sbi->options.sys_immutable) { 94 /* This MUST be done before doing anything irreversible... */
83 if ((attr | oldattr) & ATTR_SYS) { 95 err = fat_setattr(file->f_path.dentry, &ia);
84 if (!capable(CAP_LINUX_IMMUTABLE)) { 96 if (err)
85 err = -EPERM; 97 goto out_drop_write;
86 goto up; 98
87 } 99 fsnotify_change(file->f_path.dentry, ia.ia_valid);
88 } 100 if (sbi->options.sys_immutable) {
89 } 101 if (attr & ATTR_SYS)
102 inode->i_flags |= S_IMMUTABLE;
103 else
 104 inode->i_flags &= ~S_IMMUTABLE;
105 }
90 106
91 /* 107 fat_save_attrs(inode, attr);
92 * The security check is questionable... We single 108 mark_inode_dirty(inode);
93 * out the RO attribute for checking by the security 109out_drop_write:
94 * module, just because it maps to a file mode. 110 mnt_drop_write(file->f_path.mnt);
95 */ 111out_unlock_inode:
96 err = security_inode_setattr(filp->f_path.dentry, &ia); 112 mutex_unlock(&inode->i_mutex);
97 if (err) 113out:
98 goto up; 114 return err;
99 115}
100 /* This MUST be done before doing anything irreversible... */
101 err = fat_setattr(filp->f_path.dentry, &ia);
102 if (err)
103 goto up;
104
105 fsnotify_change(filp->f_path.dentry, ia.ia_valid);
106 if (sbi->options.sys_immutable) {
107 if (attr & ATTR_SYS)
108 inode->i_flags |= S_IMMUTABLE;
109 else
 110 inode->i_flags &= ~S_IMMUTABLE;
111 }
112 116
113 fat_save_attrs(inode, attr); 117int fat_generic_ioctl(struct inode *inode, struct file *filp,
114 mark_inode_dirty(inode); 118 unsigned int cmd, unsigned long arg)
115up: 119{
116 mnt_drop_write(filp->f_path.mnt); 120 u32 __user *user_attr = (u32 __user *)arg;
117up_no_drop_write: 121
118 mutex_unlock(&inode->i_mutex); 122 switch (cmd) {
119 return err; 123 case FAT_IOCTL_GET_ATTRIBUTES:
120 } 124 return fat_ioctl_get_attributes(inode, user_attr);
125 case FAT_IOCTL_SET_ATTRIBUTES:
126 return fat_ioctl_set_attributes(filp, user_attr);
121 default: 127 default:
122 return -ENOTTY; /* Inappropriate ioctl for device */ 128 return -ENOTTY; /* Inappropriate ioctl for device */
123 } 129 }
@@ -128,11 +134,23 @@ static int fat_file_release(struct inode *inode, struct file *filp)
128 if ((filp->f_mode & FMODE_WRITE) && 134 if ((filp->f_mode & FMODE_WRITE) &&
129 MSDOS_SB(inode->i_sb)->options.flush) { 135 MSDOS_SB(inode->i_sb)->options.flush) {
130 fat_flush_inodes(inode->i_sb, inode, NULL); 136 fat_flush_inodes(inode->i_sb, inode, NULL);
131 congestion_wait(WRITE, HZ/10); 137 congestion_wait(BLK_RW_ASYNC, HZ/10);
132 } 138 }
133 return 0; 139 return 0;
134} 140}
135 141
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
143{
144 struct inode *inode = dentry->d_inode;
145 int res, err;
146
147 res = simple_fsync(filp, dentry, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149
150 return res ? res : err;
151}
152
153
136const struct file_operations fat_file_operations = { 154const struct file_operations fat_file_operations = {
137 .llseek = generic_file_llseek, 155 .llseek = generic_file_llseek,
138 .read = do_sync_read, 156 .read = do_sync_read,
@@ -142,7 +160,7 @@ const struct file_operations fat_file_operations = {
142 .mmap = generic_file_mmap, 160 .mmap = generic_file_mmap,
143 .release = fat_file_release, 161 .release = fat_file_release,
144 .ioctl = fat_generic_ioctl, 162 .ioctl = fat_generic_ioctl,
145 .fsync = file_fsync, 163 .fsync = fat_file_fsync,
146 .splice_read = generic_file_splice_read, 164 .splice_read = generic_file_splice_read,
147}; 165};
148 166
@@ -213,7 +231,7 @@ static int fat_free(struct inode *inode, int skip)
213 fatent_brelse(&fatent); 231 fatent_brelse(&fatent);
214 return 0; 232 return 0;
215 } else if (ret == FAT_ENT_FREE) { 233 } else if (ret == FAT_ENT_FREE) {
216 fat_fs_panic(sb, 234 fat_fs_error(sb,
217 "%s: invalid cluster chain (i_pos %lld)", 235 "%s: invalid cluster chain (i_pos %lld)",
218 __func__, MSDOS_I(inode)->i_pos); 236 __func__, MSDOS_I(inode)->i_pos);
219 ret = -EIO; 237 ret = -EIO;
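
fat_file_fsync() above layers on top of simple_fsync(): after the inode's own data and metadata are written, it also flushes the buffers attached to the FAT inode's mapping, so the allocation-table blocks the file dirtied reach the disk as well. Note the "run both steps, report the first failure" shape of the return; a minimal sketch with hypothetical helper names:

/* Sketch of the layered-fsync shape; the helpers are hypothetical
 * stand-ins for simple_fsync() and sync_mapping_buffers(). */
static int sync_file_data(void)   { return 0; }	/* data + dir entry */
static int sync_fat_buffers(void) { return 0; }	/* FAT blocks via fat_inode */

static int layered_fsync(void)
{
	int res = sync_file_data();
	int err = sync_fat_buffers();

	/* Always attempt both syncs; the first failure wins. */
	return res ? res : err;
}

int main(void) { return layered_fsync(); }
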
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 296785a0dec8..8970d8c49bb0 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -76,7 +76,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
76 return 0; 76 return 0;
77 77
78 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { 78 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
79 fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", 79 fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
80 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); 80 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
81 return -EIO; 81 return -EIO;
82 } 82 }
@@ -441,16 +441,35 @@ static void fat_clear_inode(struct inode *inode)
441 441
442static void fat_write_super(struct super_block *sb) 442static void fat_write_super(struct super_block *sb)
443{ 443{
444 lock_super(sb);
444 sb->s_dirt = 0; 445 sb->s_dirt = 0;
445 446
446 if (!(sb->s_flags & MS_RDONLY)) 447 if (!(sb->s_flags & MS_RDONLY))
447 fat_clusters_flush(sb); 448 fat_clusters_flush(sb);
449 unlock_super(sb);
450}
451
452static int fat_sync_fs(struct super_block *sb, int wait)
453{
454 lock_super(sb);
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458
459 return 0;
448} 460}
449 461
450static void fat_put_super(struct super_block *sb) 462static void fat_put_super(struct super_block *sb)
451{ 463{
452 struct msdos_sb_info *sbi = MSDOS_SB(sb); 464 struct msdos_sb_info *sbi = MSDOS_SB(sb);
453 465
466 lock_kernel();
467
468 if (sb->s_dirt)
469 fat_write_super(sb);
470
471 iput(sbi->fat_inode);
472
454 if (sbi->nls_disk) { 473 if (sbi->nls_disk) {
455 unload_nls(sbi->nls_disk); 474 unload_nls(sbi->nls_disk);
456 sbi->nls_disk = NULL; 475 sbi->nls_disk = NULL;
@@ -467,6 +486,8 @@ static void fat_put_super(struct super_block *sb)
467 486
468 sb->s_fs_info = NULL; 487 sb->s_fs_info = NULL;
469 kfree(sbi); 488 kfree(sbi);
489
490 unlock_kernel();
470} 491}
471 492
472static struct kmem_cache *fat_inode_cachep; 493static struct kmem_cache *fat_inode_cachep;
@@ -632,6 +653,7 @@ static const struct super_operations fat_sops = {
632 .delete_inode = fat_delete_inode, 653 .delete_inode = fat_delete_inode,
633 .put_super = fat_put_super, 654 .put_super = fat_put_super,
634 .write_super = fat_write_super, 655 .write_super = fat_write_super,
656 .sync_fs = fat_sync_fs,
635 .statfs = fat_statfs, 657 .statfs = fat_statfs,
636 .clear_inode = fat_clear_inode, 658 .clear_inode = fat_clear_inode,
637 .remount_fs = fat_remount, 659 .remount_fs = fat_remount,
@@ -834,6 +856,12 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
834 seq_puts(m, ",flush"); 856 seq_puts(m, ",flush");
835 if (opts->tz_utc) 857 if (opts->tz_utc)
836 seq_puts(m, ",tz=UTC"); 858 seq_puts(m, ",tz=UTC");
859 if (opts->errors == FAT_ERRORS_CONT)
860 seq_puts(m, ",errors=continue");
861 else if (opts->errors == FAT_ERRORS_PANIC)
862 seq_puts(m, ",errors=panic");
863 else
864 seq_puts(m, ",errors=remount-ro");
837 865
838 return 0; 866 return 0;
839} 867}
@@ -846,7 +874,8 @@ enum {
846 Opt_charset, Opt_shortname_lower, Opt_shortname_win95, 874 Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
847 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 875 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
848 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 876 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
849 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err, 877 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
878 Opt_err_panic, Opt_err_ro, Opt_err,
850}; 879};
851 880
852static const match_table_t fat_tokens = { 881static const match_table_t fat_tokens = {
@@ -869,6 +898,11 @@ static const match_table_t fat_tokens = {
869 {Opt_showexec, "showexec"}, 898 {Opt_showexec, "showexec"},
870 {Opt_debug, "debug"}, 899 {Opt_debug, "debug"},
871 {Opt_immutable, "sys_immutable"}, 900 {Opt_immutable, "sys_immutable"},
901 {Opt_flush, "flush"},
902 {Opt_tz_utc, "tz=UTC"},
903 {Opt_err_cont, "errors=continue"},
904 {Opt_err_panic, "errors=panic"},
905 {Opt_err_ro, "errors=remount-ro"},
872 {Opt_obsolate, "conv=binary"}, 906 {Opt_obsolate, "conv=binary"},
873 {Opt_obsolate, "conv=text"}, 907 {Opt_obsolate, "conv=text"},
874 {Opt_obsolate, "conv=auto"}, 908 {Opt_obsolate, "conv=auto"},
@@ -880,8 +914,6 @@ static const match_table_t fat_tokens = {
880 {Opt_obsolate, "cvf_format=%20s"}, 914 {Opt_obsolate, "cvf_format=%20s"},
881 {Opt_obsolate, "cvf_options=%100s"}, 915 {Opt_obsolate, "cvf_options=%100s"},
882 {Opt_obsolate, "posix"}, 916 {Opt_obsolate, "posix"},
883 {Opt_flush, "flush"},
884 {Opt_tz_utc, "tz=UTC"},
885 {Opt_err, NULL}, 917 {Opt_err, NULL},
886}; 918};
887static const match_table_t msdos_tokens = { 919static const match_table_t msdos_tokens = {
@@ -934,7 +966,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
934 966
935 opts->fs_uid = current_uid(); 967 opts->fs_uid = current_uid();
936 opts->fs_gid = current_gid(); 968 opts->fs_gid = current_gid();
937 opts->fs_fmask = current_umask(); 969 opts->fs_fmask = opts->fs_dmask = current_umask();
938 opts->allow_utime = -1; 970 opts->allow_utime = -1;
939 opts->codepage = fat_default_codepage; 971 opts->codepage = fat_default_codepage;
940 opts->iocharset = fat_default_iocharset; 972 opts->iocharset = fat_default_iocharset;
@@ -951,6 +983,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
951 opts->numtail = 1; 983 opts->numtail = 1;
952 opts->usefree = opts->nocase = 0; 984 opts->usefree = opts->nocase = 0;
953 opts->tz_utc = 0; 985 opts->tz_utc = 0;
986 opts->errors = FAT_ERRORS_RO;
954 *debug = 0; 987 *debug = 0;
955 988
956 if (!options) 989 if (!options)
@@ -1043,6 +1076,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1043 case Opt_tz_utc: 1076 case Opt_tz_utc:
1044 opts->tz_utc = 1; 1077 opts->tz_utc = 1;
1045 break; 1078 break;
1079 case Opt_err_cont:
1080 opts->errors = FAT_ERRORS_CONT;
1081 break;
1082 case Opt_err_panic:
1083 opts->errors = FAT_ERRORS_PANIC;
1084 break;
1085 case Opt_err_ro:
1086 opts->errors = FAT_ERRORS_RO;
1087 break;
1046 1088
1047 /* msdos specific */ 1089 /* msdos specific */
1048 case Opt_dots: 1090 case Opt_dots:
@@ -1174,7 +1216,7 @@ static int fat_read_root(struct inode *inode)
1174int fat_fill_super(struct super_block *sb, void *data, int silent, 1216int fat_fill_super(struct super_block *sb, void *data, int silent,
1175 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1217 const struct inode_operations *fs_dir_inode_ops, int isvfat)
1176{ 1218{
1177 struct inode *root_inode = NULL; 1219 struct inode *root_inode = NULL, *fat_inode = NULL;
1178 struct buffer_head *bh; 1220 struct buffer_head *bh;
1179 struct fat_boot_sector *b; 1221 struct fat_boot_sector *b;
1180 struct msdos_sb_info *sbi; 1222 struct msdos_sb_info *sbi;
@@ -1414,6 +1456,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1414 } 1456 }
1415 1457
1416 error = -ENOMEM; 1458 error = -ENOMEM;
1459 fat_inode = new_inode(sb);
1460 if (!fat_inode)
1461 goto out_fail;
1462 MSDOS_I(fat_inode)->i_pos = 0;
1463 sbi->fat_inode = fat_inode;
1417 root_inode = new_inode(sb); 1464 root_inode = new_inode(sb);
1418 if (!root_inode) 1465 if (!root_inode)
1419 goto out_fail; 1466 goto out_fail;
@@ -1439,6 +1486,8 @@ out_invalid:
1439 " on dev %s.\n", sb->s_id); 1486 " on dev %s.\n", sb->s_id);
1440 1487
1441out_fail: 1488out_fail:
1489 if (fat_inode)
1490 iput(fat_inode);
1442 if (root_inode) 1491 if (root_inode)
1443 iput(root_inode); 1492 iput(root_inode);
1444 if (sbi->nls_io) 1493 if (sbi->nls_io)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index ac39ebcc1496..a6c20473dfd7 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -12,14 +12,19 @@
12#include "fat.h" 12#include "fat.h"
13 13
14/* 14/*
 15 * fat_fs_panic reports a severe file system problem and sets the file system 15 * fat_fs_error reports a file system problem that might indicate a data
 16 * read-only. The file system can be made writable again by remounting it. 16 * corruption/inconsistency. Depending on the 'errors' mount option,
 17 * panic() is called, or an error message is printed and nothing else is
 18 * done, or the filesystem is remounted read-only (default behavior).
 19 * In case the file system is remounted read-only, it can be made writable
 20 * again by remounting it.
17 */ 21 */
18void fat_fs_panic(struct super_block *s, const char *fmt, ...) 22void fat_fs_error(struct super_block *s, const char *fmt, ...)
19{ 23{
24 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
20 va_list args; 25 va_list args;
21 26
22 printk(KERN_ERR "FAT: Filesystem panic (dev %s)\n", s->s_id); 27 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
23 28
24 printk(KERN_ERR " "); 29 printk(KERN_ERR " ");
25 va_start(args, fmt); 30 va_start(args, fmt);
@@ -27,13 +32,14 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...)
27 va_end(args); 32 va_end(args);
28 printk("\n"); 33 printk("\n");
29 34
30 if (!(s->s_flags & MS_RDONLY)) { 35 if (opts->errors == FAT_ERRORS_PANIC)
36 panic(" FAT fs panic from previous error\n");
37 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
31 s->s_flags |= MS_RDONLY; 38 s->s_flags |= MS_RDONLY;
32 printk(KERN_ERR " File system has been set read-only\n"); 39 printk(KERN_ERR " File system has been set read-only\n");
33 } 40 }
34} 41}
35 42EXPORT_SYMBOL_GPL(fat_fs_error);
36EXPORT_SYMBOL_GPL(fat_fs_panic);
37 43
38/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
39/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
@@ -124,7 +130,7 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
124 mark_inode_dirty(inode); 130 mark_inode_dirty(inode);
125 } 131 }
126 if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { 132 if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
127 fat_fs_panic(sb, "clusters badly computed (%d != %llu)", 133 fat_fs_error(sb, "clusters badly computed (%d != %llu)",
128 new_fclus, 134 new_fclus,
129 (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); 135 (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
130 fat_cache_inval_inode(inode); 136 fat_cache_inval_inode(inode);
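
With this change fat_fs_error() becomes a three-way dispatch on the new mount option: errors=panic calls panic(), errors=continue only logs, and errors=remount-ro (the default) flips the superblock read-only. A compact userspace model of that dispatch — the constants and names merely mirror FAT_ERRORS_*, this is not kernel code:

#include <stdio.h>
#include <stdlib.h>

enum { ERRORS_CONT = 1, ERRORS_PANIC = 2, ERRORS_RO = 3 };

static int sb_rdonly;	/* models the MS_RDONLY superblock flag */

static void fs_error(int errors_mode, const char *msg)
{
	fprintf(stderr, "FAT: Filesystem error: %s\n", msg);

	if (errors_mode == ERRORS_PANIC)
		abort();			/* models panic() */
	else if (errors_mode == ERRORS_RO && !sb_rdonly) {
		sb_rdonly = 1;
		fprintf(stderr, "    File system has been set read-only\n");
	}
	/* ERRORS_CONT: log only, carry on */
}

int main(void)
{
	fs_error(ERRORS_RO, "invalid cluster chain");
	return sb_rdonly ? 0 : 1;
}
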
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index da3f361a37dd..bbc94ae4fd77 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/smp_lock.h>
13#include "fat.h" 12#include "fat.h"
14 13
15/* Characters that are undesirable in an MS-DOS file name */ 14/* Characters that are undesirable in an MS-DOS file name */
@@ -544,7 +543,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
544 int start = MSDOS_I(new_dir)->i_logstart; 543 int start = MSDOS_I(new_dir)->i_logstart;
545 dotdot_de->start = cpu_to_le16(start); 544 dotdot_de->start = cpu_to_le16(start);
546 dotdot_de->starthi = cpu_to_le16(start >> 16); 545 dotdot_de->starthi = cpu_to_le16(start >> 16);
547 mark_buffer_dirty(dotdot_bh); 546 mark_buffer_dirty_inode(dotdot_bh, old_inode);
548 if (IS_DIRSYNC(new_dir)) { 547 if (IS_DIRSYNC(new_dir)) {
549 err = sync_dirty_buffer(dotdot_bh); 548 err = sync_dirty_buffer(dotdot_bh);
550 if (err) 549 if (err)
@@ -586,7 +585,7 @@ error_dotdot:
586 int start = MSDOS_I(old_dir)->i_logstart; 585 int start = MSDOS_I(old_dir)->i_logstart;
587 dotdot_de->start = cpu_to_le16(start); 586 dotdot_de->start = cpu_to_le16(start);
588 dotdot_de->starthi = cpu_to_le16(start >> 16); 587 dotdot_de->starthi = cpu_to_le16(start >> 16);
589 mark_buffer_dirty(dotdot_bh); 588 mark_buffer_dirty_inode(dotdot_bh, old_inode);
590 corrupt |= sync_dirty_buffer(dotdot_bh); 589 corrupt |= sync_dirty_buffer(dotdot_bh);
591 } 590 }
592error_inode: 591error_inode:
@@ -608,7 +607,7 @@ error_inode:
608 sinfo.bh = NULL; 607 sinfo.bh = NULL;
609 } 608 }
610 if (corrupt < 0) { 609 if (corrupt < 0) {
611 fat_fs_panic(new_dir->i_sb, 610 fat_fs_error(new_dir->i_sb,
612 "%s: Filesystem corrupted (i_pos %lld)", 611 "%s: Filesystem corrupted (i_pos %lld)",
613 __func__, sinfo.i_pos); 612 __func__, sinfo.i_pos);
614 } 613 }
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a0e00e3a46e9..cb6e83557112 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -19,7 +19,6 @@
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/smp_lock.h>
23#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
24#include <linux/namei.h> 23#include <linux/namei.h>
25#include "fat.h" 24#include "fat.h"
@@ -502,11 +501,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 if (utf8) { 501 if (utf8) {
503 int name_len = strlen(name); 502 int name_len = strlen(name);
504 503
505 *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); 504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
506 505
507 /* 506 /*
508 * We stripped '.'s before and set len appropriately, 507 * We stripped '.'s before and set len appropriately,
509 * but utf8_mbstowcs doesn't care about len 508 * but utf8s_to_utf16s doesn't care about len
510 */ 509 */
511 *outlen -= (name_len - len); 510 *outlen -= (name_len - len);
512 511
@@ -965,7 +964,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
965 int start = MSDOS_I(new_dir)->i_logstart; 964 int start = MSDOS_I(new_dir)->i_logstart;
966 dotdot_de->start = cpu_to_le16(start); 965 dotdot_de->start = cpu_to_le16(start);
967 dotdot_de->starthi = cpu_to_le16(start >> 16); 966 dotdot_de->starthi = cpu_to_le16(start >> 16);
968 mark_buffer_dirty(dotdot_bh); 967 mark_buffer_dirty_inode(dotdot_bh, old_inode);
969 if (IS_DIRSYNC(new_dir)) { 968 if (IS_DIRSYNC(new_dir)) {
970 err = sync_dirty_buffer(dotdot_bh); 969 err = sync_dirty_buffer(dotdot_bh);
971 if (err) 970 if (err)
@@ -1009,7 +1008,7 @@ error_dotdot:
1009 int start = MSDOS_I(old_dir)->i_logstart; 1008 int start = MSDOS_I(old_dir)->i_logstart;
1010 dotdot_de->start = cpu_to_le16(start); 1009 dotdot_de->start = cpu_to_le16(start);
1011 dotdot_de->starthi = cpu_to_le16(start >> 16); 1010 dotdot_de->starthi = cpu_to_le16(start >> 16);
1012 mark_buffer_dirty(dotdot_bh); 1011 mark_buffer_dirty_inode(dotdot_bh, old_inode);
1013 corrupt |= sync_dirty_buffer(dotdot_bh); 1012 corrupt |= sync_dirty_buffer(dotdot_bh);
1014 } 1013 }
1015error_inode: 1014error_inode:
@@ -1030,7 +1029,7 @@ error_inode:
1030 sinfo.bh = NULL; 1029 sinfo.bh = NULL;
1031 } 1030 }
1032 if (corrupt < 0) { 1031 if (corrupt < 0) {
1033 fat_fs_panic(new_dir->i_sb, 1032 fat_fs_error(new_dir->i_sb,
1034 "%s: Filesystem corrupted (i_pos %lld)", 1033 "%s: Filesystem corrupted (i_pos %lld)",
1035 __func__, sinfo.i_pos); 1034 __func__, sinfo.i_pos);
1036 } 1035 }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1ad703150dee..ae413086db97 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -19,7 +19,6 @@
19#include <linux/signal.h> 19#include <linux/signal.h>
20#include <linux/rcupdate.h> 20#include <linux/rcupdate.h>
21#include <linux/pid_namespace.h> 21#include <linux/pid_namespace.h>
22#include <linux/smp_lock.h>
23 22
24#include <asm/poll.h> 23#include <asm/poll.h>
25#include <asm/siginfo.h> 24#include <asm/siginfo.h>
@@ -198,15 +197,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
198} 197}
199 198
200static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, 199static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
201 uid_t uid, uid_t euid, int force) 200 int force)
202{ 201{
203 write_lock_irq(&filp->f_owner.lock); 202 write_lock_irq(&filp->f_owner.lock);
204 if (force || !filp->f_owner.pid) { 203 if (force || !filp->f_owner.pid) {
205 put_pid(filp->f_owner.pid); 204 put_pid(filp->f_owner.pid);
206 filp->f_owner.pid = get_pid(pid); 205 filp->f_owner.pid = get_pid(pid);
207 filp->f_owner.pid_type = type; 206 filp->f_owner.pid_type = type;
208 filp->f_owner.uid = uid; 207
209 filp->f_owner.euid = euid; 208 if (pid) {
209 const struct cred *cred = current_cred();
210 filp->f_owner.uid = cred->uid;
211 filp->f_owner.euid = cred->euid;
212 }
210 } 213 }
211 write_unlock_irq(&filp->f_owner.lock); 214 write_unlock_irq(&filp->f_owner.lock);
212} 215}
@@ -214,14 +217,13 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
214int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, 217int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
215 int force) 218 int force)
216{ 219{
217 const struct cred *cred = current_cred();
218 int err; 220 int err;
219 221
220 err = security_file_set_fowner(filp); 222 err = security_file_set_fowner(filp);
221 if (err) 223 if (err)
222 return err; 224 return err;
223 225
224 f_modown(filp, pid, type, cred->uid, cred->euid, force); 226 f_modown(filp, pid, type, force);
225 return 0; 227 return 0;
226} 228}
227EXPORT_SYMBOL(__f_setown); 229EXPORT_SYMBOL(__f_setown);
@@ -247,7 +249,7 @@ EXPORT_SYMBOL(f_setown);
247 249
248void f_delown(struct file *filp) 250void f_delown(struct file *filp)
249{ 251{
250 f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1); 252 f_modown(filp, NULL, PIDTYPE_PID, 1);
251} 253}
252 254
253pid_t f_getown(struct file *filp) 255pid_t f_getown(struct file *filp)
@@ -425,14 +427,20 @@ static inline int sigio_perm(struct task_struct *p,
425} 427}
426 428
427static void send_sigio_to_task(struct task_struct *p, 429static void send_sigio_to_task(struct task_struct *p,
428 struct fown_struct *fown, 430 struct fown_struct *fown,
429 int fd, 431 int fd,
430 int reason) 432 int reason)
431{ 433{
432 if (!sigio_perm(p, fown, fown->signum)) 434 /*
435 * F_SETSIG can change ->signum lockless in parallel, make
436 * sure we read it once and use the same value throughout.
437 */
438 int signum = ACCESS_ONCE(fown->signum);
439
440 if (!sigio_perm(p, fown, signum))
433 return; 441 return;
434 442
435 switch (fown->signum) { 443 switch (signum) {
436 siginfo_t si; 444 siginfo_t si;
437 default: 445 default:
438 /* Queue a rt signal with the appropriate fd as its 446 /* Queue a rt signal with the appropriate fd as its
@@ -441,7 +449,7 @@ static void send_sigio_to_task(struct task_struct *p,
441 delivered even if we can't queue. Failure to 449 delivered even if we can't queue. Failure to
442 queue in this case _should_ be reported; we fall 450 queue in this case _should_ be reported; we fall
443 back to SIGIO in that case. --sct */ 451 back to SIGIO in that case. --sct */
444 si.si_signo = fown->signum; 452 si.si_signo = signum;
445 si.si_errno = 0; 453 si.si_errno = 0;
446 si.si_code = reason; 454 si.si_code = reason;
447 /* Make sure we are called with one of the POLL_* 455 /* Make sure we are called with one of the POLL_*
@@ -453,7 +461,7 @@ static void send_sigio_to_task(struct task_struct *p,
453 else 461 else
454 si.si_band = band_table[reason - POLL_IN]; 462 si.si_band = band_table[reason - POLL_IN];
455 si.si_fd = fd; 463 si.si_fd = fd;
456 if (!group_send_sig_info(fown->signum, &si, p)) 464 if (!group_send_sig_info(signum, &si, p))
457 break; 465 break;
458 /* fall-through: fall back on the old plain SIGIO signal */ 466 /* fall-through: fall back on the old plain SIGIO signal */
459 case 0: 467 case 0:
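
The send_sigio_to_task() change is the classic single-load idiom: ACCESS_ONCE() forces fown->signum to be read exactly once, so the permission check and the eventual delivery agree on one value even if F_SETSIG rewrites the field concurrently. In this kernel ACCESS_ONCE() is, to the best of my recollection, a volatile cast; a hedged userspace sketch of the same idea:

/* Sketch of the single-load idiom; the macro below approximates what
 * ACCESS_ONCE() expands to for an int in this kernel generation. */
#define READ_ONCE_INT(x) (*(volatile int *)&(x))

struct fown { int signum; };

static int deliver(struct fown *f)
{
	/* One load; every later use sees the same snapshot even if
	 * another thread changes f->signum in parallel. */
	int signum = READ_ONCE_INT(f->signum);

	if (signum < 0)		/* stand-in for the permission check */
		return -1;
	return signum;		/* stand-in for queueing the signal */
}

int main(void)
{
	struct fown f = { 29 };	/* arbitrary signal number */
	return deliver(&f) == 29 ? 0 : 1;
}
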
diff --git a/fs/file_table.c b/fs/file_table.c
index 54018fe48840..334ce39881f8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
214 */ 214 */
215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { 215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
216 file_take_write(file); 216 file_take_write(file);
217 error = mnt_want_write(mnt); 217 error = mnt_clone_write(mnt);
218 WARN_ON(error); 218 WARN_ON(error);
219 } 219 }
220 return error; 220 return error;
@@ -399,6 +399,44 @@ too_bad:
399 return 0; 399 return 0;
400} 400}
401 401
402/**
403 * mark_files_ro - mark all files read-only
404 * @sb: superblock in question
405 *
406 * All files are marked read-only. We don't care about pending
407 * delete files so this should be used in 'force' mode only.
408 */
409void mark_files_ro(struct super_block *sb)
410{
411 struct file *f;
412
413retry:
414 file_list_lock();
415 list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
416 struct vfsmount *mnt;
417 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
418 continue;
419 if (!file_count(f))
420 continue;
421 if (!(f->f_mode & FMODE_WRITE))
422 continue;
423 f->f_mode &= ~FMODE_WRITE;
424 if (file_check_writeable(f) != 0)
425 continue;
426 file_release_write(f);
427 mnt = mntget(f->f_path.mnt);
428 file_list_unlock();
429 /*
430 * This can sleep, so we can't hold
431 * the file_list_lock() spinlock.
432 */
433 mnt_drop_write(mnt);
434 mntput(mnt);
435 goto retry;
436 }
437 file_list_unlock();
438}
439
402void __init files_init(unsigned long mempages) 440void __init files_init(unsigned long mempages)
403{ 441{
404 int n; 442 int n;
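
mark_files_ro() above also shows the drop-lock-and-restart pattern: file_list_lock() is a spinlock, mnt_drop_write() can sleep, so the scan marks the file, drops the lock, does the sleeping work, and then restarts the whole walk because the list may have changed in the meantime. A condensed sketch of just that control flow, with hypothetical names:

/* Sketch: restartable scan that must drop a non-sleeping lock before
 * doing blocking work. All types and helpers here are hypothetical. */
struct item { struct item *next; int needs_work; };

static void lock_list(void)   { /* stands in for file_list_lock() */ }
static void unlock_list(void) { }
static void blocking_work(struct item *it) { (void)it; /* may sleep */ }

static void process_all(struct item *head)
{
retry:
	lock_list();
	for (struct item *it = head; it; it = it->next) {
		if (!it->needs_work)
			continue;
		it->needs_work = 0;	/* mark done before dropping the lock */
		unlock_list();
		blocking_work(it);	/* safe to sleep: lock not held */
		goto retry;		/* list may have changed; rescan */
	}
	unlock_list();
}

int main(void)
{
	struct item b = { 0, 1 }, a = { &b, 1 };

	process_all(&a);
	return a.needs_work | b.needs_work;	/* 0: both processed */
}
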
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1dacda831577..1e8af939b3e4 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,6 +38,7 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
41#include <linux/stat.h> 42#include <linux/stat.h>
42#include <linux/vfs.h> 43#include <linux/vfs.h>
43#include <linux/mount.h> 44#include <linux/mount.h>
@@ -80,12 +81,16 @@ vxfs_put_super(struct super_block *sbp)
80{ 81{
81 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 82 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
82 83
84 lock_kernel();
85
83 vxfs_put_fake_inode(infp->vsi_fship); 86 vxfs_put_fake_inode(infp->vsi_fship);
84 vxfs_put_fake_inode(infp->vsi_ilist); 87 vxfs_put_fake_inode(infp->vsi_ilist);
85 vxfs_put_fake_inode(infp->vsi_stilist); 88 vxfs_put_fake_inode(infp->vsi_stilist);
86 89
87 brelse(infp->vsi_bp); 90 brelse(infp->vsi_bp);
88 kfree(infp); 91 kfree(infp);
92
93 unlock_kernel();
89} 94}
90 95
91/** 96/**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff7dd53..c54226be5294 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi)
64 clear_bit(BDI_pdflush, &bdi->state); 64 clear_bit(BDI_pdflush, &bdi->state);
65} 65}
66 66
67static noinline void block_dump___mark_inode_dirty(struct inode *inode)
68{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
70 struct dentry *dentry;
71 const char *name = "?";
72
73 dentry = d_find_alias(inode);
74 if (dentry) {
75 spin_lock(&dentry->d_lock);
76 name = (const char *) dentry->d_name.name;
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87}
88
67/** 89/**
68 * __mark_inode_dirty - internal function 90 * __mark_inode_dirty - internal function
69 * @inode: inode to mark 91 * @inode: inode to mark
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
114 if ((inode->i_state & flags) == flags) 136 if ((inode->i_state & flags) == flags)
115 return; 137 return;
116 138
117 if (unlikely(block_dump)) { 139 if (unlikely(block_dump))
118 struct dentry *dentry = NULL; 140 block_dump___mark_inode_dirty(inode);
119 const char *name = "?";
120
121 if (!list_empty(&inode->i_dentry)) {
122 dentry = list_entry(inode->i_dentry.next,
123 struct dentry, d_alias);
124 if (dentry && dentry->d_name.name)
125 name = (const char *) dentry->d_name.name;
126 }
127
128 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
129 printk(KERN_DEBUG
130 "%s(%d): dirtied inode %lu (%s) on %s\n",
131 current->comm, task_pid_nr(current), inode->i_ino,
132 name, inode->i_sb->s_id);
133 }
134 141
135 spin_lock(&inode_lock); 142 spin_lock(&inode_lock);
136 if ((inode->i_state & flags) != flags) { 143 if ((inode->i_state & flags) != flags) {
@@ -271,7 +278,26 @@ int sb_has_dirty_inodes(struct super_block *sb)
271EXPORT_SYMBOL(sb_has_dirty_inodes); 278EXPORT_SYMBOL(sb_has_dirty_inodes);
272 279
273/* 280/*
274 * Write a single inode's dirty pages and inode data out to disk. 281 * Wait for writeback on an inode to complete.
282 */
283static void inode_wait_for_writeback(struct inode *inode)
284{
285 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
286 wait_queue_head_t *wqh;
287
288 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
289 do {
290 spin_unlock(&inode_lock);
291 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
292 spin_lock(&inode_lock);
293 } while (inode->i_state & I_SYNC);
294}
295
296/*
297 * Write out an inode's dirty pages. Called under inode_lock. Either the
298 * caller has ref on the inode (either via __iget or via syscall against an fd)
299 * or the inode has I_WILL_FREE set (via generic_forget_inode)
300 *
275 * If `wait' is set, wait on the writeout. 301 * If `wait' is set, wait on the writeout.
276 * 302 *
277 * The whole writeout design is quite complex and fragile. We want to avoid 303 * The whole writeout design is quite complex and fragile. We want to avoid
@@ -281,15 +307,39 @@ EXPORT_SYMBOL(sb_has_dirty_inodes);
281 * Called under inode_lock. 307 * Called under inode_lock.
282 */ 308 */
283static int 309static int
284__sync_single_inode(struct inode *inode, struct writeback_control *wbc) 310writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
285{ 311{
286 unsigned dirty;
287 struct address_space *mapping = inode->i_mapping; 312 struct address_space *mapping = inode->i_mapping;
288 int wait = wbc->sync_mode == WB_SYNC_ALL; 313 int wait = wbc->sync_mode == WB_SYNC_ALL;
314 unsigned dirty;
289 int ret; 315 int ret;
290 316
317 if (!atomic_read(&inode->i_count))
318 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
319 else
320 WARN_ON(inode->i_state & I_WILL_FREE);
321
322 if (inode->i_state & I_SYNC) {
323 /*
324 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that
326 * writeback can proceed with the other inodes on s_io.
327 *
328 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io.
330 */
331 if (!wait) {
332 requeue_io(inode);
333 return 0;
334 }
335
336 /*
337 * It's a data-integrity sync. We must wait.
338 */
339 inode_wait_for_writeback(inode);
340 }
341
291 BUG_ON(inode->i_state & I_SYNC); 342 BUG_ON(inode->i_state & I_SYNC);
292 WARN_ON(inode->i_state & I_NEW);
293 343
294 /* Set I_SYNC, reset I_DIRTY */ 344 /* Set I_SYNC, reset I_DIRTY */
295 dirty = inode->i_state & I_DIRTY; 345 dirty = inode->i_state & I_DIRTY;
@@ -314,9 +364,8 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
314 } 364 }
315 365
316 spin_lock(&inode_lock); 366 spin_lock(&inode_lock);
317 WARN_ON(inode->i_state & I_NEW);
318 inode->i_state &= ~I_SYNC; 367 inode->i_state &= ~I_SYNC;
319 if (!(inode->i_state & I_FREEING)) { 368 if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
320 if (!(inode->i_state & I_DIRTY) && 369 if (!(inode->i_state & I_DIRTY) &&
321 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 370 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
322 /* 371 /*
@@ -385,50 +434,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
385} 434}
386 435
387/* 436/*
388 * Write out an inode's dirty pages. Called under inode_lock. Either the
389 * caller has ref on the inode (either via __iget or via syscall against an fd)
390 * or the inode has I_WILL_FREE set (via generic_forget_inode)
391 */
392static int
393__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394{
395 wait_queue_head_t *wqh;
396
397 if (!atomic_read(&inode->i_count))
398 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
399 else
400 WARN_ON(inode->i_state & I_WILL_FREE);
401
402 if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
403 /*
404 * We're skipping this inode because it's locked, and we're not
405 * doing writeback-for-data-integrity. Move it to s_more_io so
406 * that writeback can proceed with the other inodes on s_io.
407 * We'll have another go at writing back this inode when we
408 * completed a full scan of s_io.
409 */
410 requeue_io(inode);
411 return 0;
412 }
413
414 /*
415 * It's a data-integrity sync. We must wait.
416 */
417 if (inode->i_state & I_SYNC) {
418 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
419
420 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
421 do {
422 spin_unlock(&inode_lock);
423 __wait_on_bit(wqh, &wq, inode_wait,
424 TASK_UNINTERRUPTIBLE);
425 spin_lock(&inode_lock);
426 } while (inode->i_state & I_SYNC);
427 }
428 return __sync_single_inode(inode, wbc);
429}
430
431/*
432 * Write out a superblock's list of dirty inodes. A wait will be performed 437 * Write out a superblock's list of dirty inodes. A wait will be performed
433 * upon no inodes, all inodes or the final one, depending upon sync_mode. 438 * upon no inodes, all inodes or the final one, depending upon sync_mode.
434 * 439 *
@@ -487,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
487 break; 492 break;
488 } 493 }
489 494
490 if (inode->i_state & I_NEW) { 495 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
491 requeue_io(inode); 496 requeue_io(inode);
492 continue; 497 continue;
493 } 498 }
@@ -518,10 +523,10 @@ void generic_sync_sb_inodes(struct super_block *sb,
518 if (current_is_pdflush() && !writeback_acquire(bdi)) 523 if (current_is_pdflush() && !writeback_acquire(bdi))
519 break; 524 break;
520 525
521 BUG_ON(inode->i_state & I_FREEING); 526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
522 __iget(inode); 527 __iget(inode);
523 pages_skipped = wbc->pages_skipped; 528 pages_skipped = wbc->pages_skipped;
524 __writeback_single_inode(inode, wbc); 529 writeback_single_inode(inode, wbc);
525 if (current_is_pdflush()) 530 if (current_is_pdflush())
526 writeback_release(bdi); 531 writeback_release(bdi);
527 if (wbc->pages_skipped != pages_skipped) { 532 if (wbc->pages_skipped != pages_skipped) {
@@ -679,55 +684,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 }
 
 /**
- * sync_inodes - writes all inodes to disk
- * @wait: wait for completion
- *
- * sync_inodes() goes through each super block's dirty inode list, writes the
- * inodes out, waits on the writeout and puts the inodes back on the normal
- * list.
- *
- * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
- * part of the sync functions is that the blockdev "superblock" is processed
- * last. This is because the write_inode() function of a typical fs will
- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
- * What we want to do is to perform all that dirtying first, and then write
- * back all those inode blocks via the blockdev mapping in one sweep. So the
- * additional (somewhat redundant) sync_blockdev() calls here are to make
- * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
- * outstanding dirty inodes, the writeback goes block-at-a-time within the
- * filesystem's write_inode(). This is extremely slow.
- */
-static void __sync_inodes(int wait)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root) {
-			sync_inodes_sb(sb, wait);
-			sync_blockdev(sb->s_bdev);
-		}
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
-void sync_inodes(int wait)
-{
-	__sync_inodes(0);
-
-	if (wait)
-		__sync_inodes(1);
-}
-
-/**
  * write_inode_now - write an inode to disk
  * @inode: inode to write to disk
  * @sync: whether the write should be synchronous or not
@@ -752,7 +708,7 @@ int write_inode_now(struct inode *inode, int sync)
 
 	might_sleep();
 	spin_lock(&inode_lock);
-	ret = __writeback_single_inode(inode, &wbc);
+	ret = writeback_single_inode(inode, &wbc);
 	spin_unlock(&inode_lock);
 	if (sync)
 		inode_sync_wait(inode);
@@ -776,7 +732,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	spin_lock(&inode_lock);
-	ret = __writeback_single_inode(inode, wbc);
+	ret = writeback_single_inode(inode, wbc);
 	spin_unlock(&inode_lock);
 	return ret;
 }
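With sync_inodes() gone, single-inode flushing funnels through writeback_single_inode() behind the two wrappers above. A minimal sketch of the calling convention, mirroring the writeback_control that write_inode_now() builds; the wrapper function here is hypothetical, not part of the patch:

	#include <linux/fs.h>
	#include <linux/writeback.h>

	/* Hedged sketch: flush one inode for data integrity, the way
	 * write_inode_now(inode, 1) does. */
	static int example_flush_inode(struct inode *inode)
	{
		struct writeback_control wbc = {
			.nr_to_write = LONG_MAX,
			.sync_mode   = WB_SYNC_ALL,	/* wait on I_SYNC */
			.range_start = 0,
			.range_end   = LLONG_MAX,
		};

		/* sync_inode() takes inode_lock and calls
		 * writeback_single_inode() */
		return sync_inode(inode, &wbc);
	}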
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 72437065f6ad..e95eeb445e58 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,5 +3,6 @@
 #
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
 
 fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 000000000000..de792dcf3274
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,610 @@
1/*
2 * CUSE: Character device in Userspace
3 *
4 * Copyright (C) 2008-2009 SUSE Linux Products GmbH
5 * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * CUSE enables character devices to be implemented from userland much
10 * like FUSE allows filesystems. On initialization /dev/cuse is
11 * created. By opening the file and replying to the CUSE_INIT request
12 * the userland CUSE server can create a character device. After that the
13 * operation is very similar to FUSE.
14 *
15 * A CUSE instance involves the following objects.
16 *
17 * cuse_conn : contains fuse_conn and serves as bonding structure
18 * channel : file handle connected to the userland CUSE server
19 * cdev : the implemented character device
20 * dev : generic device for cdev
21 *
22 * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
23 * devices, it's called 'channel' to reduce confusion.
24 *
25 * channel determines when the character device dies. When channel is
26 * closed, everything begins to destruct. The cuse_conn is taken off
27 * the lookup table preventing further access from cdev; cdev and
28 * generic device are removed and the base reference of cuse_conn is
29 * put.
30 *
31 * On each open, the matching cuse_conn is looked up and if found an
32 * additional reference is taken which is released when the file is
33 * closed.
34 */
35
36#include <linux/fuse.h>
37#include <linux/cdev.h>
38#include <linux/device.h>
39#include <linux/file.h>
40#include <linux/fs.h>
41#include <linux/kdev_t.h>
42#include <linux/kthread.h>
43#include <linux/list.h>
44#include <linux/magic.h>
45#include <linux/miscdevice.h>
46#include <linux/mutex.h>
47#include <linux/spinlock.h>
48#include <linux/stat.h>
49
50#include "fuse_i.h"
51
52#define CUSE_CONNTBL_LEN 64
53
54struct cuse_conn {
55 struct list_head list; /* linked on cuse_conntbl */
56 struct fuse_conn fc; /* fuse connection */
57 struct cdev *cdev; /* associated character device */
58 struct device *dev; /* device representing @cdev */
59
60 /* init parameters, set once during initialization */
61 bool unrestricted_ioctl;
62};
63
64static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */
65static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
66static struct class *cuse_class;
67
68static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
69{
70 return container_of(fc, struct cuse_conn, fc);
71}
72
73static struct list_head *cuse_conntbl_head(dev_t devt)
74{
75 return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
76}
77
78
79/**************************************************************************
80 * CUSE frontend operations
81 *
82 * These are file operations for the character device.
83 *
84 * On open, CUSE opens a file from the FUSE mnt and stores it to
85 * private_data of the open file. All other ops call FUSE ops on the
86 * FUSE file.
87 */
88
89static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
90 loff_t *ppos)
91{
92 loff_t pos = 0;
93
94 return fuse_direct_io(file, buf, count, &pos, 0);
95}
96
97static ssize_t cuse_write(struct file *file, const char __user *buf,
98 size_t count, loff_t *ppos)
99{
100 loff_t pos = 0;
101 /*
102 * No locking or generic_write_checks(), the server is
103 * responsible for locking and sanity checks.
104 */
105 return fuse_direct_io(file, buf, count, &pos, 1);
106}
107
108static int cuse_open(struct inode *inode, struct file *file)
109{
110 dev_t devt = inode->i_cdev->dev;
111 struct cuse_conn *cc = NULL, *pos;
112 int rc;
113
114 /* look up and get the connection */
115 spin_lock(&cuse_lock);
116 list_for_each_entry(pos, cuse_conntbl_head(devt), list)
117 if (pos->dev->devt == devt) {
118 fuse_conn_get(&pos->fc);
119 cc = pos;
120 break;
121 }
122 spin_unlock(&cuse_lock);
123
124 /* dead? */
125 if (!cc)
126 return -ENODEV;
127
128 /*
129 * Generic permission check is already done against the chrdev
130 * file, proceed to open.
131 */
132 rc = fuse_do_open(&cc->fc, 0, file, 0);
133 if (rc)
134 fuse_conn_put(&cc->fc);
135 return rc;
136}
137
138static int cuse_release(struct inode *inode, struct file *file)
139{
140 struct fuse_file *ff = file->private_data;
141 struct fuse_conn *fc = ff->fc;
142
143 fuse_sync_release(ff, file->f_flags);
144 fuse_conn_put(fc);
145
146 return 0;
147}
148
149static long cuse_file_ioctl(struct file *file, unsigned int cmd,
150 unsigned long arg)
151{
152 struct fuse_file *ff = file->private_data;
153 struct cuse_conn *cc = fc_to_cc(ff->fc);
154 unsigned int flags = 0;
155
156 if (cc->unrestricted_ioctl)
157 flags |= FUSE_IOCTL_UNRESTRICTED;
158
159 return fuse_do_ioctl(file, cmd, arg, flags);
160}
161
162static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
163 unsigned long arg)
164{
165 struct fuse_file *ff = file->private_data;
166 struct cuse_conn *cc = fc_to_cc(ff->fc);
167 unsigned int flags = FUSE_IOCTL_COMPAT;
168
169 if (cc->unrestricted_ioctl)
170 flags |= FUSE_IOCTL_UNRESTRICTED;
171
172 return fuse_do_ioctl(file, cmd, arg, flags);
173}
174
175static const struct file_operations cuse_frontend_fops = {
176 .owner = THIS_MODULE,
177 .read = cuse_read,
178 .write = cuse_write,
179 .open = cuse_open,
180 .release = cuse_release,
181 .unlocked_ioctl = cuse_file_ioctl,
182 .compat_ioctl = cuse_file_compat_ioctl,
183 .poll = fuse_file_poll,
184};
185
186
187/**************************************************************************
188 * CUSE channel initialization and destruction
189 */
190
191struct cuse_devinfo {
192 const char *name;
193};
194
195/**
196 * cuse_parse_one - parse one key=value pair
197 * @pp: i/o parameter for the current position
198 * @end: points to one past the end of the packed string
199 * @keyp: out parameter for key
200 * @valp: out parameter for value
201 *
202 * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
203 * at @end - 1. This function parses one pair and sets *@keyp to the
204 * start of the key and *@valp to the start of the value. Note that
205 * the original string is modified such that the key string is
206 * terminated with '\0'. *@pp is updated to point to the next string.
207 *
208 * RETURNS:
209 * 1 on successful parse, 0 on EOF, -errno on failure.
210 */
211static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
212{
213 char *p = *pp;
214 char *key, *val;
215
216 while (p < end && *p == '\0')
217 p++;
218 if (p == end)
219 return 0;
220
221 if (end[-1] != '\0') {
222 printk(KERN_ERR "CUSE: info not properly terminated\n");
223 return -EINVAL;
224 }
225
226 key = val = p;
227 p += strlen(p);
228
229 if (valp) {
230 strsep(&val, "=");
231 if (!val)
232 val = key + strlen(key);
233 key = strstrip(key);
234 val = strstrip(val);
235 } else
236 key = strstrip(key);
237
238 if (!strlen(key)) {
239 printk(KERN_ERR "CUSE: zero length info key specified\n");
240 return -EINVAL;
241 }
242
243 *pp = p;
244 *keyp = key;
245 if (valp)
246 *valp = val;
247
248 return 1;
249}
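/*
 * Illustrative example (not part of the original source): for the packed
 * buffer "DEVNAME=foo\0SUBSYSTEM=bar\0", the first call returns 1 with
 * *keyp == "DEVNAME" and *valp == "foo" (strsep() overwrites '=' with
 * '\0', so the buffer must stay writable), the second call yields
 * "SUBSYSTEM"/"bar", and the third returns 0 for EOF.
 */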
250
251/**
252 * cuse_parse_dev_info - parse device info
253 * @p: device info string
254 * @len: length of device info string
255 * @devinfo: out parameter for parsed device info
256 *
257 * Parse @p to extract device info and store it into @devinfo. String
258 * pointed to by @p is modified by parsing and @devinfo points into
259 * it, so @p shouldn't be freed while @devinfo is in use.
260 *
261 * RETURNS:
262 * 0 on success, -errno on failure.
263 */
264static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
265{
266 char *end = p + len;
267 char *key, *val;
268 int rc;
269
270 while (true) {
271 rc = cuse_parse_one(&p, end, &key, &val);
272 if (rc < 0)
273 return rc;
274 if (!rc)
275 break;
276 if (strcmp(key, "DEVNAME") == 0)
277 devinfo->name = val;
278 else
279 printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
280 key);
281 }
282
283 if (!devinfo->name || !strlen(devinfo->name)) {
284 printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
285 return -EINVAL;
286 }
287
288 return 0;
289}
290
291static void cuse_gendev_release(struct device *dev)
292{
293 kfree(dev);
294}
295
296/**
297 * cuse_process_init_reply - finish initializing CUSE channel
298 *
299 * This function creates the character device and sets up all the
300 * required data structures for it. Please read the comment at the
301 * top of this file for high level overview.
302 */
303static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
304{
305 struct cuse_conn *cc = fc_to_cc(fc);
306 struct cuse_init_out *arg = &req->misc.cuse_init_out;
307 struct page *page = req->pages[0];
308 struct cuse_devinfo devinfo = { };
309 struct device *dev;
310 struct cdev *cdev;
311 dev_t devt;
312 int rc;
313
314 if (req->out.h.error ||
315 arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
316 goto err;
317 }
318
319 fc->minor = arg->minor;
320 fc->max_read = max_t(unsigned, arg->max_read, 4096);
321 fc->max_write = max_t(unsigned, arg->max_write, 4096);
322
323 /* parse init reply */
324 cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
325
326 rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
327 &devinfo);
328 if (rc)
329 goto err;
330
331 /* determine and reserve devt */
332 devt = MKDEV(arg->dev_major, arg->dev_minor);
333 if (!MAJOR(devt))
334 rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
335 else
336 rc = register_chrdev_region(devt, 1, devinfo.name);
337 if (rc) {
338 printk(KERN_ERR "CUSE: failed to register chrdev region\n");
339 goto err;
340 }
341
342 /* devt determined, create device */
343 rc = -ENOMEM;
344 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
345 if (!dev)
346 goto err_region;
347
348 device_initialize(dev);
349 dev_set_uevent_suppress(dev, 1);
350 dev->class = cuse_class;
351 dev->devt = devt;
352 dev->release = cuse_gendev_release;
353 dev_set_drvdata(dev, cc);
354 dev_set_name(dev, "%s", devinfo.name);
355
356 rc = device_add(dev);
357 if (rc)
358 goto err_device;
359
360 /* register cdev */
361 rc = -ENOMEM;
362 cdev = cdev_alloc();
363 if (!cdev)
364 goto err_device;
365
366 cdev->owner = THIS_MODULE;
367 cdev->ops = &cuse_frontend_fops;
368
369 rc = cdev_add(cdev, devt, 1);
370 if (rc)
371 goto err_cdev;
372
373 cc->dev = dev;
374 cc->cdev = cdev;
375
376 /* make the device available */
377 spin_lock(&cuse_lock);
378 list_add(&cc->list, cuse_conntbl_head(devt));
379 spin_unlock(&cuse_lock);
380
381 /* announce device availability */
382 dev_set_uevent_suppress(dev, 0);
383 kobject_uevent(&dev->kobj, KOBJ_ADD);
384out:
385 __free_page(page);
386 return;
387
388err_cdev:
389 cdev_del(cdev);
390err_device:
391 put_device(dev);
392err_region:
393 unregister_chrdev_region(devt, 1);
394err:
395 fc->conn_error = 1;
396 goto out;
397}
398
399static int cuse_send_init(struct cuse_conn *cc)
400{
401 int rc;
402 struct fuse_req *req;
403 struct page *page;
404 struct fuse_conn *fc = &cc->fc;
405 struct cuse_init_in *arg;
406
407 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
408
409 req = fuse_get_req(fc);
410 if (IS_ERR(req)) {
411 rc = PTR_ERR(req);
412 goto err;
413 }
414
415 rc = -ENOMEM;
416 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
417 if (!page)
418 goto err_put_req;
419
420 arg = &req->misc.cuse_init_in;
421 arg->major = FUSE_KERNEL_VERSION;
422 arg->minor = FUSE_KERNEL_MINOR_VERSION;
423 arg->flags |= CUSE_UNRESTRICTED_IOCTL;
424 req->in.h.opcode = CUSE_INIT;
425 req->in.numargs = 1;
426 req->in.args[0].size = sizeof(struct cuse_init_in);
427 req->in.args[0].value = arg;
428 req->out.numargs = 2;
429 req->out.args[0].size = sizeof(struct cuse_init_out);
430 req->out.args[0].value = &req->misc.cuse_init_out;
431 req->out.args[1].size = CUSE_INIT_INFO_MAX;
432 req->out.argvar = 1;
433 req->out.argpages = 1;
434 req->pages[0] = page;
435 req->num_pages = 1;
436 req->end = cuse_process_init_reply;
437 fuse_request_send_background(fc, req);
438
439 return 0;
440
441err_put_req:
442 fuse_put_request(fc, req);
443err:
444 return rc;
445}
446
447static void cuse_fc_release(struct fuse_conn *fc)
448{
449 struct cuse_conn *cc = fc_to_cc(fc);
450 kfree(cc);
451}
452
453/**
454 * cuse_channel_open - open method for /dev/cuse
455 * @inode: inode for /dev/cuse
456 * @file: file struct being opened
457 *
458 * Userland CUSE server can create a CUSE device by opening /dev/cuse
459 * and replying to the initialization request the kernel sends. This
460 * function is responsible for handling CUSE device initialization.
461 * Because the fd opened by this function is used during
462 * initialization, this function only creates cuse_conn and sends
463 * init. The rest is delegated to a kthread.
464 *
465 * RETURNS:
466 * 0 on success, -errno on failure.
467 */
468static int cuse_channel_open(struct inode *inode, struct file *file)
469{
470 struct cuse_conn *cc;
471 int rc;
472
473 /* set up cuse_conn */
474 cc = kzalloc(sizeof(*cc), GFP_KERNEL);
475 if (!cc)
476 return -ENOMEM;
477
478 fuse_conn_init(&cc->fc);
479
480 INIT_LIST_HEAD(&cc->list);
481 cc->fc.release = cuse_fc_release;
482
483 cc->fc.connected = 1;
484 cc->fc.blocked = 0;
485 rc = cuse_send_init(cc);
486 if (rc) {
487 fuse_conn_put(&cc->fc);
488 return rc;
489 }
490 file->private_data = &cc->fc; /* channel owns base reference to cc */
491
492 return 0;
493}
494
495/**
496 * cuse_channel_release - release method for /dev/cuse
497 * @inode: inode for /dev/cuse
498 * @file: file struct being closed
499 *
500 * Disconnect the channel, deregister CUSE device and initiate
501 * destruction by putting the default reference.
502 *
503 * RETURNS:
504 * 0 on success, -errno on failure.
505 */
506static int cuse_channel_release(struct inode *inode, struct file *file)
507{
508 struct cuse_conn *cc = fc_to_cc(file->private_data);
509 int rc;
510
511 /* remove from the conntbl, no more access from this point on */
512 spin_lock(&cuse_lock);
513 list_del_init(&cc->list);
514 spin_unlock(&cuse_lock);
515
516 /* remove device */
517 if (cc->dev)
518 device_unregister(cc->dev);
519 if (cc->cdev) {
520 unregister_chrdev_region(cc->cdev->dev, 1);
521 cdev_del(cc->cdev);
522 }
523
524 /* kill connection and shutdown channel */
525 fuse_conn_kill(&cc->fc);
526 rc = fuse_dev_release(inode, file); /* puts the base reference */
527
528 return rc;
529}
530
531static struct file_operations cuse_channel_fops; /* initialized during init */
532
533
534/**************************************************************************
535 * Misc stuff and module initialization
536 *
537 * CUSE exports the same set of attributes to sysfs as fusectl.
538 */
539
540static ssize_t cuse_class_waiting_show(struct device *dev,
541 struct device_attribute *attr, char *buf)
542{
543 struct cuse_conn *cc = dev_get_drvdata(dev);
544
545 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
546}
547
548static ssize_t cuse_class_abort_store(struct device *dev,
549 struct device_attribute *attr,
550 const char *buf, size_t count)
551{
552 struct cuse_conn *cc = dev_get_drvdata(dev);
553
554 fuse_abort_conn(&cc->fc);
555 return count;
556}
557
558static struct device_attribute cuse_class_dev_attrs[] = {
559 __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL),
560 __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store),
561 { }
562};
563
564static struct miscdevice cuse_miscdev = {
565 .minor = MISC_DYNAMIC_MINOR,
566 .name = "cuse",
567 .fops = &cuse_channel_fops,
568};
569
570static int __init cuse_init(void)
571{
572 int i, rc;
573
574 /* init conntbl */
575 for (i = 0; i < CUSE_CONNTBL_LEN; i++)
576 INIT_LIST_HEAD(&cuse_conntbl[i]);
577
578 /* inherit and extend fuse_dev_operations */
579 cuse_channel_fops = fuse_dev_operations;
580 cuse_channel_fops.owner = THIS_MODULE;
581 cuse_channel_fops.open = cuse_channel_open;
582 cuse_channel_fops.release = cuse_channel_release;
583
584 cuse_class = class_create(THIS_MODULE, "cuse");
585 if (IS_ERR(cuse_class))
586 return PTR_ERR(cuse_class);
587
588 cuse_class->dev_attrs = cuse_class_dev_attrs;
589
590 rc = misc_register(&cuse_miscdev);
591 if (rc) {
592 class_destroy(cuse_class);
593 return rc;
594 }
595
596 return 0;
597}
598
599static void __exit cuse_exit(void)
600{
601 misc_deregister(&cuse_miscdev);
602 class_destroy(cuse_class);
603}
604
605module_init(cuse_init);
606module_exit(cuse_exit);
607
608MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
609MODULE_DESCRIPTION("Character device in Userspace");
610MODULE_LICENSE("GPL");
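For reference, the userland side of the handshake implemented above is typically written against libfuse's cuse_lowlevel API (libfuse 2.8+). A hedged sketch; the device name and the ops shown are illustrative, not part of this patch:

	#include <cuse_lowlevel.h>
	#include <fuse_opt.h>
	#include <stddef.h>

	static void example_open(fuse_req_t req, struct fuse_file_info *fi)
	{
		fuse_reply_open(req, fi);
	}

	static const struct cuse_lowlevel_ops example_clop = {
		.open = example_open,
		/* .read, .write, .ioctl, ... */
	};

	int main(int argc, char **argv)
	{
		/* "DEVNAME=..." becomes the devinfo key parsed by
		 * cuse_parse_devinfo() above */
		const char *dev_info_argv[] = { "DEVNAME=example" };
		struct cuse_info ci = {
			.dev_info_argc = 1,
			.dev_info_argv = dev_info_argv,
			.flags = CUSE_UNRESTRICTED_IOCTL,
		};

		return cuse_lowlevel_main(argc, argv, &ci, &example_clop,
					  NULL);
	}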
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba76b68c52ff..6484eb75acd6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@ struct fuse_req *fuse_request_alloc(void)
 	fuse_request_init(req);
 	return req;
 }
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
 struct fuse_req *fuse_request_alloc_nofs(void)
 {
@@ -124,6 +125,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	atomic_dec(&fc->num_waiting);
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(fuse_get_req);
 
 /*
  * Return request in fuse_file->reserved_req. However that may
@@ -208,6 +210,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 		fuse_request_free(req);
 	}
 }
+EXPORT_SYMBOL_GPL(fuse_put_request);
 
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 {
@@ -282,9 +285,9 @@ __releases(&fc->lock)
 		wake_up_all(&fc->blocked_waitq);
 	}
 	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
-	    fc->connected) {
-		clear_bdi_congested(&fc->bdi, READ);
-		clear_bdi_congested(&fc->bdi, WRITE);
+	    fc->connected && fc->bdi_initialized) {
+		clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+		clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 	}
 	fc->num_background--;
 	fc->active_background--;
@@ -400,6 +403,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 	}
 	spin_unlock(&fc->lock);
 }
+EXPORT_SYMBOL_GPL(fuse_request_send);
 
 static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 					    struct fuse_req *req)
@@ -408,9 +412,10 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 	fc->num_background++;
 	if (fc->num_background == FUSE_MAX_BACKGROUND)
 		fc->blocked = 1;
-	if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
-		set_bdi_congested(&fc->bdi, READ);
-		set_bdi_congested(&fc->bdi, WRITE);
+	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+	    fc->bdi_initialized) {
+		set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+		set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 	}
 	list_add_tail(&req->list, &fc->bg_queue);
 	flush_bg_queue(fc);
@@ -439,6 +444,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 	req->isreply = 1;
 	fuse_request_send_nowait(fc, req);
 }
+EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
 /*
  * Called under fc->lock
@@ -843,6 +849,81 @@ err:
 	return err;
 }
 
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_inode_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+				       outarg.off, outarg.len);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_entry_out outarg;
+	int err = -EINVAL;
+	char buf[FUSE_NAME_MAX+1];
+	struct qstr name;
+
+	if (size < sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	err = -ENAMETOOLONG;
+	if (outarg.namelen > FUSE_NAME_MAX)
+		goto err;
+
+	name.name = buf;
+	name.len = outarg.namelen;
+	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+	buf[outarg.namelen] = 0;
+	name.hash = full_name_hash(name.name, name.len);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -850,6 +931,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_POLL:
 		return fuse_notify_poll(fc, size, cs);
 
+	case FUSE_NOTIFY_INVAL_INODE:
+		return fuse_notify_inval_inode(fc, size, cs);
+
+	case FUSE_NOTIFY_INVAL_ENTRY:
+		return fuse_notify_inval_entry(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -904,7 +991,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	int err;
-	unsigned nbytes = iov_length(iov, nr_segs);
+	size_t nbytes = iov_length(iov, nr_segs);
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 	struct fuse_copy_state cs;
@@ -1105,8 +1192,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
 	}
 	spin_unlock(&fc->lock);
 }
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
 
-static int fuse_dev_release(struct inode *inode, struct file *file)
+int fuse_dev_release(struct inode *inode, struct file *file)
 {
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (fc) {
@@ -1120,6 +1208,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(fuse_dev_release);
 
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
@@ -1142,6 +1231,7 @@ const struct file_operations fuse_dev_operations = {
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
 };
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
 
 static struct miscdevice fuse_miscdevice = {
 	.minor = FUSE_MINOR,
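The two notify handlers added above give servers a way to shoot down kernel caches. In userspace this is exposed by libfuse 2.8+ roughly as follows; the channel, inode numbers and entry name here are illustrative:

	#include <fuse_lowlevel.h>
	#include <string.h>

	/* Hedged sketch: push invalidations from a FUSE server. */
	static void example_invalidate(struct fuse_chan *ch,
				       fuse_ino_t parent, fuse_ino_t ino)
	{
		/* FUSE_NOTIFY_INVAL_INODE: drop cached attrs/pages of ino
		 * (off/len of 0 means the whole range) */
		fuse_lowlevel_notify_inval_inode(ch, ino, 0, 0);

		/* FUSE_NOTIFY_INVAL_ENTRY: drop the dentry "foo" under
		 * parent */
		fuse_lowlevel_notify_inval_entry(ch, parent, "foo",
						 strlen("foo"));
	}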
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8b8eebc5614b..e703654e7f40 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -362,19 +362,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 }
 
 /*
- * Synchronous release for the case when something goes wrong in CREATE_OPEN
- */
-static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
-			      u64 nodeid, int flags)
-{
-	fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
-	ff->reserved_req->force = 1;
-	fuse_request_send(fc, ff->reserved_req);
-	fuse_put_request(fc, ff->reserved_req);
-	kfree(ff);
-}
-
-/*
  * Atomic create+open operation
 *
 * If the filesystem doesn't support this, then fall back to separate
@@ -388,7 +375,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req;
 	struct fuse_req *forget_req;
-	struct fuse_open_in inarg;
+	struct fuse_create_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
 	struct fuse_file *ff;
@@ -412,15 +399,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!ff)
 		goto out_put_request;
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	flags &= ~O_NOCTTY;
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outentry, 0, sizeof(outentry));
 	inarg.flags = flags;
 	inarg.mode = mode;
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_CREATE;
 	req->in.h.nodeid = get_node_id(dir);
 	req->in.numargs = 2;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
+						sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = entry->d_name.len + 1;
 	req->in.args[1].value = entry->d_name.name;
@@ -445,12 +437,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 		goto out_free_ff;
 
 	fuse_put_request(fc, req);
+	ff->fh = outopen.fh;
+	ff->nodeid = outentry.nodeid;
+	ff->open_flags = outopen.open_flags;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr, entry_attr_timeout(&outentry), 0);
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
-		ff->fh = outopen.fh;
-		fuse_sync_release(fc, ff, outentry.nodeid, flags);
+		fuse_sync_release(ff, flags);
 		fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
 		return -ENOMEM;
 	}
@@ -460,11 +454,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	fuse_invalidate_attr(dir);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
-		ff->fh = outopen.fh;
-		fuse_sync_release(fc, ff, outentry.nodeid, flags);
+		fuse_sync_release(ff, flags);
 		return PTR_ERR(file);
 	}
-	fuse_finish_open(inode, file, ff, &outopen);
+	file->private_data = fuse_file_get(ff);
+	fuse_finish_open(inode, file);
 	return 0;
 
  out_free_ff:
@@ -557,12 +551,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
 	inarg.rdev = new_encode_dev(rdev);
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_MKNOD;
 	req->in.numargs = 2;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
+						sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = entry->d_name.len + 1;
 	req->in.args[1].value = entry->d_name.name;
@@ -589,8 +588,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_MKDIR;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
@@ -856,6 +859,43 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 	return err;
 }
 
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+			     struct qstr *name)
+{
+	int err = -ENOTDIR;
+	struct inode *parent;
+	struct dentry *dir;
+	struct dentry *entry;
+
+	parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+	if (!parent)
+		return -ENOENT;
+
+	mutex_lock(&parent->i_mutex);
+	if (!S_ISDIR(parent->i_mode))
+		goto unlock;
+
+	err = -ENOENT;
+	dir = d_find_alias(parent);
+	if (!dir)
+		goto unlock;
+
+	entry = d_lookup(dir, name);
+	dput(dir);
+	if (!entry)
+		goto unlock;
+
+	fuse_invalidate_attr(parent);
+	fuse_invalidate_entry(entry);
+	dput(entry);
+	err = 0;
+
+ unlock:
+	mutex_unlock(&parent->i_mutex);
+	iput(parent);
+	return err;
+}
+
 /*
  * Calling into a user-controlled filesystem gives the filesystem
  * daemon ptrace-like capabilities over the requester process. This
@@ -1035,7 +1075,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+	fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
 	fuse_request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
@@ -1101,12 +1141,14 @@ static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
 
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
-	return fuse_open_common(inode, file, 1);
+	return fuse_open_common(inode, file, true);
 }
 
 static int fuse_dir_release(struct inode *inode, struct file *file)
 {
-	return fuse_release_common(inode, file, 1);
+	fuse_release_common(file, FUSE_RELEASEDIR);
+
+	return 0;
 }
 
 static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
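Note that the umask handling above only strips the caller's umask in the kernel when the server has not negotiated FUSE_DONT_MASK; the raw umask is always forwarded in fuse_create_in/fuse_mknod_in so the server can apply it itself. A hedged libfuse 2.8+ sketch of a server opting in; the callback name is illustrative:

	#include <fuse_lowlevel.h>

	static void example_init(void *userdata, struct fuse_conn_info *conn)
	{
		/* maps to fc->dont_mask (FUSE_DONT_MASK) in the kernel:
		 * the server, not the kernel, applies the umask */
		conn->want |= FUSE_CAP_DONT_MASK;
	}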
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 06f30e965676..cbc464043b6f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,13 +12,13 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h>
15 16
16static const struct file_operations fuse_direct_io_file_operations; 17static const struct file_operations fuse_direct_io_file_operations;
17 18
18static int fuse_send_open(struct inode *inode, struct file *file, int isdir, 19static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
19 struct fuse_open_out *outargp) 20 int opcode, struct fuse_open_out *outargp)
20{ 21{
21 struct fuse_conn *fc = get_fuse_conn(inode);
22 struct fuse_open_in inarg; 22 struct fuse_open_in inarg;
23 struct fuse_req *req; 23 struct fuse_req *req;
24 int err; 24 int err;
@@ -31,8 +31,8 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
31 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 31 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
32 if (!fc->atomic_o_trunc) 32 if (!fc->atomic_o_trunc)
33 inarg.flags &= ~O_TRUNC; 33 inarg.flags &= ~O_TRUNC;
34 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 34 req->in.h.opcode = opcode;
35 req->in.h.nodeid = get_node_id(inode); 35 req->in.h.nodeid = nodeid;
36 req->in.numargs = 1; 36 req->in.numargs = 1;
37 req->in.args[0].size = sizeof(inarg); 37 req->in.args[0].size = sizeof(inarg);
38 req->in.args[0].value = &inarg; 38 req->in.args[0].value = &inarg;
@@ -49,22 +49,27 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 53 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
53 if (ff) { 54 if (unlikely(!ff))
54 ff->reserved_req = fuse_request_alloc(); 55 return NULL;
55 if (!ff->reserved_req) { 56
56 kfree(ff); 57 ff->fc = fc;
57 return NULL; 58 ff->reserved_req = fuse_request_alloc();
58 } else { 59 if (unlikely(!ff->reserved_req)) {
59 INIT_LIST_HEAD(&ff->write_entry); 60 kfree(ff);
60 atomic_set(&ff->count, 0); 61 return NULL;
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
67 } 62 }
63
64 INIT_LIST_HEAD(&ff->write_entry);
65 atomic_set(&ff->count, 0);
66 RB_CLEAR_NODE(&ff->polled_node);
67 init_waitqueue_head(&ff->poll_wait);
68
69 spin_lock(&fc->lock);
70 ff->kh = ++fc->khctr;
71 spin_unlock(&fc->lock);
72
68 return ff; 73 return ff;
69} 74}
70 75
@@ -74,7 +79,7 @@ void fuse_file_free(struct fuse_file *ff)
74 kfree(ff); 79 kfree(ff);
75} 80}
76 81
77static struct fuse_file *fuse_file_get(struct fuse_file *ff) 82struct fuse_file *fuse_file_get(struct fuse_file *ff)
78{ 83{
79 atomic_inc(&ff->count); 84 atomic_inc(&ff->count);
80 return ff; 85 return ff;
@@ -82,40 +87,65 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
82 87
83static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 88static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
84{ 89{
85 dput(req->misc.release.dentry); 90 path_put(&req->misc.release.path);
86 mntput(req->misc.release.vfsmount);
87} 91}
88 92
89static void fuse_file_put(struct fuse_file *ff) 93static void fuse_file_put(struct fuse_file *ff)
90{ 94{
91 if (atomic_dec_and_test(&ff->count)) { 95 if (atomic_dec_and_test(&ff->count)) {
92 struct fuse_req *req = ff->reserved_req; 96 struct fuse_req *req = ff->reserved_req;
93 struct inode *inode = req->misc.release.dentry->d_inode; 97
94 struct fuse_conn *fc = get_fuse_conn(inode);
95 req->end = fuse_release_end; 98 req->end = fuse_release_end;
96 fuse_request_send_background(fc, req); 99 fuse_request_send_background(ff->fc, req);
97 kfree(ff); 100 kfree(ff);
98 } 101 }
99} 102}
100 103
101void fuse_finish_open(struct inode *inode, struct file *file, 104int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
102 struct fuse_file *ff, struct fuse_open_out *outarg) 105 bool isdir)
103{ 106{
104 if (outarg->open_flags & FOPEN_DIRECT_IO) 107 struct fuse_open_out outarg;
108 struct fuse_file *ff;
109 int err;
110 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
111
112 ff = fuse_file_alloc(fc);
113 if (!ff)
114 return -ENOMEM;
115
116 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
117 if (err) {
118 fuse_file_free(ff);
119 return err;
120 }
121
122 if (isdir)
123 outarg.open_flags &= ~FOPEN_DIRECT_IO;
124
125 ff->fh = outarg.fh;
126 ff->nodeid = nodeid;
127 ff->open_flags = outarg.open_flags;
128 file->private_data = fuse_file_get(ff);
129
130 return 0;
131}
132EXPORT_SYMBOL_GPL(fuse_do_open);
133
134void fuse_finish_open(struct inode *inode, struct file *file)
135{
136 struct fuse_file *ff = file->private_data;
137
138 if (ff->open_flags & FOPEN_DIRECT_IO)
105 file->f_op = &fuse_direct_io_file_operations; 139 file->f_op = &fuse_direct_io_file_operations;
106 if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) 140 if (!(ff->open_flags & FOPEN_KEEP_CACHE))
107 invalidate_inode_pages2(inode->i_mapping); 141 invalidate_inode_pages2(inode->i_mapping);
108 if (outarg->open_flags & FOPEN_NONSEEKABLE) 142 if (ff->open_flags & FOPEN_NONSEEKABLE)
109 nonseekable_open(inode, file); 143 nonseekable_open(inode, file);
110 ff->fh = outarg->fh;
111 file->private_data = fuse_file_get(ff);
112} 144}
113 145
114int fuse_open_common(struct inode *inode, struct file *file, int isdir) 146int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
115{ 147{
116 struct fuse_conn *fc = get_fuse_conn(inode); 148 struct fuse_conn *fc = get_fuse_conn(inode);
117 struct fuse_open_out outarg;
118 struct fuse_file *ff;
119 int err; 149 int err;
120 150
121 /* VFS checks this, but only _after_ ->open() */ 151 /* VFS checks this, but only _after_ ->open() */
@@ -126,78 +156,85 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
126 if (err) 156 if (err)
127 return err; 157 return err;
128 158
129 ff = fuse_file_alloc(fc); 159 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
130 if (!ff)
131 return -ENOMEM;
132
133 err = fuse_send_open(inode, file, isdir, &outarg);
134 if (err) 160 if (err)
135 fuse_file_free(ff); 161 return err;
136 else {
137 if (isdir)
138 outarg.open_flags &= ~FOPEN_DIRECT_IO;
139 fuse_finish_open(inode, file, ff, &outarg);
140 }
141 162
142 return err; 163 fuse_finish_open(inode, file);
164
165 return 0;
143} 166}
144 167
145void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode) 168static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
146{ 169{
170 struct fuse_conn *fc = ff->fc;
147 struct fuse_req *req = ff->reserved_req; 171 struct fuse_req *req = ff->reserved_req;
148 struct fuse_release_in *inarg = &req->misc.release.in; 172 struct fuse_release_in *inarg = &req->misc.release.in;
149 173
174 spin_lock(&fc->lock);
175 list_del(&ff->write_entry);
176 if (!RB_EMPTY_NODE(&ff->polled_node))
177 rb_erase(&ff->polled_node, &fc->polled_files);
178 spin_unlock(&fc->lock);
179
180 wake_up_interruptible_sync(&ff->poll_wait);
181
150 inarg->fh = ff->fh; 182 inarg->fh = ff->fh;
151 inarg->flags = flags; 183 inarg->flags = flags;
152 req->in.h.opcode = opcode; 184 req->in.h.opcode = opcode;
153 req->in.h.nodeid = nodeid; 185 req->in.h.nodeid = ff->nodeid;
154 req->in.numargs = 1; 186 req->in.numargs = 1;
155 req->in.args[0].size = sizeof(struct fuse_release_in); 187 req->in.args[0].size = sizeof(struct fuse_release_in);
156 req->in.args[0].value = inarg; 188 req->in.args[0].value = inarg;
157} 189}
158 190
159int fuse_release_common(struct inode *inode, struct file *file, int isdir) 191void fuse_release_common(struct file *file, int opcode)
160{ 192{
161 struct fuse_file *ff = file->private_data; 193 struct fuse_file *ff;
162 if (ff) { 194 struct fuse_req *req;
163 struct fuse_conn *fc = get_fuse_conn(inode);
164 struct fuse_req *req = ff->reserved_req;
165
166 fuse_release_fill(ff, get_node_id(inode), file->f_flags,
167 isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
168 195
169 /* Hold vfsmount and dentry until release is finished */ 196 ff = file->private_data;
170 req->misc.release.vfsmount = mntget(file->f_path.mnt); 197 if (unlikely(!ff))
171 req->misc.release.dentry = dget(file->f_path.dentry); 198 return;
172 199
173 spin_lock(&fc->lock); 200 req = ff->reserved_req;
174 list_del(&ff->write_entry); 201 fuse_prepare_release(ff, file->f_flags, opcode);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
177 spin_unlock(&fc->lock);
178 202
179 wake_up_interruptible_sync(&ff->poll_wait); 203 /* Hold vfsmount and dentry until release is finished */
180 /* 204 path_get(&file->f_path);
181 * Normally this will send the RELEASE request, 205 req->misc.release.path = file->f_path;
182 * however if some asynchronous READ or WRITE requests
183 * are outstanding, the sending will be delayed
184 */
185 fuse_file_put(ff);
186 }
187 206
188 /* Return value is ignored by VFS */ 207 /*
189 return 0; 208 * Normally this will send the RELEASE request, however if
209 * some asynchronous READ or WRITE requests are outstanding,
210 * the sending will be delayed.
211 */
212 fuse_file_put(ff);
190} 213}
191 214
192static int fuse_open(struct inode *inode, struct file *file) 215static int fuse_open(struct inode *inode, struct file *file)
193{ 216{
194 return fuse_open_common(inode, file, 0); 217 return fuse_open_common(inode, file, false);
195} 218}
196 219
197static int fuse_release(struct inode *inode, struct file *file) 220static int fuse_release(struct inode *inode, struct file *file)
198{ 221{
199 return fuse_release_common(inode, file, 0); 222 fuse_release_common(file, FUSE_RELEASE);
223
224 /* return value is ignored by VFS */
225 return 0;
226}
227
228void fuse_sync_release(struct fuse_file *ff, int flags)
229{
230 WARN_ON(atomic_read(&ff->count) > 1);
231 fuse_prepare_release(ff, flags, FUSE_RELEASE);
232 ff->reserved_req->force = 1;
233 fuse_request_send(ff->fc, ff->reserved_req);
234 fuse_put_request(ff->fc, ff->reserved_req);
235 kfree(ff);
200} 236}
237EXPORT_SYMBOL_GPL(fuse_sync_release);
201 238
202/* 239/*
203 * Scramble the ID space with XTEA, so that the value of the files_struct 240 * Scramble the ID space with XTEA, so that the value of the files_struct
@@ -371,8 +408,8 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
371 return fuse_fsync_common(file, de, datasync, 0); 408 return fuse_fsync_common(file, de, datasync, 0);
372} 409}
373 410
374void fuse_read_fill(struct fuse_req *req, struct file *file, 411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
375 struct inode *inode, loff_t pos, size_t count, int opcode) 412 size_t count, int opcode)
376{ 413{
377 struct fuse_read_in *inarg = &req->misc.read.in; 414 struct fuse_read_in *inarg = &req->misc.read.in;
378 struct fuse_file *ff = file->private_data; 415 struct fuse_file *ff = file->private_data;
@@ -382,7 +419,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
382 inarg->size = count; 419 inarg->size = count;
383 inarg->flags = file->f_flags; 420 inarg->flags = file->f_flags;
384 req->in.h.opcode = opcode; 421 req->in.h.opcode = opcode;
385 req->in.h.nodeid = get_node_id(inode); 422 req->in.h.nodeid = ff->nodeid;
386 req->in.numargs = 1; 423 req->in.numargs = 1;
387 req->in.args[0].size = sizeof(struct fuse_read_in); 424 req->in.args[0].size = sizeof(struct fuse_read_in);
388 req->in.args[0].value = inarg; 425 req->in.args[0].value = inarg;
@@ -392,12 +429,12 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
392} 429}
393 430
394static size_t fuse_send_read(struct fuse_req *req, struct file *file, 431static size_t fuse_send_read(struct fuse_req *req, struct file *file,
395 struct inode *inode, loff_t pos, size_t count, 432 loff_t pos, size_t count, fl_owner_t owner)
396 fl_owner_t owner)
397{ 433{
398 struct fuse_conn *fc = get_fuse_conn(inode); 434 struct fuse_file *ff = file->private_data;
435 struct fuse_conn *fc = ff->fc;
399 436
400 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 437 fuse_read_fill(req, file, pos, count, FUSE_READ);
401 if (owner != NULL) { 438 if (owner != NULL) {
402 struct fuse_read_in *inarg = &req->misc.read.in; 439 struct fuse_read_in *inarg = &req->misc.read.in;
403 440
@@ -455,7 +492,7 @@ static int fuse_readpage(struct file *file, struct page *page)
455 req->out.argpages = 1; 492 req->out.argpages = 1;
456 req->num_pages = 1; 493 req->num_pages = 1;
457 req->pages[0] = page; 494 req->pages[0] = page;
458 num_read = fuse_send_read(req, file, inode, pos, count, NULL); 495 num_read = fuse_send_read(req, file, pos, count, NULL);
459 err = req->out.h.error; 496 err = req->out.h.error;
460 fuse_put_request(fc, req); 497 fuse_put_request(fc, req);
461 498
@@ -504,19 +541,18 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
504 fuse_file_put(req->ff); 541 fuse_file_put(req->ff);
505} 542}
506 543
507static void fuse_send_readpages(struct fuse_req *req, struct file *file, 544static void fuse_send_readpages(struct fuse_req *req, struct file *file)
508 struct inode *inode)
509{ 545{
510 struct fuse_conn *fc = get_fuse_conn(inode); 546 struct fuse_file *ff = file->private_data;
547 struct fuse_conn *fc = ff->fc;
511 loff_t pos = page_offset(req->pages[0]); 548 loff_t pos = page_offset(req->pages[0]);
512 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 549 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
513 550
514 req->out.argpages = 1; 551 req->out.argpages = 1;
515 req->out.page_zeroing = 1; 552 req->out.page_zeroing = 1;
516 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 553 fuse_read_fill(req, file, pos, count, FUSE_READ);
517 req->misc.read.attr_ver = fuse_get_attr_version(fc); 554 req->misc.read.attr_ver = fuse_get_attr_version(fc);
518 if (fc->async_read) { 555 if (fc->async_read) {
519 struct fuse_file *ff = file->private_data;
520 req->ff = fuse_file_get(ff); 556 req->ff = fuse_file_get(ff);
521 req->end = fuse_readpages_end; 557 req->end = fuse_readpages_end;
522 fuse_request_send_background(fc, req); 558 fuse_request_send_background(fc, req);
@@ -546,7 +582,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
546 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 582 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
547 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 583 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
548 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 584 req->pages[req->num_pages - 1]->index + 1 != page->index)) {
549 fuse_send_readpages(req, data->file, inode); 585 fuse_send_readpages(req, data->file);
550 data->req = req = fuse_get_req(fc); 586 data->req = req = fuse_get_req(fc);
551 if (IS_ERR(req)) { 587 if (IS_ERR(req)) {
552 unlock_page(page); 588 unlock_page(page);
@@ -580,7 +616,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
580 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 616 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
581 if (!err) { 617 if (!err) {
582 if (data.req->num_pages) 618 if (data.req->num_pages)
583 fuse_send_readpages(data.req, file, inode); 619 fuse_send_readpages(data.req, file);
584 else 620 else
585 fuse_put_request(fc, data.req); 621 fuse_put_request(fc, data.req);
586 } 622 }
@@ -607,24 +643,19 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
607 return generic_file_aio_read(iocb, iov, nr_segs, pos); 643 return generic_file_aio_read(iocb, iov, nr_segs, pos);
608} 644}
609 645
610static void fuse_write_fill(struct fuse_req *req, struct file *file, 646static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
611 struct fuse_file *ff, struct inode *inode, 647 loff_t pos, size_t count)
612 loff_t pos, size_t count, int writepage)
613{ 648{
614 struct fuse_conn *fc = get_fuse_conn(inode);
615 struct fuse_write_in *inarg = &req->misc.write.in; 649 struct fuse_write_in *inarg = &req->misc.write.in;
616 struct fuse_write_out *outarg = &req->misc.write.out; 650 struct fuse_write_out *outarg = &req->misc.write.out;
617 651
618 memset(inarg, 0, sizeof(struct fuse_write_in));
619 inarg->fh = ff->fh; 652 inarg->fh = ff->fh;
620 inarg->offset = pos; 653 inarg->offset = pos;
621 inarg->size = count; 654 inarg->size = count;
622 inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
623 inarg->flags = file ? file->f_flags : 0;
624 req->in.h.opcode = FUSE_WRITE; 655 req->in.h.opcode = FUSE_WRITE;
625 req->in.h.nodeid = get_node_id(inode); 656 req->in.h.nodeid = ff->nodeid;
626 req->in.numargs = 2; 657 req->in.numargs = 2;
627 if (fc->minor < 9) 658 if (ff->fc->minor < 9)
628 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 659 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
629 else 660 else
630 req->in.args[0].size = sizeof(struct fuse_write_in); 661 req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -636,13 +667,15 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
636} 667}
637 668
638static size_t fuse_send_write(struct fuse_req *req, struct file *file, 669static size_t fuse_send_write(struct fuse_req *req, struct file *file,
639 struct inode *inode, loff_t pos, size_t count, 670 loff_t pos, size_t count, fl_owner_t owner)
640 fl_owner_t owner)
641{ 671{
642 struct fuse_conn *fc = get_fuse_conn(inode); 672 struct fuse_file *ff = file->private_data;
643 fuse_write_fill(req, file, file->private_data, inode, pos, count, 0); 673 struct fuse_conn *fc = ff->fc;
674 struct fuse_write_in *inarg = &req->misc.write.in;
675
676 fuse_write_fill(req, ff, pos, count);
677 inarg->flags = file->f_flags;
644 if (owner != NULL) { 678 if (owner != NULL) {
645 struct fuse_write_in *inarg = &req->misc.write.in;
646 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 679 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
647 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 680 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
648 } 681 }
@@ -700,7 +733,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
700 req->num_pages = 1; 733 req->num_pages = 1;
701 req->pages[0] = page; 734 req->pages[0] = page;
702 req->page_offset = offset; 735 req->page_offset = offset;
703 nres = fuse_send_write(req, file, inode, pos, count, NULL); 736 nres = fuse_send_write(req, file, pos, count, NULL);
704 err = req->out.h.error; 737 err = req->out.h.error;
705 fuse_put_request(fc, req); 738 fuse_put_request(fc, req);
706 if (!err && !nres) 739 if (!err && !nres)
@@ -741,7 +774,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
741 for (i = 0; i < req->num_pages; i++) 774 for (i = 0; i < req->num_pages; i++)
742 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 775 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
743 776
744 res = fuse_send_write(req, file, inode, pos, count, NULL); 777 res = fuse_send_write(req, file, pos, count, NULL);
745 778
746 offset = req->page_offset; 779 offset = req->page_offset;
747 count = res; 780 count = res;
@@ -979,25 +1012,23 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
979 return 0; 1012 return 0;
980} 1013}
981 1014
982static ssize_t fuse_direct_io(struct file *file, const char __user *buf, 1015ssize_t fuse_direct_io(struct file *file, const char __user *buf,
983 size_t count, loff_t *ppos, int write) 1016 size_t count, loff_t *ppos, int write)
984{ 1017{
985 struct inode *inode = file->f_path.dentry->d_inode; 1018 struct fuse_file *ff = file->private_data;
986 struct fuse_conn *fc = get_fuse_conn(inode); 1019 struct fuse_conn *fc = ff->fc;
987 size_t nmax = write ? fc->max_write : fc->max_read; 1020 size_t nmax = write ? fc->max_write : fc->max_read;
988 loff_t pos = *ppos; 1021 loff_t pos = *ppos;
989 ssize_t res = 0; 1022 ssize_t res = 0;
990 struct fuse_req *req; 1023 struct fuse_req *req;
991 1024
992 if (is_bad_inode(inode))
993 return -EIO;
994
995 req = fuse_get_req(fc); 1025 req = fuse_get_req(fc);
996 if (IS_ERR(req)) 1026 if (IS_ERR(req))
997 return PTR_ERR(req); 1027 return PTR_ERR(req);
998 1028
999 while (count) { 1029 while (count) {
1000 size_t nres; 1030 size_t nres;
1031 fl_owner_t owner = current->files;
1001 size_t nbytes = min(count, nmax); 1032 size_t nbytes = min(count, nmax);
1002 int err = fuse_get_user_pages(req, buf, &nbytes, write); 1033 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1003 if (err) { 1034 if (err) {
@@ -1006,11 +1037,10 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1006 } 1037 }
1007 1038
1008 if (write) 1039 if (write)
1009 nres = fuse_send_write(req, file, inode, pos, nbytes, 1040 nres = fuse_send_write(req, file, pos, nbytes, owner);
1010 current->files);
1011 else 1041 else
1012 nres = fuse_send_read(req, file, inode, pos, nbytes, 1042 nres = fuse_send_read(req, file, pos, nbytes, owner);
1013 current->files); 1043
1014 fuse_release_user_pages(req, !write); 1044 fuse_release_user_pages(req, !write);
1015 if (req->out.h.error) { 1045 if (req->out.h.error) {
1016 if (!res) 1046 if (!res)
@@ -1034,20 +1064,27 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1034 } 1064 }
1035 } 1065 }
1036 fuse_put_request(fc, req); 1066 fuse_put_request(fc, req);
1037 if (res > 0) { 1067 if (res > 0)
1038 if (write)
1039 fuse_write_update_size(inode, pos);
1040 *ppos = pos; 1068 *ppos = pos;
1041 }
1042 fuse_invalidate_attr(inode);
1043 1069
1044 return res; 1070 return res;
1045} 1071}
1072EXPORT_SYMBOL_GPL(fuse_direct_io);
1046 1073
1047static ssize_t fuse_direct_read(struct file *file, char __user *buf, 1074static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1048 size_t count, loff_t *ppos) 1075 size_t count, loff_t *ppos)
1049{ 1076{
1050 return fuse_direct_io(file, buf, count, ppos, 0); 1077 ssize_t res;
1078 struct inode *inode = file->f_path.dentry->d_inode;
1079
1080 if (is_bad_inode(inode))
1081 return -EIO;
1082
1083 res = fuse_direct_io(file, buf, count, ppos, 0);
1084
1085 fuse_invalidate_attr(inode);
1086
1087 return res;
1051} 1088}
1052 1089
1053static ssize_t fuse_direct_write(struct file *file, const char __user *buf, 1090static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
@@ -1055,12 +1092,22 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1055{ 1092{
1056 struct inode *inode = file->f_path.dentry->d_inode; 1093 struct inode *inode = file->f_path.dentry->d_inode;
1057 ssize_t res; 1094 ssize_t res;
1095
1096 if (is_bad_inode(inode))
1097 return -EIO;
1098
1058 /* Don't allow parallel writes to the same file */ 1099 /* Don't allow parallel writes to the same file */
1059 mutex_lock(&inode->i_mutex); 1100 mutex_lock(&inode->i_mutex);
1060 res = generic_write_checks(file, ppos, &count, 0); 1101 res = generic_write_checks(file, ppos, &count, 0);
1061 if (!res) 1102 if (!res) {
1062 res = fuse_direct_io(file, buf, count, ppos, 1); 1103 res = fuse_direct_io(file, buf, count, ppos, 1);
1104 if (res > 0)
1105 fuse_write_update_size(inode, *ppos);
1106 }
1063 mutex_unlock(&inode->i_mutex); 1107 mutex_unlock(&inode->i_mutex);
1108
1109 fuse_invalidate_attr(inode);
1110
1064 return res; 1111 return res;
1065} 1112}
1066 1113
@@ -1177,9 +1224,10 @@ static int fuse_writepage_locked(struct page *page)
1177 req->ff = fuse_file_get(ff); 1224 req->ff = fuse_file_get(ff);
1178 spin_unlock(&fc->lock); 1225 spin_unlock(&fc->lock);
1179 1226
1180 fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); 1227 fuse_write_fill(req, ff, page_offset(page), 0);
1181 1228
1182 copy_highpage(tmp_page, page); 1229 copy_highpage(tmp_page, page);
1230 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1183 req->in.argpages = 1; 1231 req->in.argpages = 1;
1184 req->num_pages = 1; 1232 req->num_pages = 1;
1185 req->pages[0] = tmp_page; 1233 req->pages[0] = tmp_page;
@@ -1603,12 +1651,11 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1603 * limits ioctl data transfers to well-formed ioctls and is the forced 1651 * limits ioctl data transfers to well-formed ioctls and is the forced
1604 * behavior for all FUSE servers. 1652 * behavior for all FUSE servers.
1605 */ 1653 */
1606static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, 1654long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1607 unsigned long arg, unsigned int flags) 1655 unsigned int flags)
1608{ 1656{
1609 struct inode *inode = file->f_dentry->d_inode;
1610 struct fuse_file *ff = file->private_data; 1657 struct fuse_file *ff = file->private_data;
1611 struct fuse_conn *fc = get_fuse_conn(inode); 1658 struct fuse_conn *fc = ff->fc;
1612 struct fuse_ioctl_in inarg = { 1659 struct fuse_ioctl_in inarg = {
1613 .fh = ff->fh, 1660 .fh = ff->fh,
1614 .cmd = cmd, 1661 .cmd = cmd,
@@ -1627,13 +1674,6 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1627 /* assume all the iovs returned by client always fit in a page */ 1674 /* assume all the iovs returned by client always fit in a page */
1628 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1675 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1629 1676
1630 if (!fuse_allow_task(fc, current))
1631 return -EACCES;
1632
1633 err = -EIO;
1634 if (is_bad_inode(inode))
1635 goto out;
1636
1637 err = -ENOMEM; 1677 err = -ENOMEM;
1638 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1678 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1639 iov_page = alloc_page(GFP_KERNEL); 1679 iov_page = alloc_page(GFP_KERNEL);
@@ -1694,7 +1734,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1694 1734
1695 /* okay, let's send it to the client */ 1735 /* okay, let's send it to the client */
1696 req->in.h.opcode = FUSE_IOCTL; 1736 req->in.h.opcode = FUSE_IOCTL;
1697 req->in.h.nodeid = get_node_id(inode); 1737 req->in.h.nodeid = ff->nodeid;
1698 req->in.numargs = 1; 1738 req->in.numargs = 1;
1699 req->in.args[0].size = sizeof(inarg); 1739 req->in.args[0].size = sizeof(inarg);
1700 req->in.args[0].value = &inarg; 1740 req->in.args[0].value = &inarg;
@@ -1777,17 +1817,33 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1777 1817
1778 return err ? err : outarg.result; 1818 return err ? err : outarg.result;
1779} 1819}
1820EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1821
1822static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1823 unsigned long arg, unsigned int flags)
1824{
1825 struct inode *inode = file->f_dentry->d_inode;
1826 struct fuse_conn *fc = get_fuse_conn(inode);
1827
1828 if (!fuse_allow_task(fc, current))
1829 return -EACCES;
1830
1831 if (is_bad_inode(inode))
1832 return -EIO;
1833
1834 return fuse_do_ioctl(file, cmd, arg, flags);
1835}
1780 1836
1781static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1837static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1782 unsigned long arg) 1838 unsigned long arg)
1783{ 1839{
1784 return fuse_file_do_ioctl(file, cmd, arg, 0); 1840 return fuse_file_ioctl_common(file, cmd, arg, 0);
1785} 1841}
1786 1842
1787static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1843static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1788 unsigned long arg) 1844 unsigned long arg)
1789{ 1845{
1790 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); 1846 return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1791} 1847}
1792 1848
1793/* 1849/*
@@ -1841,11 +1897,10 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
1841 spin_unlock(&fc->lock); 1897 spin_unlock(&fc->lock);
1842} 1898}
1843 1899
1844static unsigned fuse_file_poll(struct file *file, poll_table *wait) 1900unsigned fuse_file_poll(struct file *file, poll_table *wait)
1845{ 1901{
1846 struct inode *inode = file->f_dentry->d_inode;
1847 struct fuse_file *ff = file->private_data; 1902 struct fuse_file *ff = file->private_data;
1848 struct fuse_conn *fc = get_fuse_conn(inode); 1903 struct fuse_conn *fc = ff->fc;
1849 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 1904 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1850 struct fuse_poll_out outarg; 1905 struct fuse_poll_out outarg;
1851 struct fuse_req *req; 1906 struct fuse_req *req;
@@ -1867,10 +1922,10 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1867 1922
1868 req = fuse_get_req(fc); 1923 req = fuse_get_req(fc);
1869 if (IS_ERR(req)) 1924 if (IS_ERR(req))
1870 return PTR_ERR(req); 1925 return POLLERR;
1871 1926
1872 req->in.h.opcode = FUSE_POLL; 1927 req->in.h.opcode = FUSE_POLL;
1873 req->in.h.nodeid = get_node_id(inode); 1928 req->in.h.nodeid = ff->nodeid;
1874 req->in.numargs = 1; 1929 req->in.numargs = 1;
1875 req->in.args[0].size = sizeof(inarg); 1930 req->in.args[0].size = sizeof(inarg);
1876 req->in.args[0].value = &inarg; 1931 req->in.args[0].value = &inarg;
@@ -1889,6 +1944,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1889 } 1944 }
1890 return POLLERR; 1945 return POLLERR;
1891} 1946}
1947EXPORT_SYMBOL_GPL(fuse_file_poll);
1892 1948
1893/* 1949/*
1894 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 1950 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
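
The thread running through the file.c hunks above: fuse_send_read/fuse_send_write, fuse_direct_io, fuse_do_ioctl and fuse_file_poll no longer touch the inode at all; they work from struct fuse_file alone, while the inode-level concerns (is_bad_inode, fuse_invalidate_attr, fuse_write_update_size, fuse_allow_task) move out into thin VFS-facing wrappers. That is what lets a caller with no backing inode reuse the newly exported helpers. A minimal sketch of such a caller, assuming a CUSE-style character-device handler (the function name here is hypothetical):

    static ssize_t cuse_style_read(struct file *file, char __user *buf,
                                   size_t count, loff_t *ppos)
    {
            loff_t pos = 0;    /* a character device has no file position */

            /*
             * No inode checks needed: fuse_direct_io() works purely from
             * file->private_data (struct fuse_file), which now carries
             * the connection and nodeid itself.
             */
            return fuse_direct_io(file, buf, count, &pos, 0);
    }
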
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6fc5aedaa0d5..52b641fc0faf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -97,8 +97,13 @@ struct fuse_inode {
97 struct list_head writepages; 97 struct list_head writepages;
98}; 98};
99 99
100struct fuse_conn;
101
100/** FUSE specific file data */ 102/** FUSE specific file data */
101struct fuse_file { 103struct fuse_file {
104 /** Fuse connection for this file */
105 struct fuse_conn *fc;
106
102 /** Request reserved for flush and release */ 107 /** Request reserved for flush and release */
103 struct fuse_req *reserved_req; 108 struct fuse_req *reserved_req;
104 109
@@ -108,9 +113,15 @@ struct fuse_file {
108 /** File handle used by userspace */ 113 /** File handle used by userspace */
109 u64 fh; 114 u64 fh;
110 115
116 /** Node id of this file */
117 u64 nodeid;
118
111 /** Refcount */ 119 /** Refcount */
112 atomic_t count; 120 atomic_t count;
113 121
122 /** FOPEN_* flags returned by open */
123 u32 open_flags;
124
114 /** Entry on inode's write_files list */ 125 /** Entry on inode's write_files list */
115 struct list_head write_entry; 126 struct list_head write_entry;
116 127
@@ -185,8 +196,6 @@ enum fuse_req_state {
185 FUSE_REQ_FINISHED 196 FUSE_REQ_FINISHED
186}; 197};
187 198
188struct fuse_conn;
189
190/** 199/**
191 * A request to the client 200 * A request to the client
192 */ 201 */
@@ -248,11 +257,12 @@ struct fuse_req {
248 struct fuse_forget_in forget_in; 257 struct fuse_forget_in forget_in;
249 struct { 258 struct {
250 struct fuse_release_in in; 259 struct fuse_release_in in;
251 struct vfsmount *vfsmount; 260 struct path path;
252 struct dentry *dentry;
253 } release; 261 } release;
254 struct fuse_init_in init_in; 262 struct fuse_init_in init_in;
255 struct fuse_init_out init_out; 263 struct fuse_init_out init_out;
264 struct cuse_init_in cuse_init_in;
265 struct cuse_init_out cuse_init_out;
256 struct { 266 struct {
257 struct fuse_read_in in; 267 struct fuse_read_in in;
258 u64 attr_ver; 268 u64 attr_ver;
@@ -386,6 +396,9 @@ struct fuse_conn {
386 /** Filesystem supports NFS exporting. Only set in INIT */ 396 /** Filesystem supports NFS exporting. Only set in INIT */
387 unsigned export_support:1; 397 unsigned export_support:1;
388 398
399 /** Set if bdi is valid */
400 unsigned bdi_initialized:1;
401
389 /* 402 /*
390 * The following bitfields are only for optimization purposes 403 * The following bitfields are only for optimization purposes
391 * and hence races in setting them will not cause malfunction 404 * and hence races in setting them will not cause malfunction
@@ -433,6 +446,9 @@ struct fuse_conn {
433 /** Do multi-page cached writes */ 446 /** Do multi-page cached writes */
434 unsigned big_writes:1; 447 unsigned big_writes:1;
435 448
449 /** Don't apply umask to creation modes */
450 unsigned dont_mask:1;
451
436 /** The number of requests waiting for completion */ 452 /** The number of requests waiting for completion */
437 atomic_t num_waiting; 453 atomic_t num_waiting;
438 454
@@ -468,6 +484,12 @@ struct fuse_conn {
468 484
469 /** Called on final put */ 485 /** Called on final put */
470 void (*release)(struct fuse_conn *); 486 void (*release)(struct fuse_conn *);
487
488 /** Super block for this connection. */
489 struct super_block *sb;
490
491 /** Read/write semaphore to hold when accessing sb. */
492 struct rw_semaphore killsb;
471}; 493};
472 494
473static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 495static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -496,6 +518,11 @@ extern const struct file_operations fuse_dev_operations;
496extern const struct dentry_operations fuse_dentry_operations; 518extern const struct dentry_operations fuse_dentry_operations;
497 519
498/** 520/**
521 * Inode to nodeid comparison.
522 */
523int fuse_inode_eq(struct inode *inode, void *_nodeidp);
524
525/**
499 * Get a filled in inode 526 * Get a filled in inode
500 */ 527 */
501struct inode *fuse_iget(struct super_block *sb, u64 nodeid, 528struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -515,25 +542,24 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
515 * Initialize READ or READDIR request 542 * Initialize READ or READDIR request
516 */ 543 */
517void fuse_read_fill(struct fuse_req *req, struct file *file, 544void fuse_read_fill(struct fuse_req *req, struct file *file,
518 struct inode *inode, loff_t pos, size_t count, int opcode); 545 loff_t pos, size_t count, int opcode);
519 546
520/** 547/**
521 * Send OPEN or OPENDIR request 548 * Send OPEN or OPENDIR request
522 */ 549 */
523int fuse_open_common(struct inode *inode, struct file *file, int isdir); 550int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
524 551
525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); 552struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
553struct fuse_file *fuse_file_get(struct fuse_file *ff);
526void fuse_file_free(struct fuse_file *ff); 554void fuse_file_free(struct fuse_file *ff);
527void fuse_finish_open(struct inode *inode, struct file *file, 555void fuse_finish_open(struct inode *inode, struct file *file);
528 struct fuse_file *ff, struct fuse_open_out *outarg);
529 556
530/** Fill in ff->reserved_req with a RELEASE request */ 557void fuse_sync_release(struct fuse_file *ff, int flags);
531void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode);
532 558
533/** 559/**
534 * Send RELEASE or RELEASEDIR request 560 * Send RELEASE or RELEASEDIR request
535 */ 561 */
536int fuse_release_common(struct inode *inode, struct file *file, int isdir); 562void fuse_release_common(struct file *file, int opcode);
537 563
538/** 564/**
539 * Send FSYNC or FSYNCDIR request 565 * Send FSYNC or FSYNCDIR request
@@ -652,10 +678,12 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
652 */ 678 */
653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 679struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
654 680
681void fuse_conn_kill(struct fuse_conn *fc);
682
655/** 683/**
656 * Initialize fuse_conn 684 * Initialize fuse_conn
657 */ 685 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); 686void fuse_conn_init(struct fuse_conn *fc);
659 687
660/** 688/**
661 * Release reference to fuse_conn 689 * Release reference to fuse_conn
@@ -694,4 +722,26 @@ void fuse_release_nowrite(struct inode *inode);
694 722
695u64 fuse_get_attr_version(struct fuse_conn *fc); 723u64 fuse_get_attr_version(struct fuse_conn *fc);
696 724
725/**
726 * File-system tells the kernel to invalidate cache for the given node id.
727 */
728int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
729 loff_t offset, loff_t len);
730
731/**
732 * File-system tells the kernel to invalidate parent attributes and
733 * the dentry matching parent/name.
734 */
735int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
736 struct qstr *name);
737
738int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
739 bool isdir);
740ssize_t fuse_direct_io(struct file *file, const char __user *buf,
741 size_t count, loff_t *ppos, int write);
742long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
743 unsigned int flags);
744unsigned fuse_file_poll(struct file *file, poll_table *wait);
745int fuse_dev_release(struct inode *inode, struct file *file);
746
697#endif /* _FS_FUSE_I_H */ 747#endif /* _FS_FUSE_I_H */
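
The new killsb semaphore and fc->sb back-pointer exist for the reverse-invalidation calls declared above, which are issued from the /dev/fuse write path and can therefore race with unmount. A minimal sketch of the intended read-side usage, assuming a notify handler and a wire struct along the lines of fuse_notify_inval_inode_out (the handler name and field names are assumptions here):

    static int handle_notify_inval_inode(struct fuse_conn *fc,
                                         struct fuse_notify_inval_inode_out *outarg)
    {
            int err = -ENOENT;

            down_read(&fc->killsb);         /* pin fc->sb against umount */
            if (fc->sb)
                    err = fuse_reverse_inval_inode(fc->sb, outarg->ino,
                                                   outarg->off, outarg->len);
            up_read(&fc->killsb);
            return err;
    }

The write side of the same lock appears below in inode.c, where fuse_kill_sb_anon()/fuse_kill_sb_blk() clear fc->sb under down_write() before the generic teardown runs.
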
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 91f7c85f1ffd..f91ccc4a189d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,7 +19,6 @@
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22#include <linux/smp_lock.h>
23 22
24MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); 23MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
25MODULE_DESCRIPTION("Filesystem in Userspace"); 24MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -207,7 +206,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
207 BUG(); 206 BUG();
208} 207}
209 208
210static int fuse_inode_eq(struct inode *inode, void *_nodeidp) 209int fuse_inode_eq(struct inode *inode, void *_nodeidp)
211{ 210{
212 u64 nodeid = *(u64 *) _nodeidp; 211 u64 nodeid = *(u64 *) _nodeidp;
213 if (get_node_id(inode) == nodeid) 212 if (get_node_id(inode) == nodeid)
@@ -258,11 +257,34 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
258 return inode; 257 return inode;
259} 258}
260 259
260int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
261 loff_t offset, loff_t len)
262{
263 struct inode *inode;
264 pgoff_t pg_start;
265 pgoff_t pg_end;
266
267 inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
268 if (!inode)
269 return -ENOENT;
270
271 fuse_invalidate_attr(inode);
272 if (offset >= 0) {
273 pg_start = offset >> PAGE_CACHE_SHIFT;
274 if (len <= 0)
275 pg_end = -1;
276 else
277 pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
278 invalidate_inode_pages2_range(inode->i_mapping,
279 pg_start, pg_end);
280 }
281 iput(inode);
282 return 0;
283}
284
261static void fuse_umount_begin(struct super_block *sb) 285static void fuse_umount_begin(struct super_block *sb)
262{ 286{
263 lock_kernel();
264 fuse_abort_conn(get_fuse_conn_super(sb)); 287 fuse_abort_conn(get_fuse_conn_super(sb));
265 unlock_kernel();
266} 288}
267 289
268static void fuse_send_destroy(struct fuse_conn *fc) 290static void fuse_send_destroy(struct fuse_conn *fc)
@@ -277,11 +299,14 @@ static void fuse_send_destroy(struct fuse_conn *fc)
277 } 299 }
278} 300}
279 301
280static void fuse_put_super(struct super_block *sb) 302static void fuse_bdi_destroy(struct fuse_conn *fc)
281{ 303{
282 struct fuse_conn *fc = get_fuse_conn_super(sb); 304 if (fc->bdi_initialized)
305 bdi_destroy(&fc->bdi);
306}
283 307
284 fuse_send_destroy(fc); 308void fuse_conn_kill(struct fuse_conn *fc)
309{
285 spin_lock(&fc->lock); 310 spin_lock(&fc->lock);
286 fc->connected = 0; 311 fc->connected = 0;
287 fc->blocked = 0; 312 fc->blocked = 0;
@@ -295,7 +320,16 @@ static void fuse_put_super(struct super_block *sb)
295 list_del(&fc->entry); 320 list_del(&fc->entry);
296 fuse_ctl_remove_conn(fc); 321 fuse_ctl_remove_conn(fc);
297 mutex_unlock(&fuse_mutex); 322 mutex_unlock(&fuse_mutex);
298 bdi_destroy(&fc->bdi); 323 fuse_bdi_destroy(fc);
324}
325EXPORT_SYMBOL_GPL(fuse_conn_kill);
326
327static void fuse_put_super(struct super_block *sb)
328{
329 struct fuse_conn *fc = get_fuse_conn_super(sb);
330
331 fuse_send_destroy(fc);
332 fuse_conn_kill(fc);
299 fuse_conn_put(fc); 333 fuse_conn_put(fc);
300} 334}
301 335
@@ -466,13 +500,12 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
466 return 0; 500 return 0;
467} 501}
468 502
469int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) 503void fuse_conn_init(struct fuse_conn *fc)
470{ 504{
471 int err;
472
473 memset(fc, 0, sizeof(*fc)); 505 memset(fc, 0, sizeof(*fc));
474 spin_lock_init(&fc->lock); 506 spin_lock_init(&fc->lock);
475 mutex_init(&fc->inst_mutex); 507 mutex_init(&fc->inst_mutex);
508 init_rwsem(&fc->killsb);
476 atomic_set(&fc->count, 1); 509 atomic_set(&fc->count, 1);
477 init_waitqueue_head(&fc->waitq); 510 init_waitqueue_head(&fc->waitq);
478 init_waitqueue_head(&fc->blocked_waitq); 511 init_waitqueue_head(&fc->blocked_waitq);
@@ -484,49 +517,12 @@ int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
484 INIT_LIST_HEAD(&fc->bg_queue); 517 INIT_LIST_HEAD(&fc->bg_queue);
485 INIT_LIST_HEAD(&fc->entry); 518 INIT_LIST_HEAD(&fc->entry);
486 atomic_set(&fc->num_waiting, 0); 519 atomic_set(&fc->num_waiting, 0);
487 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
488 fc->bdi.unplug_io_fn = default_unplug_io_fn;
489 /* fuse does its own writeback accounting */
490 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
491 fc->khctr = 0; 520 fc->khctr = 0;
492 fc->polled_files = RB_ROOT; 521 fc->polled_files = RB_ROOT;
493 fc->dev = sb->s_dev;
494 err = bdi_init(&fc->bdi);
495 if (err)
496 goto error_mutex_destroy;
497 if (sb->s_bdev) {
498 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
499 MAJOR(fc->dev), MINOR(fc->dev));
500 } else {
501 err = bdi_register_dev(&fc->bdi, fc->dev);
502 }
503 if (err)
504 goto error_bdi_destroy;
505 /*
506 * For a single fuse filesystem use max 1% of dirty +
507 * writeback threshold.
508 *
509 * This gives about 1M of write buffer for memory maps on a
510 * machine with 1G and 10% dirty_ratio, which should be more
511 * than enough.
512 *
513 * Privileged users can raise it by writing to
514 *
515 * /sys/class/bdi/<bdi>/max_ratio
516 */
517 bdi_set_max_ratio(&fc->bdi, 1);
518 fc->reqctr = 0; 522 fc->reqctr = 0;
519 fc->blocked = 1; 523 fc->blocked = 1;
520 fc->attr_version = 1; 524 fc->attr_version = 1;
521 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); 525 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
522
523 return 0;
524
525 error_bdi_destroy:
526 bdi_destroy(&fc->bdi);
527 error_mutex_destroy:
528 mutex_destroy(&fc->inst_mutex);
529 return err;
530} 526}
531EXPORT_SYMBOL_GPL(fuse_conn_init); 527EXPORT_SYMBOL_GPL(fuse_conn_init);
532 528
@@ -539,12 +535,14 @@ void fuse_conn_put(struct fuse_conn *fc)
539 fc->release(fc); 535 fc->release(fc);
540 } 536 }
541} 537}
538EXPORT_SYMBOL_GPL(fuse_conn_put);
542 539
543struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) 540struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
544{ 541{
545 atomic_inc(&fc->count); 542 atomic_inc(&fc->count);
546 return fc; 543 return fc;
547} 544}
545EXPORT_SYMBOL_GPL(fuse_conn_get);
548 546
549static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) 547static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
550{ 548{
@@ -753,6 +751,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
753 } 751 }
754 if (arg->flags & FUSE_BIG_WRITES) 752 if (arg->flags & FUSE_BIG_WRITES)
755 fc->big_writes = 1; 753 fc->big_writes = 1;
754 if (arg->flags & FUSE_DONT_MASK)
755 fc->dont_mask = 1;
756 } else { 756 } else {
757 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 757 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
758 fc->no_lock = 1; 758 fc->no_lock = 1;
@@ -776,7 +776,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
776 arg->minor = FUSE_KERNEL_MINOR_VERSION; 776 arg->minor = FUSE_KERNEL_MINOR_VERSION;
777 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 777 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
778 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 778 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
779 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; 779 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK;
780 req->in.h.opcode = FUSE_INIT; 780 req->in.h.opcode = FUSE_INIT;
781 req->in.numargs = 1; 781 req->in.numargs = 1;
782 req->in.args[0].size = sizeof(*arg); 782 req->in.args[0].size = sizeof(*arg);
@@ -797,6 +797,48 @@ static void fuse_free_conn(struct fuse_conn *fc)
797 kfree(fc); 797 kfree(fc);
798} 798}
799 799
800static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
801{
802 int err;
803
804 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
805 fc->bdi.unplug_io_fn = default_unplug_io_fn;
806 /* fuse does its own writeback accounting */
807 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
808
809 err = bdi_init(&fc->bdi);
810 if (err)
811 return err;
812
813 fc->bdi_initialized = 1;
814
815 if (sb->s_bdev) {
816 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
817 MAJOR(fc->dev), MINOR(fc->dev));
818 } else {
819 err = bdi_register_dev(&fc->bdi, fc->dev);
820 }
821
822 if (err)
823 return err;
824
825 /*
826 * For a single fuse filesystem use max 1% of dirty +
827 * writeback threshold.
828 *
829 * This gives about 1M of write buffer for memory maps on a
830 * machine with 1G and 10% dirty_ratio, which should be more
831 * than enough.
832 *
833 * Privileged users can raise it by writing to
834 *
835 * /sys/class/bdi/<bdi>/max_ratio
836 */
837 bdi_set_max_ratio(&fc->bdi, 1);
838
839 return 0;
840}
841
800static int fuse_fill_super(struct super_block *sb, void *data, int silent) 842static int fuse_fill_super(struct super_block *sb, void *data, int silent)
801{ 843{
802 struct fuse_conn *fc; 844 struct fuse_conn *fc;
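
The 1% figure in the comment above works out as follows (illustrative numbers only): with 1 GiB of RAM and vm.dirty_ratio = 10, the global dirty threshold is roughly 100 MiB; capping this bdi at max_ratio = 1 leaves it about 1 MiB of dirty + writeback headroom, which is the "about 1M of write buffer" the comment cites. A privileged user can still widen that per filesystem through /sys/class/bdi/<bdi>/max_ratio.
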
@@ -843,11 +885,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
843 if (!fc) 885 if (!fc)
844 goto err_fput; 886 goto err_fput;
845 887
846 err = fuse_conn_init(fc, sb); 888 fuse_conn_init(fc);
847 if (err) { 889
848 kfree(fc); 890 fc->dev = sb->s_dev;
849 goto err_fput; 891 fc->sb = sb;
850 } 892 err = fuse_bdi_init(fc, sb);
893 if (err)
894 goto err_put_conn;
895
896 /* Handle umasking inside the fuse code */
897 if (sb->s_flags & MS_POSIXACL)
898 fc->dont_mask = 1;
899 sb->s_flags |= MS_POSIXACL;
851 900
852 fc->release = fuse_free_conn; 901 fc->release = fuse_free_conn;
853 fc->flags = d.flags; 902 fc->flags = d.flags;
@@ -911,7 +960,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
911 err_put_root: 960 err_put_root:
912 dput(root_dentry); 961 dput(root_dentry);
913 err_put_conn: 962 err_put_conn:
914 bdi_destroy(&fc->bdi); 963 fuse_bdi_destroy(fc);
915 fuse_conn_put(fc); 964 fuse_conn_put(fc);
916 err_fput: 965 err_fput:
917 fput(file); 966 fput(file);
@@ -926,12 +975,25 @@ static int fuse_get_sb(struct file_system_type *fs_type,
926 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 975 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
927} 976}
928 977
978static void fuse_kill_sb_anon(struct super_block *sb)
979{
980 struct fuse_conn *fc = get_fuse_conn_super(sb);
981
982 if (fc) {
983 down_write(&fc->killsb);
984 fc->sb = NULL;
985 up_write(&fc->killsb);
986 }
987
988 kill_anon_super(sb);
989}
990
929static struct file_system_type fuse_fs_type = { 991static struct file_system_type fuse_fs_type = {
930 .owner = THIS_MODULE, 992 .owner = THIS_MODULE,
931 .name = "fuse", 993 .name = "fuse",
932 .fs_flags = FS_HAS_SUBTYPE, 994 .fs_flags = FS_HAS_SUBTYPE,
933 .get_sb = fuse_get_sb, 995 .get_sb = fuse_get_sb,
934 .kill_sb = kill_anon_super, 996 .kill_sb = fuse_kill_sb_anon,
935}; 997};
936 998
937#ifdef CONFIG_BLOCK 999#ifdef CONFIG_BLOCK
@@ -943,11 +1005,24 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
943 mnt); 1005 mnt);
944} 1006}
945 1007
1008static void fuse_kill_sb_blk(struct super_block *sb)
1009{
1010 struct fuse_conn *fc = get_fuse_conn_super(sb);
1011
1012 if (fc) {
1013 down_write(&fc->killsb);
1014 fc->sb = NULL;
1015 up_write(&fc->killsb);
1016 }
1017
1018 kill_block_super(sb);
1019}
1020
946static struct file_system_type fuseblk_fs_type = { 1021static struct file_system_type fuseblk_fs_type = {
947 .owner = THIS_MODULE, 1022 .owner = THIS_MODULE,
948 .name = "fuseblk", 1023 .name = "fuseblk",
949 .get_sb = fuse_get_sb_blk, 1024 .get_sb = fuse_get_sb_blk,
950 .kill_sb = kill_block_super, 1025 .kill_sb = fuse_kill_sb_blk,
951 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1026 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
952}; 1027};
953 1028
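
Two consequences of the restructuring above are easy to miss. First, fuse_conn_init() can no longer fail (it only initializes fields), so callers need no unwind path, and the new bdi_initialized bit lets fuse_bdi_destroy() run safely on a connection that never registered a bdi. Second, setting MS_POSIXACL in fuse_fill_super() stops the VFS from applying the umask on create, which is what the "Handle umasking inside the fuse code" comment refers to: with FUSE_DONT_MASK negotiated the server applies the umask itself, and otherwise the fuse create paths (elsewhere in this series) apply it before sending the request.
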
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3a981b7f64ca..5971359d2090 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,12 +1,13 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBD) 3 depends on EXPERIMENTAL && (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
7 select IP_SCTP if DLM_SCTP 7 select IP_SCTP if DLM_SCTP
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK
10 help 11 help
11 A cluster filesystem. 12 A cluster filesystem.
12 13
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index a851ea4bdf70..3da2f1f4f738 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,8 +1,9 @@
1EXTRA_CFLAGS := -I$(src)
1obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 5 aops.o dentry.o export.o file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 7 recovery.o rgrp.o super.o sys.o trans.o util.o
7 8
8gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o 9gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
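
The new EXTRA_CFLAGS := -I$(src) is presumably there for the tracepoints introduced below: glock.c now defines CREATE_TRACE_POINTS before including trace_gfs2.h, and the generic define_trace.h machinery re-includes the trace header by file name, so the directory defining it has to be on the compiler's include path.
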
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c
index a6dde1751e17..7ebae9a4ecc0 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/aops.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "meta_io.h" 30#include "meta_io.h"
31#include "ops_address.h"
32#include "quota.h" 31#include "quota.h"
33#include "trans.h" 32#include "trans.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -625,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
625{ 624{
626 struct gfs2_inode *ip = GFS2_I(mapping->host); 625 struct gfs2_inode *ip = GFS2_I(mapping->host);
627 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 626 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
627 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
629 int alloc_required; 629 int alloc_required;
630 int error = 0; 630 int error = 0;
@@ -638,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
638 error = gfs2_glock_nq(&ip->i_gh); 638 error = gfs2_glock_nq(&ip->i_gh);
639 if (unlikely(error)) 639 if (unlikely(error))
640 goto out_uninit; 640 goto out_uninit;
641 if (&ip->i_inode == sdp->sd_rindex) {
642 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
643 GL_NOCACHE, &m_ip->i_gh);
644 if (unlikely(error)) {
645 gfs2_glock_dq(&ip->i_gh);
646 goto out_uninit;
647 }
648 }
641 649
642 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 650 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
643 if (error) 651 if (error)
@@ -668,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
668 rblocks += data_blocks ? data_blocks : 1; 676 rblocks += data_blocks ? data_blocks : 1;
669 if (ind_blocks || data_blocks) 677 if (ind_blocks || data_blocks)
670 rblocks += RES_STATFS + RES_QUOTA; 678 rblocks += RES_STATFS + RES_QUOTA;
679 if (&ip->i_inode == sdp->sd_rindex)
680 rblocks += 2 * RES_STATFS;
671 681
672 error = gfs2_trans_begin(sdp, rblocks, 682 error = gfs2_trans_begin(sdp, rblocks,
673 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 683 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -713,6 +723,10 @@ out_alloc_put:
713 gfs2_alloc_put(ip); 723 gfs2_alloc_put(ip);
714 } 724 }
715out_unlock: 725out_unlock:
726 if (&ip->i_inode == sdp->sd_rindex) {
727 gfs2_glock_dq(&m_ip->i_gh);
728 gfs2_holder_uninit(&m_ip->i_gh);
729 }
716 gfs2_glock_dq(&ip->i_gh); 730 gfs2_glock_dq(&ip->i_gh);
717out_uninit: 731out_uninit:
718 gfs2_holder_uninit(&ip->i_gh); 732 gfs2_holder_uninit(&ip->i_gh);
@@ -726,14 +740,21 @@ out_uninit:
726static void adjust_fs_space(struct inode *inode) 740static void adjust_fs_space(struct inode *inode)
727{ 741{
728 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 742 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
743 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
744 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
729 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; 745 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
730 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; 746 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
747 struct buffer_head *m_bh, *l_bh;
731 u64 fs_total, new_free; 748 u64 fs_total, new_free;
732 749
733 /* Total up the file system space, according to the latest rindex. */ 750 /* Total up the file system space, according to the latest rindex. */
734 fs_total = gfs2_ri_total(sdp); 751 fs_total = gfs2_ri_total(sdp);
752 if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
753 return;
735 754
736 spin_lock(&sdp->sd_statfs_spin); 755 spin_lock(&sdp->sd_statfs_spin);
756 gfs2_statfs_change_in(m_sc, m_bh->b_data +
757 sizeof(struct gfs2_dinode));
737 if (fs_total > (m_sc->sc_total + l_sc->sc_total)) 758 if (fs_total > (m_sc->sc_total + l_sc->sc_total))
738 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); 759 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
739 else 760 else
@@ -742,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
742 fs_warn(sdp, "File system extended by %llu blocks.\n", 763 fs_warn(sdp, "File system extended by %llu blocks.\n",
743 (unsigned long long)new_free); 764 (unsigned long long)new_free);
744 gfs2_statfs_change(sdp, new_free, new_free, 0); 765 gfs2_statfs_change(sdp, new_free, new_free, 0);
766
767 if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
768 goto out;
769 update_statfs(sdp, m_bh, l_bh);
770 brelse(l_bh);
771out:
772 brelse(m_bh);
745} 773}
746 774
747/** 775/**
@@ -764,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
764{ 792{
765 struct gfs2_inode *ip = GFS2_I(inode); 793 struct gfs2_inode *ip = GFS2_I(inode);
766 struct gfs2_sbd *sdp = GFS2_SB(inode); 794 struct gfs2_sbd *sdp = GFS2_SB(inode);
795 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
767 u64 to = pos + copied; 796 u64 to = pos + copied;
768 void *kaddr; 797 void *kaddr;
769 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); 798 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -781,10 +810,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
781 unlock_page(page); 810 unlock_page(page);
782 page_cache_release(page); 811 page_cache_release(page);
783 812
784 if (inode->i_size < to) { 813 if (copied) {
785 i_size_write(inode, to); 814 if (inode->i_size < to) {
786 ip->i_disksize = inode->i_size; 815 i_size_write(inode, to);
787 di->di_size = cpu_to_be64(inode->i_size); 816 ip->i_disksize = inode->i_size;
817 }
818 gfs2_dinode_out(ip, di);
788 mark_inode_dirty(inode); 819 mark_inode_dirty(inode);
789 } 820 }
790 821
@@ -793,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
793 824
794 brelse(dibh); 825 brelse(dibh);
795 gfs2_trans_end(sdp); 826 gfs2_trans_end(sdp);
827 if (inode == sdp->sd_rindex) {
828 gfs2_glock_dq(&m_ip->i_gh);
829 gfs2_holder_uninit(&m_ip->i_gh);
830 }
796 gfs2_glock_dq(&ip->i_gh); 831 gfs2_glock_dq(&ip->i_gh);
797 gfs2_holder_uninit(&ip->i_gh); 832 gfs2_holder_uninit(&ip->i_gh);
798 return copied; 833 return copied;
@@ -822,9 +857,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
822 struct inode *inode = page->mapping->host; 857 struct inode *inode = page->mapping->host;
823 struct gfs2_inode *ip = GFS2_I(inode); 858 struct gfs2_inode *ip = GFS2_I(inode);
824 struct gfs2_sbd *sdp = GFS2_SB(inode); 859 struct gfs2_sbd *sdp = GFS2_SB(inode);
860 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
825 struct buffer_head *dibh; 861 struct buffer_head *dibh;
826 struct gfs2_alloc *al = ip->i_alloc; 862 struct gfs2_alloc *al = ip->i_alloc;
827 struct gfs2_dinode *di;
828 unsigned int from = pos & (PAGE_CACHE_SIZE - 1); 863 unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
829 unsigned int to = from + len; 864 unsigned int to = from + len;
830 int ret; 865 int ret;
@@ -847,11 +882,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 gfs2_page_add_databufs(ip, page, from, to); 882 gfs2_page_add_databufs(ip, page, from, to);
848 883
849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 884 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
850 885 if (ret > 0) {
851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { 886 if (inode->i_size > ip->i_disksize)
852 di = (struct gfs2_dinode *)dibh->b_data; 887 ip->i_disksize = inode->i_size;
853 ip->i_disksize = inode->i_size; 888 gfs2_dinode_out(ip, dibh->b_data);
854 di->di_size = cpu_to_be64(inode->i_size);
855 mark_inode_dirty(inode); 889 mark_inode_dirty(inode);
856 } 890 }
857 891
@@ -866,6 +900,10 @@ failed:
866 gfs2_quota_unlock(ip); 900 gfs2_quota_unlock(ip);
867 gfs2_alloc_put(ip); 901 gfs2_alloc_put(ip);
868 } 902 }
903 if (inode == sdp->sd_rindex) {
904 gfs2_glock_dq(&m_ip->i_gh);
905 gfs2_holder_uninit(&m_ip->i_gh);
906 }
869 gfs2_glock_dq(&ip->i_gh); 907 gfs2_glock_dq(&ip->i_gh);
870 gfs2_holder_uninit(&ip->i_gh); 908 gfs2_holder_uninit(&ip->i_gh);
871 return ret; 909 return ret;
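
The rindex-specific additions above all serve one case: growing the filesystem by writing new resource-group entries into the rindex. For that write, gfs2_write_begin() also acquires the statfs master inode's glock exclusively (master statfs is shared cluster state) and reserves 2 * RES_STATFS extra blocks; adjust_fs_space() re-reads the master statfs buffer from disk and pushes the result out via update_statfs(); and both write_end paths drop the extra glock, so the new capacity becomes visible coherently across nodes.
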
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3a5d3f883e10..6d47379e794b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -25,7 +25,7 @@
25#include "trans.h" 25#include "trans.h"
26#include "dir.h" 26#include "dir.h"
27#include "util.h" 27#include "util.h"
28#include "ops_address.h" 28#include "trace_gfs2.h"
29 29
30/* This doesn't need to be that large as max 64 bit pointers in a 4k 30/* This doesn't need to be that large as max 64 bit pointers in a 4k
31 * block is 512, so __u16 is fine for that. It saves stack space to 31 * block is 512, so __u16 is fine for that. It saves stack space to
@@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
136 and write it out to disk */ 136 and write it out to disk */
137 137
138 unsigned int n = 1; 138 unsigned int n = 1;
139 block = gfs2_alloc_block(ip, &n); 139 error = gfs2_alloc_block(ip, &block, &n);
140 if (error)
141 goto out_brelse;
140 if (isdir) { 142 if (isdir) {
141 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); 143 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
142 error = gfs2_dir_get_new_buffer(ip, block, &bh); 144 error = gfs2_dir_get_new_buffer(ip, block, &bh);
@@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
476 blks = dblks + iblks; 478 blks = dblks + iblks;
477 i = sheight; 479 i = sheight;
478 do { 480 do {
481 int error;
479 n = blks - alloced; 482 n = blks - alloced;
480 bn = gfs2_alloc_block(ip, &n); 483 error = gfs2_alloc_block(ip, &bn, &n);
484 if (error)
485 return error;
481 alloced += n; 486 alloced += n;
482 if (state != ALLOC_DATA || gfs2_is_jdata(ip)) 487 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
483 gfs2_trans_add_unrevoke(sdp, bn, n); 488 gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -585,6 +590,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
585 clear_buffer_mapped(bh_map); 590 clear_buffer_mapped(bh_map);
586 clear_buffer_new(bh_map); 591 clear_buffer_new(bh_map);
587 clear_buffer_boundary(bh_map); 592 clear_buffer_boundary(bh_map);
593 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
588 if (gfs2_is_dir(ip)) { 594 if (gfs2_is_dir(ip)) {
589 bsize = sdp->sd_jbsize; 595 bsize = sdp->sd_jbsize;
590 arr = sdp->sd_jheightsize; 596 arr = sdp->sd_jheightsize;
@@ -619,6 +625,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
619 ret = 0; 625 ret = 0;
620out: 626out:
621 release_metapath(&mp); 627 release_metapath(&mp);
628 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
622 bmap_unlock(ip, create); 629 bmap_unlock(ip, create);
623 return ret; 630 return ret;
624 631
@@ -1008,7 +1015,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
1008 gfs2_trans_add_bh(ip->i_gl, bh, 0); 1015 gfs2_trans_add_bh(ip->i_gl, bh, 0);
1009 1016
1010 zero_user(page, offset, length); 1017 zero_user(page, offset, length);
1011 1018 mark_buffer_dirty(bh);
1012unlock: 1019unlock:
1013 unlock_page(page); 1020 unlock_page(page);
1014 page_cache_release(page); 1021 page_cache_release(page);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..022c66cd5606 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/dentry.c
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index aef4d0c06748..297d7e5cebad 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
803{ 803{
804 struct gfs2_inode *ip = GFS2_I(inode); 804 struct gfs2_inode *ip = GFS2_I(inode);
805 unsigned int n = 1; 805 unsigned int n = 1;
806 u64 bn = gfs2_alloc_block(ip, &n); 806 u64 bn;
807 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); 807 int error;
808 struct buffer_head *bh;
808 struct gfs2_leaf *leaf; 809 struct gfs2_leaf *leaf;
809 struct gfs2_dirent *dent; 810 struct gfs2_dirent *dent;
810 struct qstr name = { .name = "", .len = 0, .hash = 0 }; 811 struct qstr name = { .name = "", .len = 0, .hash = 0 };
812
813 error = gfs2_alloc_block(ip, &bn, &n);
814 if (error)
815 return NULL;
816 bh = gfs2_meta_new(ip->i_gl, bn);
811 if (!bh) 817 if (!bh)
812 return NULL; 818 return NULL;
819
813 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); 820 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
814 gfs2_trans_add_bh(ip->i_gl, bh, 1); 821 gfs2_trans_add_bh(ip->i_gl, bh, 1);
815 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); 822 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 899763aed217..07ea9529adda 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
582 struct gfs2_ea_header *ea; 582 struct gfs2_ea_header *ea;
583 unsigned int n = 1; 583 unsigned int n = 1;
584 u64 block; 584 u64 block;
585 int error;
585 586
586 block = gfs2_alloc_block(ip, &n); 587 error = gfs2_alloc_block(ip, &block, &n);
588 if (error)
589 return error;
587 gfs2_trans_add_unrevoke(sdp, block, 1); 590 gfs2_trans_add_unrevoke(sdp, block, 1);
588 *bhp = gfs2_meta_new(ip->i_gl, block); 591 *bhp = gfs2_meta_new(ip->i_gl, block);
589 gfs2_trans_add_bh(ip->i_gl, *bhp, 1); 592 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
617 struct gfs2_ea_request *er) 620 struct gfs2_ea_request *er)
618{ 621{
619 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 int error;
620 624
621 ea->ea_data_len = cpu_to_be32(er->er_data_len); 625 ea->ea_data_len = cpu_to_be32(er->er_data_len);
622 ea->ea_name_len = er->er_name_len; 626 ea->ea_name_len = er->er_name_len;
@@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 int mh_size = sizeof(struct gfs2_meta_header); 646 int mh_size = sizeof(struct gfs2_meta_header);
643 unsigned int n = 1; 647 unsigned int n = 1;
644 648
645 block = gfs2_alloc_block(ip, &n); 649 error = gfs2_alloc_block(ip, &block, &n);
650 if (error)
651 return error;
646 gfs2_trans_add_unrevoke(sdp, block, 1); 652 gfs2_trans_add_unrevoke(sdp, block, 1);
647 bh = gfs2_meta_new(ip->i_gl, block); 653 bh = gfs2_meta_new(ip->i_gl, block);
648 gfs2_trans_add_bh(ip->i_gl, bh, 1); 654 gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
963 } else { 969 } else {
964 u64 blk; 970 u64 blk;
965 unsigned int n = 1; 971 unsigned int n = 1;
966 blk = gfs2_alloc_block(ip, &n); 972 error = gfs2_alloc_block(ip, &blk, &n);
973 if (error)
974 return error;
967 gfs2_trans_add_unrevoke(sdp, blk, 1); 975 gfs2_trans_add_unrevoke(sdp, blk, 1);
968 indbh = gfs2_meta_new(ip->i_gl, blk); 976 indbh = gfs2_meta_new(ip->i_gl, blk);
969 gfs2_trans_add_bh(ip->i_gl, indbh, 1); 977 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
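
The gfs2_alloc_block() change threaded through bmap.c, dir.c and eattr.c above is a signature change: the block number moves to an out-parameter and the return value becomes an error code, so allocation failure can finally be propagated instead of being unreportable. A minimal sketch of the new calling convention, distilled from the call sites above:

    u64 block;
    unsigned int n = 1;     /* in: blocks wanted; out: blocks obtained */
    int error;

    error = gfs2_alloc_block(ip, &block, &n);       /* 0 or -errno */
    if (error)
            return error;                           /* propagate cleanly */
    gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, n);
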
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c
index 9200ef221716..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/export.c
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c
index 5d82e91887e3..73318a3ce6f1 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/file.c
@@ -39,7 +39,6 @@
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h" 41#include "eaops.h"
42#include "ops_address.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -425,33 +424,36 @@ static struct vm_operations_struct gfs2_vm_ops = {
425 .page_mkwrite = gfs2_page_mkwrite, 424 .page_mkwrite = gfs2_page_mkwrite,
426}; 425};
427 426
428
429/** 427/**
430 * gfs2_mmap - 428 * gfs2_mmap -
431 * @file: The file to map 429 * @file: The file to map
432 * @vma: The VMA which described the mapping 430 * @vma: The VMA which described the mapping
433 * 431 *
434 * Returns: 0 or error code 432 * There is no need to get a lock here unless we should be updating
433 * atime. We ignore any locking errors since the only consequence is
434 * a missed atime update (which will just be deferred until later).
435 *
436 * Returns: 0
435 */ 437 */
436 438
437static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) 439static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
438{ 440{
439 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 441 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
440 struct gfs2_holder i_gh;
441 int error;
442 442
443 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 443 if (!(file->f_flags & O_NOATIME)) {
444 error = gfs2_glock_nq(&i_gh); 444 struct gfs2_holder i_gh;
445 if (error) { 445 int error;
446 gfs2_holder_uninit(&i_gh);
447 return error;
448 }
449 446
447 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
448 error = gfs2_glock_nq(&i_gh);
449 file_accessed(file);
450 if (error == 0)
451 gfs2_glock_dq_uninit(&i_gh);
452 }
450 vma->vm_ops = &gfs2_vm_ops; 453 vma->vm_ops = &gfs2_vm_ops;
454 vma->vm_flags |= VM_CAN_NONLINEAR;
451 455
452 gfs2_glock_dq_uninit(&i_gh); 456 return 0;
453
454 return error;
455} 457}
456 458
457/** 459/**
@@ -692,12 +694,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
692 694
693static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 695static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
694{ 696{
695 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
696
697 if (!(fl->fl_flags & FL_FLOCK)) 697 if (!(fl->fl_flags & FL_FLOCK))
698 return -ENOLCK; 698 return -ENOLCK;
699 if (__mandatory_lock(&ip->i_inode)) 699 if (fl->fl_type & LOCK_MAND)
700 return -ENOLCK; 700 return -EOPNOTSUPP;
701 701
702 if (fl->fl_type == F_UNLCK) { 702 if (fl->fl_type == F_UNLCK) {
703 do_unflock(file, fl); 703 do_unflock(file, fl);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ff4981090489..8b674b1f3a55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -39,6 +39,8 @@
39#include "super.h" 39#include "super.h"
40#include "util.h" 40#include "util.h"
41#include "bmap.h" 41#include "bmap.h"
42#define CREATE_TRACE_POINTS
43#include "trace_gfs2.h"
42 44
43struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
44 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -61,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
61static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
62static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
63static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue;
64static LIST_HEAD(lru_list); 67static LIST_HEAD(lru_list);
65static atomic_t lru_count = ATOMIC_INIT(0); 68static atomic_t lru_count = ATOMIC_INIT(0);
66static DEFINE_SPINLOCK(lru_lock); 69static DEFINE_SPINLOCK(lru_lock);
@@ -155,7 +158,7 @@ static void glock_free(struct gfs2_glock *gl)
155 158
156 if (aspace) 159 if (aspace)
157 gfs2_aspace_put(aspace); 160 gfs2_aspace_put(aspace);
158 161 trace_gfs2_glock_put(gl);
159 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
160} 163}
161 164
@@ -165,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl)
165 * 168 *
166 */ 169 */
167 170
168static void gfs2_glock_hold(struct gfs2_glock *gl) 171void gfs2_glock_hold(struct gfs2_glock *gl)
169{ 172{
170 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); 173 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
171 atomic_inc(&gl->gl_ref); 174 atomic_inc(&gl->gl_ref);
172} 175}
173 176
174/** 177/**
178 * demote_ok - Check to see if it's ok to unlock a glock
179 * @gl: the glock
180 *
181 * Returns: 1 if it's ok
182 */
183
184static int demote_ok(const struct gfs2_glock *gl)
185{
186 const struct gfs2_glock_operations *glops = gl->gl_ops;
187
188 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0;
190 if (!list_empty(&gl->gl_holders))
191 return 0;
192 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl);
194 return 1;
195}
196
197/**
175 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
176 * @gl: the glock 199 * @gl: the glock
177 * 200 *
@@ -179,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
179 202
180static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
181{ 204{
205 int may_reclaim;
206 may_reclaim = (demote_ok(gl) &&
207 (atomic_read(&gl->gl_ref) == 1 ||
208 (gl->gl_name.ln_type == LM_TYPE_INODE &&
209 atomic_read(&gl->gl_ref) <= 2)));
182 spin_lock(&lru_lock); 210 spin_lock(&lru_lock);
183 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { 211 if (list_empty(&gl->gl_lru) && may_reclaim) {
184 list_add_tail(&gl->gl_lru, &lru_list); 212 list_add_tail(&gl->gl_lru, &lru_list);
185 atomic_inc(&lru_count); 213 atomic_inc(&lru_count);
186 } 214 }
@@ -188,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
188} 216}
189 217
190/** 218/**
219 * gfs2_glock_put_nolock() - Decrement reference count on glock
220 * @gl: The glock to put
221 *
222 * This function should only be used if the caller has its own reference
223 * to the glock, in addition to the one it is dropping.
224 */
225
226void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{
228 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231}
232
233/**
191 * gfs2_glock_put() - Decrement reference count on glock 234 * gfs2_glock_put() - Decrement reference count on glock
192 * @gl: The glock to put 235 * @gl: The glock to put
193 * 236 *
@@ -212,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
212 rv = 1; 255 rv = 1;
213 goto out; 256 goto out;
214 } 257 }
215 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ 258 spin_lock(&gl->gl_spin);
216 if (atomic_read(&gl->gl_ref) == 2) 259 gfs2_glock_schedule_for_reclaim(gl);
217 gfs2_glock_schedule_for_reclaim(gl); 260 spin_unlock(&gl->gl_spin);
218 write_unlock(gl_lock_addr(gl->gl_hash)); 261 write_unlock(gl_lock_addr(gl->gl_hash));
219out: 262out:
220 return rv; 263 return rv;
@@ -317,14 +360,17 @@ restart:
317 return 2; 360 return 2;
318 gh->gh_error = ret; 361 gh->gh_error = ret;
319 list_del_init(&gh->gh_list); 362 list_del_init(&gh->gh_list);
363 trace_gfs2_glock_queue(gh, 0);
320 gfs2_holder_wake(gh); 364 gfs2_holder_wake(gh);
321 goto restart; 365 goto restart;
322 } 366 }
323 set_bit(HIF_HOLDER, &gh->gh_iflags); 367 set_bit(HIF_HOLDER, &gh->gh_iflags);
368 trace_gfs2_promote(gh, 1);
324 gfs2_holder_wake(gh); 369 gfs2_holder_wake(gh);
325 goto restart; 370 goto restart;
326 } 371 }
327 set_bit(HIF_HOLDER, &gh->gh_iflags); 372 set_bit(HIF_HOLDER, &gh->gh_iflags);
373 trace_gfs2_promote(gh, 0);
328 gfs2_holder_wake(gh); 374 gfs2_holder_wake(gh);
329 continue; 375 continue;
330 } 376 }
@@ -354,6 +400,7 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
354 else 400 else
355 continue; 401 continue;
356 list_del_init(&gh->gh_list); 402 list_del_init(&gh->gh_list);
403 trace_gfs2_glock_queue(gh, 0);
357 gfs2_holder_wake(gh); 404 gfs2_holder_wake(gh);
358 } 405 }
359} 406}
@@ -392,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
392 if (held2) 439 if (held2)
393 gfs2_glock_hold(gl); 440 gfs2_glock_hold(gl);
394 else 441 else
395 gfs2_glock_put(gl); 442 gfs2_glock_put_nolock(gl);
396 } 443 }
397 444
398 gl->gl_state = new_state; 445 gl->gl_state = new_state;
@@ -422,6 +469,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
422 int rv; 469 int rv;
423 470
424 spin_lock(&gl->gl_spin); 471 spin_lock(&gl->gl_spin);
472 trace_gfs2_glock_state_change(gl, state);
425 state_change(gl, state); 473 state_change(gl, state);
426 gh = find_first_waiter(gl); 474 gh = find_first_waiter(gl);
427 475
@@ -626,12 +674,35 @@ out:
626out_sched: 674out_sched:
627 gfs2_glock_hold(gl); 675 gfs2_glock_hold(gl);
628 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 676 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
629 gfs2_glock_put(gl); 677 gfs2_glock_put_nolock(gl);
630out_unlock: 678out_unlock:
631 clear_bit(GLF_LOCK, &gl->gl_flags); 679 clear_bit(GLF_LOCK, &gl->gl_flags);
632 goto out; 680 goto out;
633} 681}
634 682
683static void delete_work_func(struct work_struct *work)
684{
685 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
686 struct gfs2_sbd *sdp = gl->gl_sbd;
687 struct gfs2_inode *ip = NULL;
688 struct inode *inode;
689 u64 no_addr = 0;
690
691 spin_lock(&gl->gl_spin);
692 ip = (struct gfs2_inode *)gl->gl_object;
693 if (ip)
694 no_addr = ip->i_no_addr;
695 spin_unlock(&gl->gl_spin);
696 if (ip) {
697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
698 if (inode) {
699 d_prune_aliases(inode);
700 iput(inode);
701 }
702 }
703 gfs2_glock_put(gl);
704}
705
635static void glock_work_func(struct work_struct *work) 706static void glock_work_func(struct work_struct *work)
636{ 707{
637 unsigned long delay = 0; 708 unsigned long delay = 0;
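
delete_work_func() above is the consumer half of the new gfs2_delete_workqueue: it re-finds the inode by number and uses d_prune_aliases()/iput() so the final deallocation happens in ordinary process context rather than in the lock-callback path that noticed the glock should go. A sketch of the producer half, assuming a glock-ops ->go_callback along these lines (the hook is invoked via gl_ops->go_callback in handle_callback() above; this function name is hypothetical):

    static void example_go_callback(struct gfs2_glock *gl)
    {
            gfs2_glock_hold(gl);            /* reference handed to ... */
            if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
                    gfs2_glock_put(gl);     /* ... delete_work_func(); undo
                                             * the hold if the work was
                                             * already queued */
    }
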
@@ -710,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
710 gl->gl_sbd = sdp; 781 gl->gl_sbd = sdp;
711 gl->gl_aspace = NULL; 782 gl->gl_aspace = NULL;
712 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 783 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
784 INIT_WORK(&gl->gl_delete, delete_work_func);
713 785
714 /* If this glock protects actual on-disk data or metadata blocks, 786 /* If this glock protects actual on-disk data or metadata blocks,
715 create a VFS inode to manage the pages/buffers holding them. */ 787 create a VFS inode to manage the pages/buffers holding them. */
@@ -796,22 +868,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
796 gh->gh_ip = 0; 868 gh->gh_ip = 0;
797} 869}
798 870
799static int just_schedule(void *word) 871/**
872 * gfs2_glock_holder_wait
873 * @word: unused
874 *
875 * This function and gfs2_glock_demote_wait both show up in the WCHAN
876 * field. Thus I've separated these otherwise identical functions in
877 * order to be more informative to the user.
878 */
879
880static int gfs2_glock_holder_wait(void *word)
800{ 881{
801 schedule(); 882 schedule();
802 return 0; 883 return 0;
803} 884}
804 885
886static int gfs2_glock_demote_wait(void *word)
887{
888 schedule();
889 return 0;
890}
891
805static void wait_on_holder(struct gfs2_holder *gh) 892static void wait_on_holder(struct gfs2_holder *gh)
806{ 893{
807 might_sleep(); 894 might_sleep();
808 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); 895 wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
809} 896}
810 897
811static void wait_on_demote(struct gfs2_glock *gl) 898static void wait_on_demote(struct gfs2_glock *gl)
812{ 899{
813 might_sleep(); 900 might_sleep();
814 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); 901 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
815} 902}
816 903
817/** 904/**
@@ -836,6 +923,9 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
836 gl->gl_demote_state != state) { 923 gl->gl_demote_state != state) {
837 gl->gl_demote_state = LM_ST_UNLOCKED; 924 gl->gl_demote_state = LM_ST_UNLOCKED;
838 } 925 }
926 if (gl->gl_ops->go_callback)
927 gl->gl_ops->go_callback(gl);
928 trace_gfs2_demote_rq(gl);
839} 929}
840 930
841/** 931/**
@@ -921,6 +1011,7 @@ fail:
921 goto do_cancel; 1011 goto do_cancel;
922 return; 1012 return;
923 } 1013 }
1014 trace_gfs2_glock_queue(gh, 1);
924 list_add_tail(&gh->gh_list, insert_pt); 1015 list_add_tail(&gh->gh_list, insert_pt);
925do_cancel: 1016do_cancel:
926 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 1017 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1017,6 +1108,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1017 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1108 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1018 fast_path = 1; 1109 fast_path = 1;
1019 } 1110 }
1111 trace_gfs2_glock_queue(gh, 0);
1020 spin_unlock(&gl->gl_spin); 1112 spin_unlock(&gl->gl_spin);
1021 if (likely(fast_path)) 1113 if (likely(fast_path))
1022 return; 1114 return;
@@ -1249,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1249 gfs2_glock_put(gl); 1341 gfs2_glock_put(gl);
1250} 1342}
1251 1343
1252/**
1253 * demote_ok - Check to see if it's ok to unlock a glock
1254 * @gl: the glock
1255 *
1256 * Returns: 1 if it's ok
1257 */
1258
1259static int demote_ok(const struct gfs2_glock *gl)
1260{
1261 const struct gfs2_glock_operations *glops = gl->gl_ops;
1262
1263 if (gl->gl_state == LM_ST_UNLOCKED)
1264 return 0;
1265 if (!list_empty(&gl->gl_holders))
1266 return 0;
1267 if (glops->go_demote_ok)
1268 return glops->go_demote_ok(gl);
1269 return 1;
1270}
1271
1272 1344
1273static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) 1345static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1274{ 1346{
1275 struct gfs2_glock *gl; 1347 struct gfs2_glock *gl;
1276 int may_demote; 1348 int may_demote;
1277 int nr_skipped = 0; 1349 int nr_skipped = 0;
1278 int got_ref = 0;
1279 LIST_HEAD(skipped); 1350 LIST_HEAD(skipped);
1280 1351
1281 if (nr == 0) 1352 if (nr == 0)
@@ -1290,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1290 list_del_init(&gl->gl_lru); 1361 list_del_init(&gl->gl_lru);
1291 atomic_dec(&lru_count); 1362 atomic_dec(&lru_count);
1292 1363
1364 /* Check if glock is about to be freed */
1365 if (atomic_read(&gl->gl_ref) == 0)
1366 continue;
1367
1293 /* Test for being demotable */ 1368 /* Test for being demotable */
1294 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 1369 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1295 gfs2_glock_hold(gl); 1370 gfs2_glock_hold(gl);
1296 got_ref = 1;
1297 spin_unlock(&lru_lock); 1371 spin_unlock(&lru_lock);
1298 spin_lock(&gl->gl_spin); 1372 spin_lock(&gl->gl_spin);
1299 may_demote = demote_ok(gl); 1373 may_demote = demote_ok(gl);
1300 spin_unlock(&gl->gl_spin);
1301 clear_bit(GLF_LOCK, &gl->gl_flags);
1302 if (may_demote) { 1374 if (may_demote) {
1303 handle_callback(gl, LM_ST_UNLOCKED, 0); 1375 handle_callback(gl, LM_ST_UNLOCKED, 0);
1304 nr--; 1376 nr--;
1305 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1306 gfs2_glock_put(gl);
1307 got_ref = 0;
1308 } 1377 }
1378 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1379 gfs2_glock_put_nolock(gl);
1380 spin_unlock(&gl->gl_spin);
1381 clear_bit(GLF_LOCK, &gl->gl_flags);
1309 spin_lock(&lru_lock); 1382 spin_lock(&lru_lock);
1310 if (may_demote) 1383 continue;
1311 continue;
1312 }
1313 if (list_empty(&gl->gl_lru) &&
1314 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1315 nr_skipped++;
1316 list_add(&gl->gl_lru, &skipped);
1317 }
1318 if (got_ref) {
1319 spin_unlock(&lru_lock);
1320 gfs2_glock_put(gl);
1321 spin_lock(&lru_lock);
1322 got_ref = 0;
1323 } 1384 }
1385 nr_skipped++;
1386 list_add(&gl->gl_lru, &skipped);
1324 } 1387 }
1325 list_splice(&skipped, &lru_list); 1388 list_splice(&skipped, &lru_list);
1326 atomic_add(nr_skipped, &lru_count); 1389 atomic_add(nr_skipped, &lru_count);
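
The reworked loop above is written against the 2.6.31-era shrinker contract:
the callback is invoked with nr == 0 to ask how many objects are cached, and
with nr > 0 to ask for that many to be scanned, returning the remaining count
(or -1 if it cannot make progress under this gfp_mask). A minimal
registration sketch, with a hypothetical example_cached counter:

	static atomic_t example_cached = ATOMIC_INIT(0);

	static int example_shrink(int nr, gfp_t gfp_mask)
	{
		if (nr == 0)
			return atomic_read(&example_cached);	/* just report */
		if (!(gfp_mask & __GFP_FS))
			return -1;	/* must not recurse into the fs */
		/* ... free up to nr objects, decrementing example_cached ... */
		return atomic_read(&example_cached);
	}

	static struct shrinker example_shrinker = {
		.shrink = example_shrink,
		.seeks = DEFAULT_SEEKS,
	};

	/* register_shrinker(&example_shrinker) at init,
	   unregister_shrinker(&example_shrinker) at exit */
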
@@ -1702,6 +1765,11 @@ int __init gfs2_glock_init(void)
1702 glock_workqueue = create_workqueue("glock_workqueue"); 1765 glock_workqueue = create_workqueue("glock_workqueue");
1703 if (IS_ERR(glock_workqueue)) 1766 if (IS_ERR(glock_workqueue))
1704 return PTR_ERR(glock_workqueue); 1767 return PTR_ERR(glock_workqueue);
1768 gfs2_delete_workqueue = create_workqueue("delete_workqueue");
1769 if (IS_ERR(gfs2_delete_workqueue)) {
1770 destroy_workqueue(glock_workqueue);
1771 return PTR_ERR(gfs2_delete_workqueue);
1772 }
1705 1773
1706 register_shrinker(&glock_shrinker); 1774 register_shrinker(&glock_shrinker);
1707 1775
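
Creating a second workqueue forces a partial-unwind error path: if the second
allocation fails, the first queue must be destroyed before returning. One
hedge on the code above: in mainline, create_workqueue() conventionally
returns NULL on failure rather than an ERR_PTR, so the usual idiom is a NULL
check, roughly:

	static struct workqueue_struct *wq_a, *wq_b;

	static int __init example_init(void)
	{
		wq_a = create_workqueue("example_a");
		if (!wq_a)
			return -ENOMEM;
		wq_b = create_workqueue("example_b");
		if (!wq_b) {
			destroy_workqueue(wq_a);	/* unwind the first */
			return -ENOMEM;
		}
		return 0;
	}
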
@@ -1712,6 +1780,7 @@ void gfs2_glock_exit(void)
1712{ 1780{
1713 unregister_shrinker(&glock_shrinker); 1781 unregister_shrinker(&glock_shrinker);
1714 destroy_workqueue(glock_workqueue); 1782 destroy_workqueue(glock_workqueue);
1783 destroy_workqueue(gfs2_delete_workqueue);
1715} 1784}
1716 1785
1717static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1786static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f08..c609894ec0d0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
143 143
144#define GLR_TRYFAILED 13 144#define GLR_TRYFAILED 13
145 145
146extern struct workqueue_struct *gfs2_delete_workqueue;
146static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
147{ 148{
148 struct gfs2_holder *gh; 149 struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
191int gfs2_glock_get(struct gfs2_sbd *sdp, 192int gfs2_glock_get(struct gfs2_sbd *sdp,
192 u64 number, const struct gfs2_glock_operations *glops, 193 u64 number, const struct gfs2_glock_operations *glops,
193 int create, struct gfs2_glock **glp); 194 int create, struct gfs2_glock **glp);
195void gfs2_glock_hold(struct gfs2_glock *gl);
196void gfs2_glock_put_nolock(struct gfs2_glock *gl);
194int gfs2_glock_put(struct gfs2_glock *gl); 197int gfs2_glock_put(struct gfs2_glock *gl);
195void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 198void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
196 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 70f87f43afa2..6985eef06c39 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
310} 310}
311 311
312/** 312/**
313 * rgrp_go_dump - print out an rgrp
314 * @seq: The iterator
315 * @gl: The glock in question
316 *
317 */
318
319static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
320{
321 const struct gfs2_rgrpd *rgd = gl->gl_object;
322 if (rgd == NULL)
323 return 0;
324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
327 return 0;
328}
329
330/**
331 * trans_go_sync - promote/demote the transaction glock 313 * trans_go_sync - promote/demote the transaction glock
332 * @gl: the glock 314 * @gl: the glock
333 * @state: the requested state 315 * @state: the requested state
@@ -341,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
341 323
342 if (gl->gl_state != LM_ST_UNLOCKED && 324 if (gl->gl_state != LM_ST_UNLOCKED &&
343 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 325 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
326 flush_workqueue(gfs2_delete_workqueue);
344 gfs2_meta_syncfs(sdp); 327 gfs2_meta_syncfs(sdp);
345 gfs2_log_shutdown(sdp); 328 gfs2_log_shutdown(sdp);
346 } 329 }
@@ -390,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
390 return 0; 373 return 0;
391} 374}
392 375
376/**
377 * iopen_go_callback - schedule the dcache entry for the inode to be deleted
378 * @gl: the glock
379 *
 380 * The gl_spin lock is held while calling this function
381 */
382static void iopen_go_callback(struct gfs2_glock *gl)
383{
384 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
385
386 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
387 gl->gl_state == LM_ST_SHARED &&
388 ip && test_bit(GIF_USER, &ip->i_flags)) {
389 gfs2_glock_hold(gl);
390 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
391 gfs2_glock_put_nolock(gl);
392 }
393}
394
393const struct gfs2_glock_operations gfs2_meta_glops = { 395const struct gfs2_glock_operations gfs2_meta_glops = {
394 .go_type = LM_TYPE_META, 396 .go_type = LM_TYPE_META,
395}; 397};
@@ -410,7 +412,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
410 .go_demote_ok = rgrp_go_demote_ok, 412 .go_demote_ok = rgrp_go_demote_ok,
411 .go_lock = rgrp_go_lock, 413 .go_lock = rgrp_go_lock,
412 .go_unlock = rgrp_go_unlock, 414 .go_unlock = rgrp_go_unlock,
413 .go_dump = rgrp_go_dump, 415 .go_dump = gfs2_rgrp_dump,
414 .go_type = LM_TYPE_RGRP, 416 .go_type = LM_TYPE_RGRP,
415 .go_min_hold_time = HZ / 5, 417 .go_min_hold_time = HZ / 5,
416}; 418};
@@ -424,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
424 426
425const struct gfs2_glock_operations gfs2_iopen_glops = { 427const struct gfs2_glock_operations gfs2_iopen_glops = {
426 .go_type = LM_TYPE_IOPEN, 428 .go_type = LM_TYPE_IOPEN,
429 .go_callback = iopen_go_callback,
427}; 430};
428 431
429const struct gfs2_glock_operations gfs2_flock_glops = { 432const struct gfs2_glock_operations gfs2_flock_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 399d1b978049..61801ada36f0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,7 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
17 18
@@ -63,9 +64,12 @@ struct gfs2_log_element {
63 const struct gfs2_log_operations *le_ops; 64 const struct gfs2_log_operations *le_ops;
64}; 65};
65 66
67#define GBF_FULL 1
68
66struct gfs2_bitmap { 69struct gfs2_bitmap {
67 struct buffer_head *bi_bh; 70 struct buffer_head *bi_bh;
68 char *bi_clone; 71 char *bi_clone;
72 unsigned long bi_flags;
69 u32 bi_offset; 73 u32 bi_offset;
70 u32 bi_start; 74 u32 bi_start;
71 u32 bi_len; 75 u32 bi_len;
@@ -90,10 +94,11 @@ struct gfs2_rgrpd {
90 struct gfs2_sbd *rd_sbd; 94 struct gfs2_sbd *rd_sbd;
91 unsigned int rd_bh_count; 95 unsigned int rd_bh_count;
92 u32 rd_last_alloc; 96 u32 rd_last_alloc;
93 unsigned char rd_flags; 97 u32 rd_flags;
94#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 98#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
95#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 99#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
96#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ 100#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
101#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
97}; 102};
98 103
99enum gfs2_state_bits { 104enum gfs2_state_bits {
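
Widening rd_flags to u32 lets the low bits mirror the on-disk rg_flags while
the top nibble (GFS2_RDF_MASK) stays reserved for in-core state, so any
refresh from disk has to preserve the internal bits. A sketch of the merge
this layout implies (the helper name is hypothetical):

	static void rgrp_refresh_flags(struct gfs2_rgrpd *rgd, u32 ondisk)
	{
		/* keep the in-core bits, replace the on-disk ones */
		rgd->rd_flags = (rgd->rd_flags & GFS2_RDF_MASK) |
				(ondisk & ~GFS2_RDF_MASK);
	}
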
@@ -154,6 +159,7 @@ struct gfs2_glock_operations {
154 int (*go_lock) (struct gfs2_holder *gh); 159 int (*go_lock) (struct gfs2_holder *gh);
155 void (*go_unlock) (struct gfs2_holder *gh); 160 void (*go_unlock) (struct gfs2_holder *gh);
156 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 161 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
162 void (*go_callback) (struct gfs2_glock *gl);
157 const int go_type; 163 const int go_type;
158 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
159}; 165};
@@ -223,6 +229,7 @@ struct gfs2_glock {
223 struct list_head gl_ail_list; 229 struct list_head gl_ail_list;
224 atomic_t gl_ail_count; 230 atomic_t gl_ail_count;
225 struct delayed_work gl_work; 231 struct delayed_work gl_work;
232 struct work_struct gl_delete;
226}; 233};
227 234
228#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 235#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -376,11 +383,11 @@ struct gfs2_journal_extent {
376struct gfs2_jdesc { 383struct gfs2_jdesc {
377 struct list_head jd_list; 384 struct list_head jd_list;
378 struct list_head extent_list; 385 struct list_head extent_list;
379 386 struct slow_work jd_work;
380 struct inode *jd_inode; 387 struct inode *jd_inode;
388 unsigned long jd_flags;
389#define JDF_RECOVERY 1
381 unsigned int jd_jid; 390 unsigned int jd_jid;
382 int jd_dirty;
383
384 unsigned int jd_blocks; 391 unsigned int jd_blocks;
385}; 392};
386 393
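
Journal recovery moves from the dedicated recoverd thread to the slow-work
infrastructure: each journal descriptor embeds a struct slow_work initialised
against an ops table and enqueued on demand, with JDF_RECOVERY serialising
requests. A minimal sketch of the 2.6.31 slow-work API shape this depends on
(the example_* names are placeholders; the real ops table, gfs2_recover_ops,
lives in recovery.c and is not shown in this diff):

	static const struct slow_work_ops example_ops = {
		.get_ref = example_get_ref,	/* pin the containing object */
		.put_ref = example_put_ref,	/* unpin once the item is done */
		.execute = example_execute,	/* the actual, sleepable work */
	};

	/* setup:    slow_work_init(&jd->jd_work, &example_ops);
	   trigger:  slow_work_enqueue(&jd->jd_work);
	   lifetime: slow_work_register_user() / slow_work_unregister_user() */
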
@@ -390,9 +397,6 @@ struct gfs2_statfs_change_host {
390 s64 sc_dinodes; 397 s64 sc_dinodes;
391}; 398};
392 399
393#define GFS2_GLOCKD_DEFAULT 1
394#define GFS2_GLOCKD_MAX 16
395
396#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF 400#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
397#define GFS2_QUOTA_OFF 0 401#define GFS2_QUOTA_OFF 0
398#define GFS2_QUOTA_ACCOUNT 1 402#define GFS2_QUOTA_ACCOUNT 1
@@ -418,6 +422,7 @@ struct gfs2_args {
418 unsigned int ar_data:2; /* ordered/writeback */ 422 unsigned int ar_data:2; /* ordered/writeback */
419 unsigned int ar_meta:1; /* mount metafs */ 423 unsigned int ar_meta:1; /* mount metafs */
420 unsigned int ar_discard:1; /* discard requests */ 424 unsigned int ar_discard:1; /* discard requests */
425 int ar_commit; /* Commit interval */
421}; 426};
422 427
423struct gfs2_tune { 428struct gfs2_tune {
@@ -426,7 +431,6 @@ struct gfs2_tune {
426 unsigned int gt_incore_log_blocks; 431 unsigned int gt_incore_log_blocks;
427 unsigned int gt_log_flush_secs; 432 unsigned int gt_log_flush_secs;
428 433
429 unsigned int gt_recoverd_secs;
430 unsigned int gt_logd_secs; 434 unsigned int gt_logd_secs;
431 435
432 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 436 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -447,6 +451,7 @@ enum {
447 SDF_JOURNAL_LIVE = 1, 451 SDF_JOURNAL_LIVE = 1,
448 SDF_SHUTDOWN = 2, 452 SDF_SHUTDOWN = 2,
449 SDF_NOBARRIERS = 3, 453 SDF_NOBARRIERS = 3,
454 SDF_NORECOVERY = 4,
450}; 455};
451 456
452#define GFS2_FSNAME_LEN 256 457#define GFS2_FSNAME_LEN 256
@@ -493,7 +498,6 @@ struct lm_lockstruct {
493 unsigned long ls_flags; 498 unsigned long ls_flags;
494 dlm_lockspace_t *ls_dlm; 499 dlm_lockspace_t *ls_dlm;
495 500
496 int ls_recover_jid;
497 int ls_recover_jid_done; 501 int ls_recover_jid_done;
498 int ls_recover_jid_status; 502 int ls_recover_jid_status;
499}; 503};
@@ -582,7 +586,6 @@ struct gfs2_sbd {
582 586
583 /* Daemon stuff */ 587 /* Daemon stuff */
584 588
585 struct task_struct *sd_recoverd_process;
586 struct task_struct *sd_logd_process; 589 struct task_struct *sd_logd_process;
587 struct task_struct *sd_quotad_process; 590 struct task_struct *sd_quotad_process;
588 591
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5a31d426116f..2f94bd723698 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -30,7 +30,6 @@
30#include "inode.h" 30#include "inode.h"
31#include "log.h" 31#include "log.h"
32#include "meta_io.h" 32#include "meta_io.h"
33#include "ops_address.h"
34#include "quota.h" 33#include "quota.h"
35#include "rgrp.h" 34#include "rgrp.h"
36#include "trans.h" 35#include "trans.h"
@@ -1047,154 +1046,7 @@ fail:
1047 return ERR_PTR(error); 1046 return ERR_PTR(error);
1048} 1047}
1049 1048
1050/** 1049static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1051 * gfs2_rmdiri - Remove a directory
1052 * @dip: The parent directory of the directory to be removed
1053 * @name: The name of the directory to be removed
1054 * @ip: The GFS2 inode of the directory to be removed
1055 *
1056 * Assumes Glocks on dip and ip are held
1057 *
1058 * Returns: errno
1059 */
1060
1061int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1062 struct gfs2_inode *ip)
1063{
1064 struct qstr dotname;
1065 int error;
1066
1067 if (ip->i_entries != 2) {
1068 if (gfs2_consist_inode(ip))
1069 gfs2_dinode_print(ip);
1070 return -EIO;
1071 }
1072
1073 error = gfs2_dir_del(dip, name);
1074 if (error)
1075 return error;
1076
1077 error = gfs2_change_nlink(dip, -1);
1078 if (error)
1079 return error;
1080
1081 gfs2_str2qstr(&dotname, ".");
1082 error = gfs2_dir_del(ip, &dotname);
1083 if (error)
1084 return error;
1085
1086 gfs2_str2qstr(&dotname, "..");
1087 error = gfs2_dir_del(ip, &dotname);
1088 if (error)
1089 return error;
1090
1091 /* It looks odd, but it really should be done twice */
1092 error = gfs2_change_nlink(ip, -1);
1093 if (error)
1094 return error;
1095
1096 error = gfs2_change_nlink(ip, -1);
1097 if (error)
1098 return error;
1099
1100 return error;
1101}
1102
1103/*
1104 * gfs2_unlink_ok - check to see that an inode is still in a directory
1105 * @dip: the directory
1106 * @name: the name of the file
1107 * @ip: the inode
1108 *
1109 * Assumes that the lock on (at least) @dip is held.
1110 *
1111 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1112 */
1113
1114int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1115 const struct gfs2_inode *ip)
1116{
1117 int error;
1118
1119 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1120 return -EPERM;
1121
1122 if ((dip->i_inode.i_mode & S_ISVTX) &&
1123 dip->i_inode.i_uid != current_fsuid() &&
1124 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
1125 return -EPERM;
1126
1127 if (IS_APPEND(&dip->i_inode))
1128 return -EPERM;
1129
1130 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
1131 if (error)
1132 return error;
1133
1134 error = gfs2_dir_check(&dip->i_inode, name, ip);
1135 if (error)
1136 return error;
1137
1138 return 0;
1139}
1140
1141/**
1142 * gfs2_readlinki - return the contents of a symlink
1143 * @ip: the symlink's inode
1144 * @buf: a pointer to the buffer to be filled
1145 * @len: a pointer to the length of @buf
1146 *
1147 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1148 * to be freed by the caller.
1149 *
1150 * Returns: errno
1151 */
1152
1153int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1154{
1155 struct gfs2_holder i_gh;
1156 struct buffer_head *dibh;
1157 unsigned int x;
1158 int error;
1159
1160 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1161 error = gfs2_glock_nq(&i_gh);
1162 if (error) {
1163 gfs2_holder_uninit(&i_gh);
1164 return error;
1165 }
1166
1167 if (!ip->i_disksize) {
1168 gfs2_consist_inode(ip);
1169 error = -EIO;
1170 goto out;
1171 }
1172
1173 error = gfs2_meta_inode_buffer(ip, &dibh);
1174 if (error)
1175 goto out;
1176
1177 x = ip->i_disksize + 1;
1178 if (x > *len) {
1179 *buf = kmalloc(x, GFP_NOFS);
1180 if (!*buf) {
1181 error = -ENOMEM;
1182 goto out_brelse;
1183 }
1184 }
1185
1186 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1187 *len = x;
1188
1189out_brelse:
1190 brelse(dibh);
1191out:
1192 gfs2_glock_dq_uninit(&i_gh);
1193 return error;
1194}
1195
1196static int
1197__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1198{ 1050{
1199 struct buffer_head *dibh; 1051 struct buffer_head *dibh;
1200 int error; 1052 int error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c30be2b66580..c341aaf67adb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -11,8 +11,16 @@
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
14#include "util.h" 16#include "util.h"
15 17
18extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_set_aops(struct inode *inode);
23
16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
17{ 25{
18 return !ip->i_height; 26 return !ip->i_height;
@@ -73,30 +81,26 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
73} 81}
74 82
75 83
76void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
77struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
78 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino,
79 int skip_freeing); 87 int skip_freeing);
80struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
81 89
82int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
83 91
84int gfs2_dinode_dealloc(struct gfs2_inode *inode); 92extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
85int gfs2_change_nlink(struct gfs2_inode *ip, int diff); 93extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
86struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, 94extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
87 int is_root); 95 int is_root);
88struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, 96extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
89 unsigned int mode, dev_t dev); 97 const struct qstr *name,
90int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 98 unsigned int mode, dev_t dev);
91 struct gfs2_inode *ip); 99extern int gfs2_permission(struct inode *inode, int mask);
92int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 100extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
93 const struct gfs2_inode *ip); 101extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
94int gfs2_permission(struct inode *inode, int mask); 102extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 103extern void gfs2_dinode_print(const struct gfs2_inode *ip);
96int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
97struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
99void gfs2_dinode_print(const struct gfs2_inode *ip);
100 104
101extern const struct inode_operations gfs2_file_iops; 105extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops; 106extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a756410..13c6237c5f67 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -28,6 +28,7 @@
28#include "meta_io.h" 28#include "meta_io.h"
29#include "util.h" 29#include "util.h"
30#include "dir.h" 30#include "dir.h"
31#include "trace_gfs2.h"
31 32
32#define PULL 1 33#define PULL 1
33 34
@@ -120,7 +121,7 @@ __acquires(&sdp->sd_log_lock)
120 lock_buffer(bh); 121 lock_buffer(bh);
121 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
122 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
123 submit_bh(WRITE, bh); 124 submit_bh(WRITE_SYNC_PLUG, bh);
124 } else { 125 } else {
125 unlock_buffer(bh); 126 unlock_buffer(bh);
126 brelse(bh); 127 brelse(bh);
@@ -313,6 +314,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
313 gfs2_log_lock(sdp); 314 gfs2_log_lock(sdp);
314 } 315 }
315 atomic_sub(blks, &sdp->sd_log_blks_free); 316 atomic_sub(blks, &sdp->sd_log_blks_free);
317 trace_gfs2_log_blocks(sdp, -blks);
316 gfs2_log_unlock(sdp); 318 gfs2_log_unlock(sdp);
317 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 mutex_unlock(&sdp->sd_log_reserve_mutex);
318 320
@@ -333,6 +335,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
333 335
334 gfs2_log_lock(sdp); 336 gfs2_log_lock(sdp);
335 atomic_add(blks, &sdp->sd_log_blks_free); 337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
336 gfs2_assert_withdraw(sdp, 339 gfs2_assert_withdraw(sdp,
337 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
338 gfs2_log_unlock(sdp); 341 gfs2_log_unlock(sdp);
@@ -558,6 +561,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
558 561
559 gfs2_log_lock(sdp); 562 gfs2_log_lock(sdp);
560 atomic_add(dist, &sdp->sd_log_blks_free); 563 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist);
561 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
562 gfs2_log_unlock(sdp); 566 gfs2_log_unlock(sdp);
563 567
@@ -604,7 +608,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
604 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 608 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
605 goto skip_barrier; 609 goto skip_barrier;
606 get_bh(bh); 610 get_bh(bh);
607 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); 611 submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
608 wait_on_buffer(bh); 612 wait_on_buffer(bh);
609 if (buffer_eopnotsupp(bh)) { 613 if (buffer_eopnotsupp(bh)) {
610 clear_buffer_eopnotsupp(bh); 614 clear_buffer_eopnotsupp(bh);
@@ -664,7 +668,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
664 lock_buffer(bh); 668 lock_buffer(bh);
665 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 669 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
666 bh->b_end_io = end_buffer_write_sync; 670 bh->b_end_io = end_buffer_write_sync;
667 submit_bh(WRITE, bh); 671 submit_bh(WRITE_SYNC_PLUG, bh);
668 } else { 672 } else {
669 unlock_buffer(bh); 673 unlock_buffer(bh);
670 brelse(bh); 674 brelse(bh);
@@ -715,6 +719,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
715 up_write(&sdp->sd_log_flush_lock); 719 up_write(&sdp->sd_log_flush_lock);
716 return; 720 return;
717 } 721 }
722 trace_gfs2_log_flush(sdp, 1);
718 723
719 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); 724 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
720 INIT_LIST_HEAD(&ai->ai_ail1_list); 725 INIT_LIST_HEAD(&ai->ai_ail1_list);
@@ -746,6 +751,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
746 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 751 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
747 gfs2_log_lock(sdp); 752 gfs2_log_lock(sdp);
748 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 753 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
754 trace_gfs2_log_blocks(sdp, -1);
749 gfs2_log_unlock(sdp); 755 gfs2_log_unlock(sdp);
750 log_write_header(sdp, 0, PULL); 756 log_write_header(sdp, 0, PULL);
751 } 757 }
@@ -763,8 +769,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
763 ai = NULL; 769 ai = NULL;
764 } 770 }
765 gfs2_log_unlock(sdp); 771 gfs2_log_unlock(sdp);
766 772 trace_gfs2_log_flush(sdp, 0);
767 sdp->sd_vfs->s_dirt = 0;
768 up_write(&sdp->sd_log_flush_lock); 773 up_write(&sdp->sd_log_flush_lock);
769 774
770 kfree(ai); 775 kfree(ai);
@@ -788,6 +793,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
788 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 793 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
789 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 794 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
790 atomic_add(unused, &sdp->sd_log_blks_free); 795 atomic_add(unused, &sdp->sd_log_blks_free);
796 trace_gfs2_log_blocks(sdp, unused);
791 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 797 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
792 sdp->sd_jdesc->jd_blocks); 798 sdp->sd_jdesc->jd_blocks);
793 sdp->sd_log_blks_reserved = reserved; 799 sdp->sd_log_blks_reserved = reserved;
@@ -823,7 +829,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
823 log_refund(sdp, tr); 829 log_refund(sdp, tr);
824 buf_lo_incore_commit(sdp, tr); 830 buf_lo_incore_commit(sdp, tr);
825 831
826 sdp->sd_vfs->s_dirt = 1;
827 up_read(&sdp->sd_log_flush_lock); 832 up_read(&sdp->sd_log_flush_lock);
828 833
829 gfs2_log_lock(sdp); 834 gfs2_log_lock(sdp);
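
The trace_gfs2_log_blocks() calls threaded through this file are tracepoints
whose definitions live in the new trace_gfs2.h header, which is not part of
this hunk. As a hedged sketch of the TRACE_EVENT() shape such a definition
takes (the fields here are illustrative, not the actual GFS2 definition):

	TRACE_EVENT(gfs2_log_blocks,
		TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
		TP_ARGS(sdp, blocks),
		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(int,   blocks)
		),
		TP_fast_assign(
			__entry->dev	= sdp->sd_vfs->s_dev;
			__entry->blocks	= blocks;
		),
		TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
			  MINOR(__entry->dev), __entry->blocks)
	);
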
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 80e4f5f898bb..9969ff062c5b 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,6 +13,8 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/bio.h>
17#include <linux/fs.h>
16 18
17#include "gfs2.h" 19#include "gfs2.h"
18#include "incore.h" 20#include "incore.h"
@@ -25,6 +27,7 @@
25#include "rgrp.h" 27#include "rgrp.h"
26#include "trans.h" 28#include "trans.h"
27#include "util.h" 29#include "util.h"
30#include "trace_gfs2.h"
28 31
29/** 32/**
30 * gfs2_pin - Pin a buffer in memory 33 * gfs2_pin - Pin a buffer in memory
@@ -51,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 if (bd->bd_ail) 54 if (bd->bd_ail)
52 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
53 get_bh(bh); 56 get_bh(bh);
57 trace_gfs2_pin(bd, 1);
54} 58}
55 59
56/** 60/**
@@ -87,6 +91,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
87 bd->bd_ail = ai; 91 bd->bd_ail = ai;
88 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 92 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
89 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 93 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
94 trace_gfs2_pin(bd, 0);
90 gfs2_log_unlock(sdp); 95 gfs2_log_unlock(sdp);
91 unlock_buffer(bh); 96 unlock_buffer(bh);
92} 97}
@@ -189,7 +194,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
189 } 194 }
190 195
191 gfs2_log_unlock(sdp); 196 gfs2_log_unlock(sdp);
192 submit_bh(WRITE, bh); 197 submit_bh(WRITE_SYNC_PLUG, bh);
193 gfs2_log_lock(sdp); 198 gfs2_log_lock(sdp);
194 199
195 n = 0; 200 n = 0;
@@ -199,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
199 gfs2_log_unlock(sdp); 204 gfs2_log_unlock(sdp);
200 lock_buffer(bd2->bd_bh); 205 lock_buffer(bd2->bd_bh);
201 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 206 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
202 submit_bh(WRITE, bh); 207 submit_bh(WRITE_SYNC_PLUG, bh);
203 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
204 if (++n >= num) 209 if (++n >= num)
205 break; 210 break;
@@ -341,7 +346,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
341 sdp->sd_log_num_revoke--; 346 sdp->sd_log_num_revoke--;
342 347
343 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 348 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
344 submit_bh(WRITE, bh); 349 submit_bh(WRITE_SYNC_PLUG, bh);
345 350
346 bh = gfs2_log_get_buf(sdp); 351 bh = gfs2_log_get_buf(sdp);
347 mh = (struct gfs2_meta_header *)bh->b_data; 352 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -358,7 +363,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
358 } 363 }
359 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 364 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
360 365
361 submit_bh(WRITE, bh); 366 submit_bh(WRITE_SYNC_PLUG, bh);
362} 367}
363 368
364static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 369static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -560,7 +565,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
560 ptr = bh_log_ptr(bh); 565 ptr = bh_log_ptr(bh);
561 566
562 get_bh(bh); 567 get_bh(bh);
563 submit_bh(WRITE, bh); 568 submit_bh(WRITE_SYNC_PLUG, bh);
564 gfs2_log_lock(sdp); 569 gfs2_log_lock(sdp);
565 while(!list_empty(list)) { 570 while(!list_empty(list)) {
566 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 571 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -586,7 +591,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
586 } else { 591 } else {
587 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 592 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
588 } 593 }
589 submit_bh(WRITE, bh1); 594 submit_bh(WRITE_SYNC_PLUG, bh1);
590 gfs2_log_lock(sdp); 595 gfs2_log_lock(sdp);
591 ptr += 2; 596 ptr += 2;
592 } 597 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a6892ed0840a..eacd78a5d082 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
18 19
19#include "gfs2.h" 20#include "gfs2.h"
20#include "incore.h" 21#include "incore.h"
@@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void)
113 if (error) 114 if (error)
114 goto fail_unregister; 115 goto fail_unregister;
115 116
117 error = slow_work_register_user();
118 if (error)
119 goto fail_slow;
120
116 gfs2_register_debugfs(); 121 gfs2_register_debugfs();
117 122
118 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); 123 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
119 124
120 return 0; 125 return 0;
121 126
127fail_slow:
128 unregister_filesystem(&gfs2meta_fs_type);
122fail_unregister: 129fail_unregister:
123 unregister_filesystem(&gfs2_fs_type); 130 unregister_filesystem(&gfs2_fs_type);
124fail: 131fail:
@@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void)
156 gfs2_unregister_debugfs(); 163 gfs2_unregister_debugfs();
157 unregister_filesystem(&gfs2_fs_type); 164 unregister_filesystem(&gfs2_fs_type);
158 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
166 slow_work_unregister_user();
159 167
160 kmem_cache_destroy(gfs2_quotad_cachep); 168 kmem_cache_destroy(gfs2_quotad_cachep);
161 kmem_cache_destroy(gfs2_rgrpd_cachep); 169 kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8d6f13256b26..cb8d7a93d5ec 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,19 +31,66 @@
31#include "rgrp.h" 31#include "rgrp.h"
32#include "trans.h" 32#include "trans.h"
33#include "util.h" 33#include "util.h"
34#include "ops_address.h"
35 34
36static int aspace_get_block(struct inode *inode, sector_t lblock, 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
37 struct buffer_head *bh_result, int create)
38{ 36{
39 gfs2_assert_warn(inode->i_sb->s_fs_info, 0); 37 int err;
40 return -EOPNOTSUPP; 38 struct buffer_head *bh, *head;
41} 39 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
41 WRITE_SYNC_PLUG : WRITE));
42
43 BUG_ON(!PageLocked(page));
44 BUG_ON(!page_has_buffers(page));
45
46 head = page_buffers(page);
47 bh = head;
48
49 do {
50 if (!buffer_mapped(bh))
51 continue;
52 /*
53 * If it's a fully non-blocking write attempt and we cannot
54 * lock the buffer then redirty the page. Note that this can
55 * potentially cause a busy-wait loop from pdflush and kswapd
56 * activity, but those code paths have their own higher-level
57 * throttling.
58 */
59 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
60 lock_buffer(bh);
61 } else if (!trylock_buffer(bh)) {
62 redirty_page_for_writepage(wbc, page);
63 continue;
64 }
65 if (test_clear_buffer_dirty(bh)) {
66 mark_buffer_async_write(bh);
67 } else {
68 unlock_buffer(bh);
69 }
70 } while ((bh = bh->b_this_page) != head);
71
72 /*
73 * The page and its buffers are protected by PageWriteback(), so we can
74 * drop the bh refcounts early.
75 */
76 BUG_ON(PageWriteback(page));
77 set_page_writeback(page);
78
79 do {
80 struct buffer_head *next = bh->b_this_page;
81 if (buffer_async_write(bh)) {
82 submit_bh(write_op, bh);
83 nr_underway++;
84 }
85 bh = next;
86 } while (bh != head);
87 unlock_page(page);
42 88
43static int gfs2_aspace_writepage(struct page *page, 89 err = 0;
44 struct writeback_control *wbc) 90 if (nr_underway == 0)
45{ 91 end_page_writeback(page);
46 return block_write_full_page(page, aspace_get_block, wbc); 92
93 return err;
47} 94}
48 95
49static const struct address_space_operations aspace_aops = { 96static const struct address_space_operations aspace_aops = {
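
The open-coded writepage above replaces block_write_full_page() chiefly so
metadata writes can carry the BIO_RW_META hint and, under WB_SYNC_ALL
writeback, the synchronous plugged-write flag. The request-flag choice at the
top of the function condenses to the following (a restatement of the logic
shown, not new behaviour):

	static inline int meta_write_op(const struct writeback_control *wbc)
	{
		int op = 1 << BIO_RW_META;	/* tag the I/O as metadata */

		op |= (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC_PLUG : WRITE;
		return op;
	}
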
@@ -201,16 +248,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
201int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, 248int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
202 struct buffer_head **bhp) 249 struct buffer_head **bhp)
203{ 250{
204 *bhp = gfs2_getbuf(gl, blkno, CREATE); 251 struct gfs2_sbd *sdp = gl->gl_sbd;
205 if (!buffer_uptodate(*bhp)) { 252 struct buffer_head *bh;
206 ll_rw_block(READ_META, 1, bhp); 253
207 if (flags & DIO_WAIT) { 254 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
208 int error = gfs2_meta_wait(gl->gl_sbd, *bhp); 255 return -EIO;
209 if (error) { 256
210 brelse(*bhp); 257 *bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
211 return error; 258
212 } 259 lock_buffer(bh);
213 } 260 if (buffer_uptodate(bh)) {
261 unlock_buffer(bh);
262 return 0;
263 }
264 bh->b_end_io = end_buffer_read_sync;
265 get_bh(bh);
266 submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
267 if (!(flags & DIO_WAIT))
268 return 0;
269
270 wait_on_buffer(bh);
271 if (unlikely(!buffer_uptodate(bh))) {
272 struct gfs2_trans *tr = current->journal_info;
273 if (tr && tr->tr_touched)
274 gfs2_io_error_bh(sdp, bh);
275 brelse(bh);
276 return -EIO;
214 } 277 }
215 278
216 return 0; 279 return 0;
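
gfs2_meta_read() now drives the I/O itself rather than going through
ll_rw_block(), which lets it bail out early on a shut-down filesystem and
flag read errors that hit a dirty transaction. The core is the classic
synchronous buffer-read pattern; a minimal restatement:

	static int read_bh_sync(struct buffer_head *bh)
	{
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {	/* raced with another reader */
			unlock_buffer(bh);
			return 0;
		}
		bh->b_end_io = end_buffer_read_sync;	/* unlocks bh, drops our ref */
		get_bh(bh);
		submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
		wait_on_buffer(bh);
		return buffer_uptodate(bh) ? 0 : -EIO;
	}
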
@@ -404,7 +467,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
404 if (buffer_uptodate(first_bh)) 467 if (buffer_uptodate(first_bh))
405 goto out; 468 goto out;
406 if (!buffer_locked(first_bh)) 469 if (!buffer_locked(first_bh))
407 ll_rw_block(READ_META, 1, &first_bh); 470 ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
408 471
409 dblock++; 472 dblock++;
410 extlen--; 473 extlen--;
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
deleted file mode 100644
index f7e8527a21e0..000000000000
--- a/fs/gfs2/mount.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h>
15#include <linux/parser.h>
16
17#include "gfs2.h"
18#include "incore.h"
19#include "super.h"
20#include "sys.h"
21#include "util.h"
22
23enum {
24 Opt_lockproto,
25 Opt_locktable,
26 Opt_hostdata,
27 Opt_spectator,
28 Opt_ignore_local_fs,
29 Opt_localflocks,
30 Opt_localcaching,
31 Opt_debug,
32 Opt_nodebug,
33 Opt_upgrade,
34 Opt_acl,
35 Opt_noacl,
36 Opt_quota_off,
37 Opt_quota_account,
38 Opt_quota_on,
39 Opt_quota,
40 Opt_noquota,
41 Opt_suiddir,
42 Opt_nosuiddir,
43 Opt_data_writeback,
44 Opt_data_ordered,
45 Opt_meta,
46 Opt_discard,
47 Opt_nodiscard,
48 Opt_err,
49};
50
51static const match_table_t tokens = {
52 {Opt_lockproto, "lockproto=%s"},
53 {Opt_locktable, "locktable=%s"},
54 {Opt_hostdata, "hostdata=%s"},
55 {Opt_spectator, "spectator"},
56 {Opt_ignore_local_fs, "ignore_local_fs"},
57 {Opt_localflocks, "localflocks"},
58 {Opt_localcaching, "localcaching"},
59 {Opt_debug, "debug"},
60 {Opt_nodebug, "nodebug"},
61 {Opt_upgrade, "upgrade"},
62 {Opt_acl, "acl"},
63 {Opt_noacl, "noacl"},
64 {Opt_quota_off, "quota=off"},
65 {Opt_quota_account, "quota=account"},
66 {Opt_quota_on, "quota=on"},
67 {Opt_quota, "quota"},
68 {Opt_noquota, "noquota"},
69 {Opt_suiddir, "suiddir"},
70 {Opt_nosuiddir, "nosuiddir"},
71 {Opt_data_writeback, "data=writeback"},
72 {Opt_data_ordered, "data=ordered"},
73 {Opt_meta, "meta"},
74 {Opt_discard, "discard"},
75 {Opt_nodiscard, "nodiscard"},
76 {Opt_err, NULL}
77};
78
79/**
80 * gfs2_mount_args - Parse mount options
 81 * @sdp: the filesystem
 82 * @args: the args structure to fill in from the @options string
 83 *
 84 * Returns: errno
85 */
86
87int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
88{
89 char *o;
90 int token;
91 substring_t tmp[MAX_OPT_ARGS];
92
93 /* Split the options into tokens with the "," character and
94 process them */
95
96 while (1) {
97 o = strsep(&options, ",");
98 if (o == NULL)
99 break;
100 if (*o == '\0')
101 continue;
102
103 token = match_token(o, tokens, tmp);
104 switch (token) {
105 case Opt_lockproto:
106 match_strlcpy(args->ar_lockproto, &tmp[0],
107 GFS2_LOCKNAME_LEN);
108 break;
109 case Opt_locktable:
110 match_strlcpy(args->ar_locktable, &tmp[0],
111 GFS2_LOCKNAME_LEN);
112 break;
113 case Opt_hostdata:
114 match_strlcpy(args->ar_hostdata, &tmp[0],
115 GFS2_LOCKNAME_LEN);
116 break;
117 case Opt_spectator:
118 args->ar_spectator = 1;
119 break;
120 case Opt_ignore_local_fs:
121 args->ar_ignore_local_fs = 1;
122 break;
123 case Opt_localflocks:
124 args->ar_localflocks = 1;
125 break;
126 case Opt_localcaching:
127 args->ar_localcaching = 1;
128 break;
129 case Opt_debug:
130 args->ar_debug = 1;
131 break;
132 case Opt_nodebug:
133 args->ar_debug = 0;
134 break;
135 case Opt_upgrade:
136 args->ar_upgrade = 1;
137 break;
138 case Opt_acl:
139 args->ar_posix_acl = 1;
140 break;
141 case Opt_noacl:
142 args->ar_posix_acl = 0;
143 break;
144 case Opt_quota_off:
145 case Opt_noquota:
146 args->ar_quota = GFS2_QUOTA_OFF;
147 break;
148 case Opt_quota_account:
149 args->ar_quota = GFS2_QUOTA_ACCOUNT;
150 break;
151 case Opt_quota_on:
152 case Opt_quota:
153 args->ar_quota = GFS2_QUOTA_ON;
154 break;
155 case Opt_suiddir:
156 args->ar_suiddir = 1;
157 break;
158 case Opt_nosuiddir:
159 args->ar_suiddir = 0;
160 break;
161 case Opt_data_writeback:
162 args->ar_data = GFS2_DATA_WRITEBACK;
163 break;
164 case Opt_data_ordered:
165 args->ar_data = GFS2_DATA_ORDERED;
166 break;
167 case Opt_meta:
168 args->ar_meta = 1;
169 break;
170 case Opt_discard:
171 args->ar_discard = 1;
172 break;
173 case Opt_nodiscard:
174 args->ar_discard = 0;
175 break;
176 case Opt_err:
177 default:
178 fs_info(sdp, "invalid mount option: %s\n", o);
179 return -EINVAL;
180 }
181 }
182
183 return 0;
184}
185
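
With mount.c deleted, option parsing moves into super.c (outside this diff),
where the new ar_commit field implies a commit=<secs> mount option. A hedged
sketch of how an integer option is typically handled with match_int() in such
a parser; the Opt_commit token and surrounding switch are assumptions about
the relocated code, not lines from this patch:

	/* inside the match_token() switch, with "int rv;" declared above */
	case Opt_commit:
		rv = match_int(&tmp[0], &args->ar_commit);
		if (rv || args->ar_commit <= 0) {
			fs_info(sdp, "commit mount option requires "
				"a positive numeric argument\n");
			return rv ? rv : -EINVAL;
		}
		break;
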
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
deleted file mode 100644
index 5da21285bba4..000000000000
--- a/fs/gfs2/ops_address.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16
17extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
18extern int gfs2_internal_read(struct gfs2_inode *ip,
19 struct file_ra_state *ra_state,
20 char *buf, loff_t *pos, unsigned size);
21extern void gfs2_set_aops(struct inode *inode);
22
23#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea753..7bc3c45cd676 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,6 +17,7 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
20 21
21#include "gfs2.h" 22#include "gfs2.h"
22#include "incore.h" 23#include "incore.h"
@@ -32,6 +33,7 @@
32#include "log.h" 33#include "log.h"
33#include "quota.h" 34#include "quota.h"
34#include "dir.h" 35#include "dir.h"
36#include "trace_gfs2.h"
35 37
36#define DO 0 38#define DO 0
37#define UNDO 1 39#define UNDO 1
@@ -55,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
55 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
56 58
57 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
58 gt->gt_log_flush_secs = 60;
59 gt->gt_recoverd_secs = 60;
60 gt->gt_logd_secs = 1; 60 gt->gt_logd_secs = 1;
61 gt->gt_quota_simul_sync = 64; 61 gt->gt_quota_simul_sync = 64;
62 gt->gt_quota_warn_period = 10; 62 gt->gt_quota_warn_period = 10;
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
526 } 526 }
527 527
528 /* Set up the buffer cache and SB for real */ 528 /* Set up the buffer cache and SB for real */
529 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 529 if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
530 ret = -EINVAL; 530 ret = -EINVAL;
531 fs_err(sdp, "FS block size (%u) is too small for device " 531 fs_err(sdp, "FS block size (%u) is too small for device "
532 "block size (%u)\n", 532 "block size (%u)\n",
533 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 533 sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
534 goto out; 534 goto out;
535 } 535 }
536 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 536 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
@@ -676,6 +676,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
676 break; 676 break;
677 677
678 INIT_LIST_HEAD(&jd->extent_list); 678 INIT_LIST_HEAD(&jd->extent_list);
679 slow_work_init(&jd->jd_work, &gfs2_recover_ops);
679 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 680 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
680 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 681 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
681 if (!jd->jd_inode) 682 if (!jd->jd_inode)
@@ -701,14 +702,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
701{ 702{
702 struct inode *master = sdp->sd_master_dir->d_inode; 703 struct inode *master = sdp->sd_master_dir->d_inode;
703 struct gfs2_holder ji_gh; 704 struct gfs2_holder ji_gh;
704 struct task_struct *p;
705 struct gfs2_inode *ip; 705 struct gfs2_inode *ip;
706 int jindex = 1; 706 int jindex = 1;
707 int error = 0; 707 int error = 0;
708 708
709 if (undo) { 709 if (undo) {
710 jindex = 0; 710 jindex = 0;
711 goto fail_recoverd; 711 goto fail_jinode_gh;
712 } 712 }
713 713
714 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); 714 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
@@ -776,6 +776,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
776 /* Map the extents for this journal's blocks */ 776 /* Map the extents for this journal's blocks */
777 map_journal_extents(sdp); 777 map_journal_extents(sdp);
778 } 778 }
779 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
779 780
780 if (sdp->sd_lockstruct.ls_first) { 781 if (sdp->sd_lockstruct.ls_first) {
781 unsigned int x; 782 unsigned int x;
@@ -801,18 +802,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
801 gfs2_glock_dq_uninit(&ji_gh); 802 gfs2_glock_dq_uninit(&ji_gh);
802 jindex = 0; 803 jindex = 0;
803 804
804 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
805 error = IS_ERR(p);
806 if (error) {
807 fs_err(sdp, "can't start recoverd thread: %d\n", error);
808 goto fail_jinode_gh;
809 }
810 sdp->sd_recoverd_process = p;
811
812 return 0; 805 return 0;
813 806
814fail_recoverd:
815 kthread_stop(sdp->sd_recoverd_process);
816fail_jinode_gh: 807fail_jinode_gh:
817 if (!sdp->sd_args.ar_spectator) 808 if (!sdp->sd_args.ar_spectator)
818 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); 809 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
@@ -1165,6 +1156,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1165 1156
1166 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; 1157 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1167 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; 1158 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1159 sdp->sd_args.ar_commit = 60;
1168 1160
1169 error = gfs2_mount_args(sdp, &sdp->sd_args, data); 1161 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1170 if (error) { 1162 if (error) {
@@ -1172,8 +1164,10 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1172 goto fail; 1164 goto fail;
1173 } 1165 }
1174 1166
1175 if (sdp->sd_args.ar_spectator) 1167 if (sdp->sd_args.ar_spectator) {
1176 sb->s_flags |= MS_RDONLY; 1168 sb->s_flags |= MS_RDONLY;
1169 set_bit(SDF_NORECOVERY, &sdp->sd_flags);
1170 }
1177 if (sdp->sd_args.ar_posix_acl) 1171 if (sdp->sd_args.ar_posix_acl)
1178 sb->s_flags |= MS_POSIXACL; 1172 sb->s_flags |= MS_POSIXACL;
1179 1173
@@ -1191,6 +1185,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1191 GFS2_BASIC_BLOCK_SHIFT; 1185 GFS2_BASIC_BLOCK_SHIFT;
1192 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1186 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1193 1187
1188 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
1189
1194 error = init_names(sdp, silent); 1190 error = init_names(sdp, silent);
1195 if (error) 1191 if (error)
1196 goto fail; 1192 goto fail;
@@ -1279,9 +1275,22 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1279 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); 1275 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
1280} 1276}
1281 1277
1282static struct super_block *get_gfs2_sb(const char *dev_name) 1278static int test_meta_super(struct super_block *s, void *ptr)
1279{
1280 struct block_device *bdev = ptr;
1281 return (bdev == s->s_bdev);
1282}
1283
1284static int set_meta_super(struct super_block *s, void *ptr)
1283{ 1285{
1284 struct super_block *sb; 1286 return -EINVAL;
1287}
1288
1289static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1290 const char *dev_name, void *data, struct vfsmount *mnt)
1291{
1292 struct super_block *s;
1293 struct gfs2_sbd *sdp;
1285 struct path path; 1294 struct path path;
1286 int error; 1295 int error;
1287 1296
@@ -1289,30 +1298,17 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
1289 if (error) { 1298 if (error) {
1290 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1299 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1291 dev_name, error); 1300 dev_name, error);
1292 return NULL; 1301 return error;
1293 } 1302 }
1294 sb = path.dentry->d_inode->i_sb; 1303 s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
1295 if (sb && (sb->s_type == &gfs2_fs_type)) 1304 path.dentry->d_inode->i_sb->s_bdev);
1296 atomic_inc(&sb->s_active);
1297 else
1298 sb = NULL;
1299 path_put(&path); 1305 path_put(&path);
1300 return sb; 1306 if (IS_ERR(s)) {
1301}
1302
1303static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1304 const char *dev_name, void *data, struct vfsmount *mnt)
1305{
1306 struct super_block *sb = NULL;
1307 struct gfs2_sbd *sdp;
1308
1309 sb = get_gfs2_sb(dev_name);
1310 if (!sb) {
1311 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1307 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1312 return -ENOENT; 1308 return PTR_ERR(s);
1313 } 1309 }
1314 sdp = sb->s_fs_info; 1310 sdp = s->s_fs_info;
1315 mnt->mnt_sb = sb; 1311 mnt->mnt_sb = s;
1316 mnt->mnt_root = dget(sdp->sd_master_dir); 1312 mnt->mnt_root = dget(sdp->sd_master_dir);
1317 return 0; 1313 return 0;
1318} 1314}
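
The meta mount no longer pokes s_active by hand; it goes through sget(),
whose test callback recognises an existing superblock (here: same backing
block device) and whose set callback would initialise a brand-new one.
Because set_meta_super() returns -EINVAL, sget() can only ever hand back an
already-mounted gfs2 superblock, never create one. The contract, roughly:

	/* sget(type, test, set, data):
	 *  - walks type->fs_supers calling test(sb, data); a match is
	 *    returned with s_active elevated,
	 *  - otherwise a new sb is allocated and set(sb, data) is called;
	 *    a negative return from set aborts with that error.
	 */
	struct super_block *s = sget(&gfs2_fs_type, test_meta_super,
				     set_meta_super, bdev);
	if (IS_ERR(s))
		return PTR_ERR(s);
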
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1c70fa5168d6..f8bd20baf99c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -262,6 +262,44 @@ out_parent:
262 return error; 262 return error;
263} 263}
264 264
265/*
266 * gfs2_unlink_ok - check to see that an inode is still in a directory
267 * @dip: the directory
268 * @name: the name of the file
269 * @ip: the inode
270 *
271 * Assumes that the lock on (at least) @dip is held.
272 *
273 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
274 */
275
276static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
277 const struct gfs2_inode *ip)
278{
279 int error;
280
281 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
282 return -EPERM;
283
284 if ((dip->i_inode.i_mode & S_ISVTX) &&
285 dip->i_inode.i_uid != current_fsuid() &&
286 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
287 return -EPERM;
288
289 if (IS_APPEND(&dip->i_inode))
290 return -EPERM;
291
292 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
293 if (error)
294 return error;
295
296 error = gfs2_dir_check(&dip->i_inode, name, ip);
297 if (error)
298 return error;
299
300 return 0;
301}
302
265/** 303/**
266 * gfs2_unlink - Unlink a file 304 * gfs2_unlink - Unlink a file
267 * @dir: The inode of the directory containing the file to unlink 305 * @dir: The inode of the directory containing the file to unlink
@@ -473,6 +511,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
473} 511}
474 512
475/** 513/**
514 * gfs2_rmdiri - Remove a directory
515 * @dip: The parent directory of the directory to be removed
516 * @name: The name of the directory to be removed
517 * @ip: The GFS2 inode of the directory to be removed
518 *
519 * Assumes Glocks on dip and ip are held
520 *
521 * Returns: errno
522 */
523
524static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
525 struct gfs2_inode *ip)
526{
527 struct qstr dotname;
528 int error;
529
530 if (ip->i_entries != 2) {
531 if (gfs2_consist_inode(ip))
532 gfs2_dinode_print(ip);
533 return -EIO;
534 }
535
536 error = gfs2_dir_del(dip, name);
537 if (error)
538 return error;
539
540 error = gfs2_change_nlink(dip, -1);
541 if (error)
542 return error;
543
544 gfs2_str2qstr(&dotname, ".");
545 error = gfs2_dir_del(ip, &dotname);
546 if (error)
547 return error;
548
549 gfs2_str2qstr(&dotname, "..");
550 error = gfs2_dir_del(ip, &dotname);
551 if (error)
552 return error;
553
554 /* It looks odd, but it really should be done twice */
555 error = gfs2_change_nlink(ip, -1);
556 if (error)
557 return error;
558
559 error = gfs2_change_nlink(ip, -1);
560 if (error)
561 return error;
562
563 return error;
564}
565
566/**
476 * gfs2_rmdir - Remove a directory 567 * gfs2_rmdir - Remove a directory
477 * @dir: The parent directory of the directory to be removed 568 * @dir: The parent directory of the directory to be removed
478 * @dentry: The dentry of the directory to remove 569 * @dentry: The dentry of the directory to remove
@@ -885,6 +976,61 @@ out:
885} 976}
886 977
887/** 978/**
979 * gfs2_readlinki - return the contents of a symlink
980 * @ip: the symlink's inode
981 * @buf: a pointer to the buffer to be filled
982 * @len: a pointer to the length of @buf
983 *
984 * If @buf is too small, a piece of memory is kmalloc()ed and needs
985 * to be freed by the caller.
986 *
987 * Returns: errno
988 */
989
990static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
991{
992 struct gfs2_holder i_gh;
993 struct buffer_head *dibh;
994 unsigned int x;
995 int error;
996
997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
998 error = gfs2_glock_nq(&i_gh);
999 if (error) {
1000 gfs2_holder_uninit(&i_gh);
1001 return error;
1002 }
1003
1004 if (!ip->i_disksize) {
1005 gfs2_consist_inode(ip);
1006 error = -EIO;
1007 goto out;
1008 }
1009
1010 error = gfs2_meta_inode_buffer(ip, &dibh);
1011 if (error)
1012 goto out;
1013
1014 x = ip->i_disksize + 1;
1015 if (x > *len) {
1016 *buf = kmalloc(x, GFP_NOFS);
1017 if (!*buf) {
1018 error = -ENOMEM;
1019 goto out_brelse;
1020 }
1021 }
1022
1023 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1024 *len = x;
1025
1026out_brelse:
1027 brelse(dibh);
1028out:
1029 gfs2_glock_dq_uninit(&i_gh);
1030 return error;
1031}
1032
1033/**
888 * gfs2_readlink - Read the value of a symlink 1034 * gfs2_readlink - Read the value of a symlink
889 * @dentry: the symlink 1035 * @dentry: the symlink
890 * @buf: the buffer to read the symlink data into 1036 * @buf: the buffer to read the symlink data into
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
deleted file mode 100644
index 458019569dcb..000000000000
--- a/fs/gfs2/ops_super.c
+++ /dev/null
@@ -1,723 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/seq_file.h>
17#include <linux/mount.h>
18#include <linux/kthread.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/time.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "inode.h"
28#include "log.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "sys.h"
34#include "util.h"
35#include "trans.h"
36#include "dir.h"
37#include "eattr.h"
38#include "bmap.h"
39#include "meta_io.h"
40
41#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
42
43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode
46 * @sync: synchronous write flag
47 *
48 * Returns: errno
49 */
50
51static int gfs2_write_inode(struct inode *inode, int sync)
52{
53 struct gfs2_inode *ip = GFS2_I(inode);
54 struct gfs2_sbd *sdp = GFS2_SB(inode);
55 struct gfs2_holder gh;
56 struct buffer_head *bh;
57 struct timespec atime;
58 struct gfs2_dinode *di;
59 int ret = 0;
60
61 /* Check this is a "normal" inode, etc */
62 if (!test_bit(GIF_USER, &ip->i_flags) ||
63 (current->flags & PF_MEMALLOC))
64 return 0;
65 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
66 if (ret)
67 goto do_flush;
68 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
69 if (ret)
70 goto do_unlock;
71 ret = gfs2_meta_inode_buffer(ip, &bh);
72 if (ret == 0) {
73 di = (struct gfs2_dinode *)bh->b_data;
74 atime.tv_sec = be64_to_cpu(di->di_atime);
75 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
76 if (timespec_compare(&inode->i_atime, &atime) > 0) {
77 gfs2_trans_add_bh(ip->i_gl, bh, 1);
78 gfs2_dinode_out(ip, bh->b_data);
79 }
80 brelse(bh);
81 }
82 gfs2_trans_end(sdp);
83do_unlock:
84 gfs2_glock_dq_uninit(&gh);
85do_flush:
86 if (sync != 0)
87 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
88 return ret;
89}
90
91/**
92 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
93 * @sdp: the filesystem
94 *
95 * Returns: errno
96 */
97
98static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
99{
100 struct gfs2_holder t_gh;
101 int error;
102
103 gfs2_quota_sync(sdp);
104 gfs2_statfs_sync(sdp);
105
106 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
107 &t_gh);
108 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
109 return error;
110
111 gfs2_meta_syncfs(sdp);
112 gfs2_log_shutdown(sdp);
113
114 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
115
116 if (t_gh.gh_gl)
117 gfs2_glock_dq_uninit(&t_gh);
118
119 gfs2_quota_cleanup(sdp);
120
121 return error;
122}
123
124/**
125 * gfs2_put_super - Unmount the filesystem
126 * @sb: The VFS superblock
127 *
128 */
129
130static void gfs2_put_super(struct super_block *sb)
131{
132 struct gfs2_sbd *sdp = sb->s_fs_info;
133 int error;
134
135 /* Unfreeze the filesystem, if we need to */
136
137 mutex_lock(&sdp->sd_freeze_lock);
138 if (sdp->sd_freeze_count)
139 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
140 mutex_unlock(&sdp->sd_freeze_lock);
141
142 kthread_stop(sdp->sd_quotad_process);
143 kthread_stop(sdp->sd_logd_process);
144 kthread_stop(sdp->sd_recoverd_process);
145
146 if (!(sb->s_flags & MS_RDONLY)) {
147 error = gfs2_make_fs_ro(sdp);
148 if (error)
149 gfs2_io_error(sdp);
150 }
151 /* At this point, we're through modifying the disk */
152
153 /* Release stuff */
154
155 iput(sdp->sd_jindex);
156 iput(sdp->sd_inum_inode);
157 iput(sdp->sd_statfs_inode);
158 iput(sdp->sd_rindex);
159 iput(sdp->sd_quota_inode);
160
161 gfs2_glock_put(sdp->sd_rename_gl);
162 gfs2_glock_put(sdp->sd_trans_gl);
163
164 if (!sdp->sd_args.ar_spectator) {
165 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
166 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
167 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
168 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
169 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
170 iput(sdp->sd_ir_inode);
171 iput(sdp->sd_sc_inode);
172 iput(sdp->sd_qc_inode);
173 }
174
175 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
176 gfs2_clear_rgrpd(sdp);
177 gfs2_jindex_free(sdp);
178 /* Take apart glock structures and buffer lists */
179 gfs2_gl_hash_clear(sdp);
180 /* Unmount the locking protocol */
181 gfs2_lm_unmount(sdp);
182
183 /* At this point, we're through participating in the lockspace */
184 gfs2_sys_fs_del(sdp);
185}
186
187/**
188 * gfs2_write_super
189 * @sb: the superblock
190 *
191 */
192
193static void gfs2_write_super(struct super_block *sb)
194{
195 sb->s_dirt = 0;
196}
197
198/**
199 * gfs2_sync_fs - sync the filesystem
200 * @sb: the superblock
201 *
202 * Flushes the log to disk.
203 */
204
205static int gfs2_sync_fs(struct super_block *sb, int wait)
206{
207 sb->s_dirt = 0;
208 if (wait && sb->s_fs_info)
209 gfs2_log_flush(sb->s_fs_info, NULL);
210 return 0;
211}
212
213/**
214 * gfs2_freeze - prevent further writes to the filesystem
215 * @sb: the VFS structure for the filesystem
216 *
217 */
218
219static int gfs2_freeze(struct super_block *sb)
220{
221 struct gfs2_sbd *sdp = sb->s_fs_info;
222 int error;
223
224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
225 return -EINVAL;
226
227 for (;;) {
228 error = gfs2_freeze_fs(sdp);
229 if (!error)
230 break;
231
232 switch (error) {
233 case -EBUSY:
234 fs_err(sdp, "waiting for recovery before freeze\n");
235 break;
236
237 default:
238 fs_err(sdp, "error freezing FS: %d\n", error);
239 break;
240 }
241
242 fs_err(sdp, "retrying...\n");
243 msleep(1000);
244 }
245 return 0;
246}
247
248/**
249 * gfs2_unfreeze - reallow writes to the filesystem
250 * @sb: the VFS structure for the filesystem
251 *
252 */
253
254static int gfs2_unfreeze(struct super_block *sb)
255{
256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
258}
259
260/**
261 * statfs_slow_fill - fill in the sc for a given RG
262 * @rgd: the RG
263 * @sc: the sc structure
264 *
265 * Returns: 0 on success, -ESTALE if the LVB is invalid
266 */
267
268static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
269 struct gfs2_statfs_change_host *sc)
270{
271 gfs2_rgrp_verify(rgd);
272 sc->sc_total += rgd->rd_data;
273 sc->sc_free += rgd->rd_free;
274 sc->sc_dinodes += rgd->rd_dinodes;
275 return 0;
276}
277
278/**
279 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
280 * @sdp: the filesystem
281 * @sc: the sc info that will be returned
282 *
283 * Any error (other than a signal) will cause this routine to fall back
284 * to the synchronous version.
285 *
286 * FIXME: This really shouldn't busy wait like this.
287 *
288 * Returns: errno
289 */
290
291static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
292{
293 struct gfs2_holder ri_gh;
294 struct gfs2_rgrpd *rgd_next;
295 struct gfs2_holder *gha, *gh;
296 unsigned int slots = 64;
297 unsigned int x;
298 int done;
299 int error = 0, err;
300
301 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
302 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
303 if (!gha)
304 return -ENOMEM;
305
306 error = gfs2_rindex_hold(sdp, &ri_gh);
307 if (error)
308 goto out;
309
310 rgd_next = gfs2_rgrpd_get_first(sdp);
311
312 for (;;) {
313 done = 1;
314
315 for (x = 0; x < slots; x++) {
316 gh = gha + x;
317
318 if (gh->gh_gl && gfs2_glock_poll(gh)) {
319 err = gfs2_glock_wait(gh);
320 if (err) {
321 gfs2_holder_uninit(gh);
322 error = err;
323 } else {
324 if (!error)
325 error = statfs_slow_fill(
326 gh->gh_gl->gl_object, sc);
327 gfs2_glock_dq_uninit(gh);
328 }
329 }
330
331 if (gh->gh_gl)
332 done = 0;
333 else if (rgd_next && !error) {
334 error = gfs2_glock_nq_init(rgd_next->rd_gl,
335 LM_ST_SHARED,
336 GL_ASYNC,
337 gh);
338 rgd_next = gfs2_rgrpd_get_next(rgd_next);
339 done = 0;
340 }
341
342 if (signal_pending(current))
343 error = -ERESTARTSYS;
344 }
345
346 if (done)
347 break;
348
349 yield();
350 }
351
352 gfs2_glock_dq_uninit(&ri_gh);
353
354out:
355 kfree(gha);
356 return error;
357}
358
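gfs2_statfs_slow() above keeps up to 64 GL_ASYNC lock requests in flight at once: poll each slot, harvest finished grants, refill from the remaining rgrps, and stop when every slot is idle. A standalone model of that control flow, with fake countdown latencies standing in for lock grants:

#include <stdio.h>

#define SLOTS 4

int main(void)
{
        int pending[SLOTS] = { 0 };     /* 0 = idle, >0 = ticks left */
        int next_rgrp = 0, total = 10, done;

        do {
                int x;
                done = 1;
                for (x = 0; x < SLOTS; x++) {
                        if (pending[x] && --pending[x] == 0)
                                printf("harvested rgrp stats\n");
                        if (pending[x])
                                done = 0;
                        else if (next_rgrp < total) {
                                pending[x] = 1 + next_rgrp % 3;
                                next_rgrp++;
                                done = 0;
                        }
                }
        } while (!done);
        return 0;
}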
359/**
360 * gfs2_statfs_i - Do a statfs
361 * @sdp: the filesystem
362 * @sc: the sc structure to fill in
363 *
364 * Returns: errno
365 */
366
367static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
368{
369 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
370 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
371
372 spin_lock(&sdp->sd_statfs_spin);
373
374 *sc = *m_sc;
375 sc->sc_total += l_sc->sc_total;
376 sc->sc_free += l_sc->sc_free;
377 sc->sc_dinodes += l_sc->sc_dinodes;
378
379 spin_unlock(&sdp->sd_statfs_spin);
380
381 if (sc->sc_free < 0)
382 sc->sc_free = 0;
383 if (sc->sc_free > sc->sc_total)
384 sc->sc_free = sc->sc_total;
385 if (sc->sc_dinodes < 0)
386 sc->sc_dinodes = 0;
387
388 return 0;
389}
390
391/**
392 * gfs2_statfs - Gather and return stats about the filesystem
393 * @sb: The superblock
394 * @statfsbuf: The buffer
395 *
396 * Returns: 0 on success or error code
397 */
398
399static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
400{
401 struct super_block *sb = dentry->d_inode->i_sb;
402 struct gfs2_sbd *sdp = sb->s_fs_info;
403 struct gfs2_statfs_change_host sc;
404 int error;
405
406 if (gfs2_tune_get(sdp, gt_statfs_slow))
407 error = gfs2_statfs_slow(sdp, &sc);
408 else
409 error = gfs2_statfs_i(sdp, &sc);
410
411 if (error)
412 return error;
413
414 buf->f_type = GFS2_MAGIC;
415 buf->f_bsize = sdp->sd_sb.sb_bsize;
416 buf->f_blocks = sc.sc_total;
417 buf->f_bfree = sc.sc_free;
418 buf->f_bavail = sc.sc_free;
419 buf->f_files = sc.sc_dinodes + sc.sc_free;
420 buf->f_ffree = sc.sc_free;
421 buf->f_namelen = GFS2_FNAMESIZE;
422
423 return 0;
424}
425
426/**
427 * gfs2_remount_fs - called when the FS is remounted
428 * @sb: the filesystem
429 * @flags: the remount flags
430 * @data: extra data passed in (not used right now)
431 *
432 * Returns: errno
433 */
434
435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
436{
437 struct gfs2_sbd *sdp = sb->s_fs_info;
438 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
439 int error;
440
441 error = gfs2_mount_args(sdp, &args, data);
442 if (error)
443 return error;
444
445 /* Not allowed to change locking details */
446 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
447 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
448 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
449 return -EINVAL;
450
451 /* Some flags must not be changed */
452 if (args_neq(&args, &sdp->sd_args, spectator) ||
453 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
454 args_neq(&args, &sdp->sd_args, localflocks) ||
455 args_neq(&args, &sdp->sd_args, localcaching) ||
456 args_neq(&args, &sdp->sd_args, meta))
457 return -EINVAL;
458
459 if (sdp->sd_args.ar_spectator)
460 *flags |= MS_RDONLY;
461
462 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
463 if (*flags & MS_RDONLY)
464 error = gfs2_make_fs_ro(sdp);
465 else
466 error = gfs2_make_fs_rw(sdp);
467 if (error)
468 return error;
469 }
470
471 sdp->sd_args = args;
472 if (sdp->sd_args.ar_posix_acl)
473 sb->s_flags |= MS_POSIXACL;
474 else
475 sb->s_flags &= ~MS_POSIXACL;
476 return 0;
477}
478
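The args_neq() checks above rely on token pasting to build each member name, so every "must not change on remount" test stays a one-liner. A standalone model with a two-field stand-in for struct gfs2_args:

#include <stdio.h>

struct args {
        int ar_spectator;
        int ar_meta;
};

#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)

int main(void)
{
        struct args cur = { 0, 1 }, req = { 1, 1 };

        if (args_neq(&cur, &req, spectator))
                printf("spectator may not change across a remount\n");
        if (!args_neq(&cur, &req, meta))
                printf("meta is unchanged, so that check passes\n");
        return 0;
}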
479/**
480 * gfs2_drop_inode - Drop an inode (test for remote unlink)
481 * @inode: The inode to drop
482 *
483 * If we've received a callback on an iopen lock then it's because a
484 * remote node tried to deallocate the inode but failed due to this node
485 * still having the inode open. Here we mark the link count zero
486 * since we know that it must have reached zero if the GLF_DEMOTE flag
487 * is set on the iopen glock. If we didn't do a disk read since the
488 * remote node removed the final link then we might otherwise miss
489 * this event. This check ensures that this node will deallocate the
490 * inode's blocks, or alternatively pass the baton on to another
491 * node for later deallocation.
492 */
493
494static void gfs2_drop_inode(struct inode *inode)
495{
496 struct gfs2_inode *ip = GFS2_I(inode);
497
498 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
499 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
500 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
501 clear_nlink(inode);
502 }
503 generic_drop_inode(inode);
504}
505
506/**
507 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
508 * @inode: The VFS inode
509 *
510 */
511
512static void gfs2_clear_inode(struct inode *inode)
513{
514 struct gfs2_inode *ip = GFS2_I(inode);
515
516	/* This tells us it's a "real" inode and not one which only
517 * serves to contain an address space (see rgrp.c, meta_io.c)
518 * which therefore doesn't have its own glocks.
519 */
520 if (test_bit(GIF_USER, &ip->i_flags)) {
521 ip->i_gl->gl_object = NULL;
522 gfs2_glock_put(ip->i_gl);
523 ip->i_gl = NULL;
524 if (ip->i_iopen_gh.gh_gl) {
525 ip->i_iopen_gh.gh_gl->gl_object = NULL;
526 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
527 }
528 }
529}
530
531static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
532{
533 do {
534 if (d1 == d2)
535 return 1;
536 d1 = d1->d_parent;
537 } while (!IS_ROOT(d1));
538 return 0;
539}
540
541/**
542 * gfs2_show_options - Show mount options for /proc/mounts
543 * @s: seq_file structure
544 * @mnt: vfsmount
545 *
546 * Returns: 0 on success or error code
547 */
548
549static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
550{
551 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
552 struct gfs2_args *args = &sdp->sd_args;
553
554 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
555 seq_printf(s, ",meta");
556 if (args->ar_lockproto[0])
557 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
558 if (args->ar_locktable[0])
559 seq_printf(s, ",locktable=%s", args->ar_locktable);
560 if (args->ar_hostdata[0])
561 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
562 if (args->ar_spectator)
563 seq_printf(s, ",spectator");
564 if (args->ar_ignore_local_fs)
565 seq_printf(s, ",ignore_local_fs");
566 if (args->ar_localflocks)
567 seq_printf(s, ",localflocks");
568 if (args->ar_localcaching)
569 seq_printf(s, ",localcaching");
570 if (args->ar_debug)
571 seq_printf(s, ",debug");
572 if (args->ar_upgrade)
573 seq_printf(s, ",upgrade");
574 if (args->ar_posix_acl)
575 seq_printf(s, ",acl");
576 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
577 char *state;
578 switch (args->ar_quota) {
579 case GFS2_QUOTA_OFF:
580 state = "off";
581 break;
582 case GFS2_QUOTA_ACCOUNT:
583 state = "account";
584 break;
585 case GFS2_QUOTA_ON:
586 state = "on";
587 break;
588 default:
589 state = "unknown";
590 break;
591 }
592 seq_printf(s, ",quota=%s", state);
593 }
594 if (args->ar_suiddir)
595 seq_printf(s, ",suiddir");
596 if (args->ar_data != GFS2_DATA_DEFAULT) {
597 char *state;
598 switch (args->ar_data) {
599 case GFS2_DATA_WRITEBACK:
600 state = "writeback";
601 break;
602 case GFS2_DATA_ORDERED:
603 state = "ordered";
604 break;
605 default:
606 state = "unknown";
607 break;
608 }
609 seq_printf(s, ",data=%s", state);
610 }
611 if (args->ar_discard)
612 seq_printf(s, ",discard");
613
614 return 0;
615}
616
617/*
618 * We have to (at the moment) hold the inode's main lock to cover
619 * the gap between unlocking the shared lock on the iopen lock and
620 * taking the exclusive lock. I'd rather do a shared -> exclusive
621 * conversion on the iopen lock, but we can change that later. This
622 * is safe, just less efficient.
623 */
624
625static void gfs2_delete_inode(struct inode *inode)
626{
627 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
628 struct gfs2_inode *ip = GFS2_I(inode);
629 struct gfs2_holder gh;
630 int error;
631
632 if (!test_bit(GIF_USER, &ip->i_flags))
633 goto out;
634
635 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
636 if (unlikely(error)) {
637 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
638 goto out;
639 }
640
641 gfs2_glock_dq_wait(&ip->i_iopen_gh);
642 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
643 error = gfs2_glock_nq(&ip->i_iopen_gh);
644 if (error)
645 goto out_truncate;
646
647 if (S_ISDIR(inode->i_mode) &&
648 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
649 error = gfs2_dir_exhash_dealloc(ip);
650 if (error)
651 goto out_unlock;
652 }
653
654 if (ip->i_eattr) {
655 error = gfs2_ea_dealloc(ip);
656 if (error)
657 goto out_unlock;
658 }
659
660 if (!gfs2_is_stuffed(ip)) {
661 error = gfs2_file_dealloc(ip);
662 if (error)
663 goto out_unlock;
664 }
665
666 error = gfs2_dinode_dealloc(ip);
667 if (error)
668 goto out_unlock;
669
670out_truncate:
671 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
672 if (error)
673 goto out_unlock;
674 /* Needs to be done before glock release & also in a transaction */
675 truncate_inode_pages(&inode->i_data, 0);
676 gfs2_trans_end(sdp);
677
678out_unlock:
679 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
680 gfs2_glock_dq(&ip->i_iopen_gh);
681 gfs2_holder_uninit(&ip->i_iopen_gh);
682 gfs2_glock_dq_uninit(&gh);
683 if (error && error != GLR_TRYFAILED)
684 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
685out:
686 truncate_inode_pages(&inode->i_data, 0);
687 clear_inode(inode);
688}
689
690static struct inode *gfs2_alloc_inode(struct super_block *sb)
691{
692 struct gfs2_inode *ip;
693
694 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
695 if (ip) {
696 ip->i_flags = 0;
697 ip->i_gl = NULL;
698 }
699 return &ip->i_inode;
700}
701
702static void gfs2_destroy_inode(struct inode *inode)
703{
704 kmem_cache_free(gfs2_inode_cachep, inode);
705}
706
707const struct super_operations gfs2_super_ops = {
708 .alloc_inode = gfs2_alloc_inode,
709 .destroy_inode = gfs2_destroy_inode,
710 .write_inode = gfs2_write_inode,
711 .delete_inode = gfs2_delete_inode,
712 .put_super = gfs2_put_super,
713 .write_super = gfs2_write_super,
714 .sync_fs = gfs2_sync_fs,
715 .freeze_fs = gfs2_freeze,
716 .unfreeze_fs = gfs2_unfreeze,
717 .statfs = gfs2_statfs,
718 .remount_fs = gfs2_remount_fs,
719 .clear_inode = gfs2_clear_inode,
720 .drop_inode = gfs2_drop_inode,
721 .show_options = gfs2_show_options,
722};
723
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 152e6c4a0dca..2e9b9326bfc9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -60,7 +60,6 @@
60#include "super.h" 60#include "super.h"
61#include "trans.h" 61#include "trans.h"
62#include "inode.h" 62#include "inode.h"
63#include "ops_address.h"
64#include "util.h" 63#include "util.h"
65 64
66#define QUOTA_USER 1 65#define QUOTA_USER 1
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 247e8f7d6b3d..59d2695509d3 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,8 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/kthread.h> 16#include <linux/slow-work.h>
17#include <linux/freezer.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
441 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 440 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
442} 441}
443 442
444/** 443static int gfs2_recover_get_ref(struct slow_work *work)
445 * gfs2_recover_journal - recover a given journal 444{
446 * @jd: the struct gfs2_jdesc describing the journal 445 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
447 * 446 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
448 * Acquire the journal's lock, check to see if the journal is clean, and 447 return -EBUSY;
449 * do recovery if necessary. 448 return 0;
450 * 449}
451 * Returns: errno
452 */
453 450
454int gfs2_recover_journal(struct gfs2_jdesc *jd) 451static void gfs2_recover_put_ref(struct slow_work *work)
452{
453 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
454 clear_bit(JDF_RECOVERY, &jd->jd_flags);
455 smp_mb__after_clear_bit();
456 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
457}
458
459static void gfs2_recover_work(struct slow_work *work)
455{ 460{
461 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
456 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 462 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
457 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 463 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
458 struct gfs2_log_header_host head; 464 struct gfs2_log_header_host head;
@@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
569 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
570 576
571 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 577 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
572 return 0; 578 return;
573 579
574fail_gunlock_tr: 580fail_gunlock_tr:
575 gfs2_glock_dq_uninit(&t_gh); 581 gfs2_glock_dq_uninit(&t_gh);
@@ -584,70 +590,28 @@ fail_gunlock_j:
584 590
585fail: 591fail:
586 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 592 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
587 return error;
588} 593}
589 594
590static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) 595struct slow_work_ops gfs2_recover_ops = {
591{ 596 .get_ref = gfs2_recover_get_ref,
592 struct gfs2_jdesc *jd; 597 .put_ref = gfs2_recover_put_ref,
593 int found = 0; 598 .execute = gfs2_recover_work,
594 599};
595 spin_lock(&sdp->sd_jindex_spin);
596 600
597 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
598 if (jd->jd_dirty) {
599 jd->jd_dirty = 0;
600 found = 1;
601 break;
602 }
603 }
604 spin_unlock(&sdp->sd_jindex_spin);
605
606 if (!found)
607 jd = NULL;
608 601
609 return jd; 602static int gfs2_recovery_wait(void *word)
610}
611
612/**
613 * gfs2_check_journals - Recover any dirty journals
614 * @sdp: the filesystem
615 *
616 */
617
618static void gfs2_check_journals(struct gfs2_sbd *sdp)
619{ 603{
620 struct gfs2_jdesc *jd; 604 schedule();
621 605 return 0;
622 for (;;) {
623 jd = gfs2_jdesc_find_dirty(sdp);
624 if (!jd)
625 break;
626
627 if (jd != sdp->sd_jdesc)
628 gfs2_recover_journal(jd);
629 }
630} 606}
631 607
632/** 608int gfs2_recover_journal(struct gfs2_jdesc *jd)
633 * gfs2_recoverd - Recover dead machine's journals
634 * @sdp: Pointer to GFS2 superblock
635 *
636 */
637
638int gfs2_recoverd(void *data)
639{ 609{
640 struct gfs2_sbd *sdp = data; 610 int rv;
641 unsigned long t; 611 rv = slow_work_enqueue(&jd->jd_work);
642 612 if (rv)
643 while (!kthread_should_stop()) { 613 return rv;
644 gfs2_check_journals(sdp); 614 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
645 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
646 if (freezing(current))
647 refrigerator();
648 schedule_timeout_interruptible(t);
649 }
650
651 return 0; 615 return 0;
652} 616}
653 617
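The hunk above retires the recoverd kthread: recovery becomes a slow-work item whose get_ref doubles as an "already queued" guard and whose put_ref wakes anyone sleeping in gfs2_recover_journal(). A sketch of the same pattern for a generic item, assuming the 2.6.30-era slow-work API; the my_* names are illustrative:

#include <linux/slow-work.h>
#include <linux/sched.h>
#include <linux/wait.h>

#define MYF_BUSY 0

struct my_item {
        unsigned long flags;            /* bit MYF_BUSY: work in flight */
        struct slow_work work;          /* set up with slow_work_init() */
};

static int my_get_ref(struct slow_work *work)
{
        struct my_item *it = container_of(work, struct my_item, work);
        /* refuse a second enqueue while the first is still running */
        return test_and_set_bit(MYF_BUSY, &it->flags) ? -EBUSY : 0;
}

static void my_put_ref(struct slow_work *work)
{
        struct my_item *it = container_of(work, struct my_item, work);
        clear_bit(MYF_BUSY, &it->flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&it->flags, MYF_BUSY);
}

static void my_execute(struct slow_work *work)
{
        /* the long-running part runs here, in process context */
}

static struct slow_work_ops my_ops = {
        .get_ref = my_get_ref,
        .put_ref = my_put_ref,
        .execute = my_execute,
};

static int my_wait(void *word)
{
        schedule();
        return 0;
}

/* enqueue and wait, as gfs2_recover_journal() now does; assumes
 * slow_work_init(&it->work, &my_ops) already ran */
static int my_run(struct my_item *it)
{
        int rv = slow_work_enqueue(&it->work);
        if (rv)
                return rv;
        wait_on_bit(&it->flags, MYF_BUSY, my_wait, TASK_UNINTERRUPTIBLE);
        return 0;
}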
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index a8218ea15b57..1616ac22569a 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
28extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31extern int gfs2_recoverd(void *data); 31extern struct slow_work_ops gfs2_recover_ops;
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2..fba795798d3a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -29,7 +29,7 @@
29#include "util.h" 29#include "util.h"
30#include "log.h" 30#include "log.h"
31#include "inode.h" 31#include "inode.h"
32#include "ops_address.h" 32#include "trace_gfs2.h"
33 33
34#define BFITNOENT ((u32)~0) 34#define BFITNOENT ((u32)~0)
35#define NO_BLOCK ((u64)~0) 35#define NO_BLOCK ((u64)~0)
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
285 } 285 }
286 286
287 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; 287 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
288 if (count[1] + count[2] != tmp) { 288 if (count[1] != tmp) {
289 if (gfs2_consist_rgrpd(rgd)) 289 if (gfs2_consist_rgrpd(rgd))
290 fs_err(sdp, "used data mismatch: %u != %u\n", 290 fs_err(sdp, "used data mismatch: %u != %u\n",
291 count[1], tmp); 291 count[1], tmp);
292 return; 292 return;
293 } 293 }
294 294
295 if (count[3] != rgd->rd_dinodes) { 295 if (count[2] + count[3] != rgd->rd_dinodes) {
296 if (gfs2_consist_rgrpd(rgd)) 296 if (gfs2_consist_rgrpd(rgd))
297 fs_err(sdp, "used metadata mismatch: %u != %u\n", 297 fs_err(sdp, "used metadata mismatch: %u != %u\n",
298 count[3], rgd->rd_dinodes); 298 count[2] + count[3], rgd->rd_dinodes);
299 return; 299 return;
300 } 300 }
301
302 if (count[2] > count[3]) {
303 if (gfs2_consist_rgrpd(rgd))
304 fs_err(sdp, "unlinked inodes > inodes: %u\n",
305 count[2]);
306 return;
307 }
308
309} 301}
310 302
311static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) 303static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
@@ -442,6 +434,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
442 for (x = 0; x < length; x++) { 434 for (x = 0; x < length; x++) {
443 bi = rgd->rd_bits + x; 435 bi = rgd->rd_bits + x;
444 436
437 bi->bi_flags = 0;
445 /* small rgrp; bitmap stored completely in header block */ 438 /* small rgrp; bitmap stored completely in header block */
446 if (length == 1) { 439 if (length == 1) {
447 bytes = bytes_left; 440 bytes = bytes_left;
@@ -580,7 +573,6 @@ static int read_rindex_entry(struct gfs2_inode *ip,
580 573
581 rgd->rd_gl->gl_object = rgd; 574 rgd->rd_gl->gl_object = rgd;
582 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 575 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
583 rgd->rd_flags |= GFS2_RDF_CHECK;
584 return error; 576 return error;
585} 577}
586 578
@@ -701,10 +693,9 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
701 u32 rg_flags; 693 u32 rg_flags;
702 694
703 rg_flags = be32_to_cpu(str->rg_flags); 695 rg_flags = be32_to_cpu(str->rg_flags);
704 if (rg_flags & GFS2_RGF_NOALLOC) 696 rg_flags &= ~GFS2_RDF_MASK;
705 rgd->rd_flags |= GFS2_RDF_NOALLOC; 697 rgd->rd_flags &= GFS2_RDF_MASK;
706 else 698 rgd->rd_flags |= rg_flags;
707 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
708 rgd->rd_free = be32_to_cpu(str->rg_free); 699 rgd->rd_free = be32_to_cpu(str->rg_free);
709 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); 700 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
710 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); 701 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -713,11 +704,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
713static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 704static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
714{ 705{
715 struct gfs2_rgrp *str = buf; 706 struct gfs2_rgrp *str = buf;
716 u32 rg_flags = 0;
717 707
718 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 708 str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
719 rg_flags |= GFS2_RGF_NOALLOC;
720 str->rg_flags = cpu_to_be32(rg_flags);
721 str->rg_free = cpu_to_be32(rgd->rd_free); 709 str->rg_free = cpu_to_be32(rgd->rd_free);
722 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); 710 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
723 str->__pad = cpu_to_be32(0); 711 str->__pad = cpu_to_be32(0);
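The rewritten gfs2_rgrp_in()/gfs2_rgrp_out() pair above splits rd_flags behind one mask: in-core-only bits survive a re-read and never reach the media on write-out. A standalone model of the round trip; the HOST_ONLY value is made up for the demo and plays the role of GFS2_RDF_MASK:

#include <stdio.h>

#define HOST_ONLY 0xf0000000u   /* made-up: bits that never hit disk */

int main(void)
{
        unsigned int rd_flags = 0x10000002;     /* one host bit, one disk bit */
        unsigned int rg_flags = 0x00000004;     /* freshly read from disk */

        /* "in": distrust host bits from disk, keep our own, merge */
        rg_flags &= ~HOST_ONLY;
        rd_flags &= HOST_ONLY;
        rd_flags |= rg_flags;
        printf("in-core: 0x%08x\n", rd_flags);  /* 0x10000004 */

        /* "out": strip host bits before writing back */
        printf("on-disk: 0x%08x\n", rd_flags & ~HOST_ONLY);
        return 0;
}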
@@ -775,8 +763,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
775 } 763 }
776 764
777 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { 765 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
766 for (x = 0; x < length; x++)
767 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
778 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 768 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
779 rgd->rd_flags |= GFS2_RDF_UPTODATE; 769 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
780 } 770 }
781 771
782 spin_lock(&sdp->sd_rindex_spin); 772 spin_lock(&sdp->sd_rindex_spin);
@@ -845,7 +835,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
845 struct super_block *sb = sdp->sd_vfs; 835 struct super_block *sb = sdp->sd_vfs;
846 struct block_device *bdev = sb->s_bdev; 836 struct block_device *bdev = sb->s_bdev;
847 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / 837 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
848 bdev_hardsect_size(sb->s_bdev); 838 bdev_logical_block_size(sb->s_bdev);
849 u64 blk; 839 u64 blk;
850 sector_t start = 0; 840 sector_t start = 0;
851 sector_t nr_sects = 0; 841 sector_t nr_sects = 0;
@@ -903,6 +893,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
903 continue; 893 continue;
904 if (sdp->sd_args.ar_discard) 894 if (sdp->sd_args.ar_discard)
905 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); 895 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
896 clear_bit(GBF_FULL, &bi->bi_flags);
906 memcpy(bi->bi_clone + bi->bi_offset, 897 memcpy(bi->bi_clone + bi->bi_offset,
907 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 898 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
908 } 899 }
@@ -942,7 +933,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
942 struct gfs2_sbd *sdp = rgd->rd_sbd; 933 struct gfs2_sbd *sdp = rgd->rd_sbd;
943 int ret = 0; 934 int ret = 0;
944 935
945 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 936 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
946 return 0; 937 return 0;
947 938
948 spin_lock(&sdp->sd_rindex_spin); 939 spin_lock(&sdp->sd_rindex_spin);
@@ -962,7 +953,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
962 * Returns: The inode, if one has been found 953 * Returns: The inode, if one has been found
963 */ 954 */
964 955
965static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) 956static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
957 u64 skip)
966{ 958{
967 struct inode *inode; 959 struct inode *inode;
968 u32 goal = 0, block; 960 u32 goal = 0, block;
@@ -986,6 +978,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
986 goal++; 978 goal++;
987 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) 979 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
988 continue; 980 continue;
981 if (no_addr == skip)
982 continue;
989 *last_unlinked = no_addr; 983 *last_unlinked = no_addr;
990 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 984 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
991 no_addr, -1, 1); 985 no_addr, -1, 1);
@@ -1105,7 +1099,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1105 if (try_rgrp_fit(rgd, al)) 1099 if (try_rgrp_fit(rgd, al))
1106 goto out; 1100 goto out;
1107 if (rgd->rd_flags & GFS2_RDF_CHECK) 1101 if (rgd->rd_flags & GFS2_RDF_CHECK)
1108 inode = try_rgrp_unlink(rgd, last_unlinked); 1102 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1109 if (!rg_locked) 1103 if (!rg_locked)
1110 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1104 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1111 if (inode) 1105 if (inode)
@@ -1139,7 +1133,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1139 if (try_rgrp_fit(rgd, al)) 1133 if (try_rgrp_fit(rgd, al))
1140 goto out; 1134 goto out;
1141 if (rgd->rd_flags & GFS2_RDF_CHECK) 1135 if (rgd->rd_flags & GFS2_RDF_CHECK)
1142 inode = try_rgrp_unlink(rgd, last_unlinked); 1136 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1143 if (!rg_locked) 1137 if (!rg_locked)
1144 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1138 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1145 if (inode) 1139 if (inode)
@@ -1315,30 +1309,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1315{ 1309{
1316 struct gfs2_bitmap *bi = NULL; 1310 struct gfs2_bitmap *bi = NULL;
1317 const u32 length = rgd->rd_length; 1311 const u32 length = rgd->rd_length;
1318 u32 blk = 0; 1312 u32 blk = BFITNOENT;
1319 unsigned int buf, x; 1313 unsigned int buf, x;
1320 const unsigned int elen = *n; 1314 const unsigned int elen = *n;
1321 const u8 *buffer; 1315 const u8 *buffer = NULL;
1322 1316
1323 *n = 0; 1317 *n = 0;
1324 /* Find bitmap block that contains bits for goal block */ 1318 /* Find bitmap block that contains bits for goal block */
1325 for (buf = 0; buf < length; buf++) { 1319 for (buf = 0; buf < length; buf++) {
1326 bi = rgd->rd_bits + buf; 1320 bi = rgd->rd_bits + buf;
1327 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) 1321 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1328 break; 1322 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
1323 goal -= bi->bi_start * GFS2_NBBY;
1324 goto do_search;
1325 }
1329 } 1326 }
1327 buf = 0;
1328 goal = 0;
1330 1329
1331 gfs2_assert(rgd->rd_sbd, buf < length); 1330do_search:
1332
1333 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1334 goal -= bi->bi_start * GFS2_NBBY;
1335
1336 /* Search (up to entire) bitmap in this rgrp for allocatable block. 1331 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1337 "x <= length", instead of "x < length", because we typically start 1332 "x <= length", instead of "x < length", because we typically start
1338 the search in the middle of a bit block, but if we can't find an 1333 the search in the middle of a bit block, but if we can't find an
1339 allocatable block anywhere else, we want to be able wrap around and 1334 allocatable block anywhere else, we want to be able wrap around and
1340 search in the first part of our first-searched bit block. */ 1335 search in the first part of our first-searched bit block. */
1341 for (x = 0; x <= length; x++) { 1336 for (x = 0; x <= length; x++) {
1337 bi = rgd->rd_bits + buf;
1338
1339 if (test_bit(GBF_FULL, &bi->bi_flags) &&
1340 (old_state == GFS2_BLKST_FREE))
1341 goto skip;
1342
1342 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1343 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1343 bitmaps, so we must search the originals for that. */ 1344 bitmaps, so we must search the originals for that. */
1344 buffer = bi->bi_bh->b_data + bi->bi_offset; 1345 buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1349,33 +1350,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1349 if (blk != BFITNOENT) 1350 if (blk != BFITNOENT)
1350 break; 1351 break;
1351 1352
1353 if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
1354 set_bit(GBF_FULL, &bi->bi_flags);
1355
1352 /* Try next bitmap block (wrap back to rgrp header if at end) */ 1356 /* Try next bitmap block (wrap back to rgrp header if at end) */
1353 buf = (buf + 1) % length; 1357skip:
1354 bi = rgd->rd_bits + buf; 1358 buf++;
1359 buf %= length;
1355 goal = 0; 1360 goal = 0;
1356 } 1361 }
1357 1362
1358 if (blk != BFITNOENT && old_state != new_state) { 1363 if (blk == BFITNOENT)
1359 *n = 1; 1364 return blk;
1360 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1365 *n = 1;
1366 if (old_state == new_state)
1367 goto out;
1368
1369 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1370 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1371 bi->bi_len, blk, new_state);
1372 goal = blk;
1373 while (*n < elen) {
1374 goal++;
1375 if (goal >= (bi->bi_len * GFS2_NBBY))
1376 break;
1377 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1378 GFS2_BLKST_FREE)
1379 break;
1361 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, 1380 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1362 bi->bi_len, blk, new_state); 1381 bi->bi_len, goal, new_state);
1363 goal = blk; 1382 (*n)++;
1364 while (*n < elen) {
1365 goal++;
1366 if (goal >= (bi->bi_len * GFS2_NBBY))
1367 break;
1368 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1369 GFS2_BLKST_FREE)
1370 break;
1371 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
1372 bi->bi_offset, bi->bi_len, goal,
1373 new_state);
1374 (*n)++;
1375 }
1376 } 1383 }
1377 1384out:
1378 return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; 1385 return (bi->bi_start * GFS2_NBBY) + blk;
1379} 1386}
1380 1387
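Two behaviours drive the restructured rgblk_search() loop above: bitmap blocks already known to hold no free blocks are skipped via GBF_FULL, and when the goal's block is exhausted the scan wraps to block 0. The circular-skip shape in isolation, with booleans standing in for the full bits:

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 4

static int find_slot(const bool full[NBLOCKS], int buf)
{
        int x;
        /* x <= NBLOCKS mirrors the kernel's "x <= length" wrap pass */
        for (x = 0; x <= NBLOCKS; x++) {
                if (!full[buf])
                        return buf;     /* candidate bitmap block */
                buf = (buf + 1) % NBLOCKS;
        }
        return -1;                      /* every block marked full */
}

int main(void)
{
        bool full[NBLOCKS] = { true, true, false, true };
        printf("search from 3 -> block %d\n", find_slot(full, 3));
        return 0;
}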
1381/** 1388/**
@@ -1435,13 +1442,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1435} 1442}
1436 1443
1437/** 1444/**
1438 * gfs2_alloc_block - Allocate a block 1445 * gfs2_rgrp_dump - print out an rgrp
1446 * @seq: The iterator
1447 * @gl: The glock in question
1448 *
1449 */
1450
1451int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1452{
1453 const struct gfs2_rgrpd *rgd = gl->gl_object;
1454 if (rgd == NULL)
1455 return 0;
1456 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
1457 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
1458 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
1459 return 0;
1460}
1461
1462/**
1463 * gfs2_alloc_block - Allocate one or more blocks
1439 * @ip: the inode to allocate the block for 1464 * @ip: the inode to allocate the block for
1465 * @bn: Used to return the starting block number
1466 * @n: requested number of blocks/extent length (value/result)
1440 * 1467 *
1441 * Returns: the allocated block 1468 * Returns: 0 or error
1442 */ 1469 */
1443 1470
1444u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) 1471int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1445{ 1472{
1446 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1473 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1447 struct buffer_head *dibh; 1474 struct buffer_head *dibh;
@@ -1457,7 +1484,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1457 goal = rgd->rd_last_alloc; 1484 goal = rgd->rd_last_alloc;
1458 1485
1459 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); 1486 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
1460 BUG_ON(blk == BFITNOENT); 1487
1488 /* Since all blocks are reserved in advance, this shouldn't happen */
1489 if (blk == BFITNOENT)
1490 goto rgrp_error;
1461 1491
1462 rgd->rd_last_alloc = blk; 1492 rgd->rd_last_alloc = blk;
1463 block = rgd->rd_data0 + blk; 1493 block = rgd->rd_data0 + blk;
@@ -1469,7 +1499,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1469 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); 1499 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
1470 brelse(dibh); 1500 brelse(dibh);
1471 } 1501 }
1472 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); 1502 if (rgd->rd_free < *n)
1503 goto rgrp_error;
1504
1473 rgd->rd_free -= *n; 1505 rgd->rd_free -= *n;
1474 1506
1475 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1507 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1483,8 +1515,17 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1483 spin_lock(&sdp->sd_rindex_spin); 1515 spin_lock(&sdp->sd_rindex_spin);
1484 rgd->rd_free_clone -= *n; 1516 rgd->rd_free_clone -= *n;
1485 spin_unlock(&sdp->sd_rindex_spin); 1517 spin_unlock(&sdp->sd_rindex_spin);
1518 trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
1519 *bn = block;
1520 return 0;
1486 1521
1487 return block; 1522rgrp_error:
1523 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
1524 (unsigned long long)rgd->rd_addr);
1525 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1526 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1527 rgd->rd_flags |= GFS2_RDF_ERROR;
1528 return -EIO;
1488} 1529}
1489 1530
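Alongside the rgrp_error path, the hunk above changes gfs2_alloc_block()'s calling convention: the block number now travels through an out parameter and the return value is 0 or a negative errno, so a failure can no longer be mistaken for a valid 64-bit block address. A userspace model of the new shape; the numbers are made up:

#include <errno.h>
#include <stdio.h>

static int alloc_block(unsigned long long *bn, unsigned int *n)
{
        if (*n == 0)
                return -EINVAL;
        if (*n > 4)
                *n = 4;         /* value/result: extent trimmed to fit */
        *bn = 12345;            /* made-up starting block */
        return 0;
}

int main(void)
{
        unsigned long long bn;
        unsigned int n = 8;

        if (alloc_block(&bn, &n) == 0)
                printf("allocated %u blocks at %llu\n", n, bn);
        return 0;
}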
1490/** 1531/**
@@ -1526,7 +1567,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1526 spin_lock(&sdp->sd_rindex_spin); 1567 spin_lock(&sdp->sd_rindex_spin);
1527 rgd->rd_free_clone--; 1568 rgd->rd_free_clone--;
1528 spin_unlock(&sdp->sd_rindex_spin); 1569 spin_unlock(&sdp->sd_rindex_spin);
1529 1570 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1530 return block; 1571 return block;
1531} 1572}
1532 1573
@@ -1546,7 +1587,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1546 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); 1587 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1547 if (!rgd) 1588 if (!rgd)
1548 return; 1589 return;
1549 1590 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
1550 rgd->rd_free += blen; 1591 rgd->rd_free += blen;
1551 1592
1552 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1593 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1574,7 +1615,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1574 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); 1615 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1575 if (!rgd) 1616 if (!rgd)
1576 return; 1617 return;
1577 1618 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
1578 rgd->rd_free += blen; 1619 rgd->rd_free += blen;
1579 1620
1580 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1621 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1597,6 +1638,7 @@ void gfs2_unlink_di(struct inode *inode)
1597 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); 1638 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1598 if (!rgd) 1639 if (!rgd)
1599 return; 1640 return;
1641 trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
1600 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1642 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1601 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1643 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1602 gfs2_trans_add_rg(rgd); 1644 gfs2_trans_add_rg(rgd);
@@ -1628,6 +1670,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1628void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) 1670void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1629{ 1671{
1630 gfs2_free_uninit_di(rgd, ip->i_no_addr); 1672 gfs2_free_uninit_di(rgd, ip->i_no_addr);
1673 trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE);
1631 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); 1674 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
1632 gfs2_meta_wipe(ip, ip->i_no_addr, 1); 1675 gfs2_meta_wipe(ip, ip->i_no_addr, 1);
1633} 1676}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3181c7e624bf..1e76ff0f3e00 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,22 +14,22 @@ struct gfs2_rgrpd;
14struct gfs2_sbd; 14struct gfs2_sbd;
15struct gfs2_holder; 15struct gfs2_holder;
16 16
17void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); 17extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
18 18
19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); 19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); 20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); 21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
22 22
23void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); 23extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
24int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); 24extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
25 25
26int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); 26extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
27void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); 27extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
28void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); 28extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
29 29
30void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); 30extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
31 31
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 32extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip) 33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{ 34{
35 BUG_ON(ip->i_alloc == NULL); 35 BUG_ON(ip->i_alloc == NULL);
@@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
37 ip->i_alloc = NULL; 37 ip->i_alloc = NULL;
38} 38}
39 39
40int gfs2_inplace_reserve_i(struct gfs2_inode *ip, 40extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
41 char *file, unsigned int line); 41 unsigned int line);
42#define gfs2_inplace_reserve(ip) \ 42#define gfs2_inplace_reserve(ip) \
43gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 43gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
44 44
45void gfs2_inplace_release(struct gfs2_inode *ip); 45extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); 47extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48 48
49u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); 49extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
50u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 50extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
51 51
52void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 52extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
53void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 53extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
54void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 54extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
55void gfs2_unlink_di(struct inode *inode); 55extern void gfs2_unlink_di(struct inode *inode);
56 56
57struct gfs2_rgrp_list { 57struct gfs2_rgrp_list {
58 unsigned int rl_rgrps; 58 unsigned int rl_rgrps;
@@ -61,10 +61,11 @@ struct gfs2_rgrp_list {
61 struct gfs2_holder *rl_ghs; 61 struct gfs2_holder *rl_ghs;
62}; 62};
63 63
64void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, 64extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
65 u64 block); 65 u64 block);
66void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 66extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 67extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68u64 gfs2_ri_total(struct gfs2_sbd *sdp); 68extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
69extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
69 70
70#endif /* __RGRP_DOT_H__ */ 71#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 601913e0a482..f522bb017973 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,14 +7,20 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/bio.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/completion.h> 14#include <linux/completion.h>
14#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
15#include <linux/crc32.h> 16#include <linux/statfs.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 22#include <linux/crc32.h>
23#include <linux/time.h>
18 24
19#include "gfs2.h" 25#include "gfs2.h"
20#include "incore.h" 26#include "incore.h"
@@ -31,6 +37,183 @@
31#include "super.h" 37#include "super.h"
32#include "trans.h" 38#include "trans.h"
33#include "util.h" 39#include "util.h"
40#include "sys.h"
41#include "eattr.h"
42
43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
44
45enum {
46 Opt_lockproto,
47 Opt_locktable,
48 Opt_hostdata,
49 Opt_spectator,
50 Opt_ignore_local_fs,
51 Opt_localflocks,
52 Opt_localcaching,
53 Opt_debug,
54 Opt_nodebug,
55 Opt_upgrade,
56 Opt_acl,
57 Opt_noacl,
58 Opt_quota_off,
59 Opt_quota_account,
60 Opt_quota_on,
61 Opt_quota,
62 Opt_noquota,
63 Opt_suiddir,
64 Opt_nosuiddir,
65 Opt_data_writeback,
66 Opt_data_ordered,
67 Opt_meta,
68 Opt_discard,
69 Opt_nodiscard,
70 Opt_commit,
71 Opt_error,
72};
73
74static const match_table_t tokens = {
75 {Opt_lockproto, "lockproto=%s"},
76 {Opt_locktable, "locktable=%s"},
77 {Opt_hostdata, "hostdata=%s"},
78 {Opt_spectator, "spectator"},
79 {Opt_ignore_local_fs, "ignore_local_fs"},
80 {Opt_localflocks, "localflocks"},
81 {Opt_localcaching, "localcaching"},
82 {Opt_debug, "debug"},
83 {Opt_nodebug, "nodebug"},
84 {Opt_upgrade, "upgrade"},
85 {Opt_acl, "acl"},
86 {Opt_noacl, "noacl"},
87 {Opt_quota_off, "quota=off"},
88 {Opt_quota_account, "quota=account"},
89 {Opt_quota_on, "quota=on"},
90 {Opt_quota, "quota"},
91 {Opt_noquota, "noquota"},
92 {Opt_suiddir, "suiddir"},
93 {Opt_nosuiddir, "nosuiddir"},
94 {Opt_data_writeback, "data=writeback"},
95 {Opt_data_ordered, "data=ordered"},
96 {Opt_meta, "meta"},
97 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"},
100 {Opt_error, NULL}
101};
102
103/**
104 * gfs2_mount_args - Parse mount options
105 * @sdp: the filesystem
106 * @options: the mount options string to parse into @args
107 *
108 * Returns: errno
109 */
110
111int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
112{
113 char *o;
114 int token;
115 substring_t tmp[MAX_OPT_ARGS];
116 int rv;
117
118 /* Split the options into tokens with the "," character and
119 process them */
120
121 while (1) {
122 o = strsep(&options, ",");
123 if (o == NULL)
124 break;
125 if (*o == '\0')
126 continue;
127
128 token = match_token(o, tokens, tmp);
129 switch (token) {
130 case Opt_lockproto:
131 match_strlcpy(args->ar_lockproto, &tmp[0],
132 GFS2_LOCKNAME_LEN);
133 break;
134 case Opt_locktable:
135 match_strlcpy(args->ar_locktable, &tmp[0],
136 GFS2_LOCKNAME_LEN);
137 break;
138 case Opt_hostdata:
139 match_strlcpy(args->ar_hostdata, &tmp[0],
140 GFS2_LOCKNAME_LEN);
141 break;
142 case Opt_spectator:
143 args->ar_spectator = 1;
144 break;
145 case Opt_ignore_local_fs:
146 args->ar_ignore_local_fs = 1;
147 break;
148 case Opt_localflocks:
149 args->ar_localflocks = 1;
150 break;
151 case Opt_localcaching:
152 args->ar_localcaching = 1;
153 break;
154 case Opt_debug:
155 args->ar_debug = 1;
156 break;
157 case Opt_nodebug:
158 args->ar_debug = 0;
159 break;
160 case Opt_upgrade:
161 args->ar_upgrade = 1;
162 break;
163 case Opt_acl:
164 args->ar_posix_acl = 1;
165 break;
166 case Opt_noacl:
167 args->ar_posix_acl = 0;
168 break;
169 case Opt_quota_off:
170 case Opt_noquota:
171 args->ar_quota = GFS2_QUOTA_OFF;
172 break;
173 case Opt_quota_account:
174 args->ar_quota = GFS2_QUOTA_ACCOUNT;
175 break;
176 case Opt_quota_on:
177 case Opt_quota:
178 args->ar_quota = GFS2_QUOTA_ON;
179 break;
180 case Opt_suiddir:
181 args->ar_suiddir = 1;
182 break;
183 case Opt_nosuiddir:
184 args->ar_suiddir = 0;
185 break;
186 case Opt_data_writeback:
187 args->ar_data = GFS2_DATA_WRITEBACK;
188 break;
189 case Opt_data_ordered:
190 args->ar_data = GFS2_DATA_ORDERED;
191 break;
192 case Opt_meta:
193 args->ar_meta = 1;
194 break;
195 case Opt_discard:
196 args->ar_discard = 1;
197 break;
198 case Opt_nodiscard:
199 args->ar_discard = 0;
200 break;
201 case Opt_commit:
202 rv = match_int(&tmp[0], &args->ar_commit);
203 if (rv || args->ar_commit <= 0) {
204 fs_info(sdp, "commit mount option requires a positive numeric argument\n");
205 return rv ? rv : -EINVAL;
206 }
207 break;
208 case Opt_error:
209 default:
210 fs_info(sdp, "invalid mount option: %s\n", o);
211 return -EINVAL;
212 }
213 }
214
215 return 0;
216}
34 217
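gfs2_mount_args() above is a textbook strsep() loop: split on commas, skip empty tokens, dispatch per option. A userspace model of the tokenising half; in the kernel, match_token() and match_strlcpy() from <linux/parser.h> replace the strcmp() calls:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char opts[] = "lockproto=lock_dlm,,quota=on,debug";
        char *options = opts, *o;

        while ((o = strsep(&options, ",")) != NULL) {
                if (*o == '\0')
                        continue;       /* tolerate ",," runs */
                if (strncmp(o, "lockproto=", 10) == 0)
                        printf("lock protocol: %s\n", o + 10);
                else if (strcmp(o, "quota=on") == 0)
                        printf("quota enabled\n");
                else
                        printf("other option: %s\n", o);
        }
        return 0;
}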
35/** 218/**
36 * gfs2_jindex_free - Clear all the journal index information 219 * gfs2_jindex_free - Clear all the journal index information
@@ -170,7 +353,7 @@ fail:
170 return error; 353 return error;
171} 354}
172 355
173static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) 356void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
174{ 357{
175 const struct gfs2_statfs_change *str = buf; 358 const struct gfs2_statfs_change *str = buf;
176 359
@@ -258,6 +441,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
258 brelse(l_bh); 441 brelse(l_bh);
259} 442}
260 443
444void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
445 struct buffer_head *l_bh)
446{
447 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
448 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
449 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
450 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
451
452 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
453
454 spin_lock(&sdp->sd_statfs_spin);
455 m_sc->sc_total += l_sc->sc_total;
456 m_sc->sc_free += l_sc->sc_free;
457 m_sc->sc_dinodes += l_sc->sc_dinodes;
458 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
459 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
460 0, sizeof(struct gfs2_statfs_change));
461 spin_unlock(&sdp->sd_statfs_spin);
462
463 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
464 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
465}
466
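update_statfs() above factors out the fold of each node's local statfs deltas into the master record, zeroing the local copy once consumed. The arithmetic on its own, with the glock, transaction and buffer bookkeeping elided:

#include <stdio.h>

struct sc { long total, free, dinodes; };

static void fold_local_into_master(struct sc *m, struct sc *l)
{
        m->total   += l->total;
        m->free    += l->free;
        m->dinodes += l->dinodes;
        *l = (struct sc){ 0, 0, 0 };    /* local deltas consumed */
}

int main(void)
{
        struct sc master = { 1000, 400, 50 };
        struct sc local  = { 0, -3, 1 };        /* 3 blocks used, 1 dinode made */

        fold_local_into_master(&master, &local);
        printf("free=%ld dinodes=%ld\n", master.free, master.dinodes);
        return 0;
}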
261int gfs2_statfs_sync(struct gfs2_sbd *sdp) 467int gfs2_statfs_sync(struct gfs2_sbd *sdp)
262{ 468{
263 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); 469 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -294,19 +500,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
294 if (error) 500 if (error)
295 goto out_bh2; 501 goto out_bh2;
296 502
297 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); 503 update_statfs(sdp, m_bh, l_bh);
298
299 spin_lock(&sdp->sd_statfs_spin);
300 m_sc->sc_total += l_sc->sc_total;
301 m_sc->sc_free += l_sc->sc_free;
302 m_sc->sc_dinodes += l_sc->sc_dinodes;
303 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
304 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
305 0, sizeof(struct gfs2_statfs_change));
306 spin_unlock(&sdp->sd_statfs_spin);
307
308 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
309 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
310 504
311 gfs2_trans_end(sdp); 505 gfs2_trans_end(sdp);
312 506
@@ -436,3 +630,707 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
436 mutex_unlock(&sdp->sd_freeze_lock); 630 mutex_unlock(&sdp->sd_freeze_lock);
437} 631}
438 632
633
634/**
635 * gfs2_write_inode - Make sure the inode is stable on the disk
636 * @inode: The inode
637 * @sync: synchronous write flag
638 *
639 * Returns: errno
640 */
641
642static int gfs2_write_inode(struct inode *inode, int sync)
643{
644 struct gfs2_inode *ip = GFS2_I(inode);
645 struct gfs2_sbd *sdp = GFS2_SB(inode);
646 struct gfs2_holder gh;
647 struct buffer_head *bh;
648 struct timespec atime;
649 struct gfs2_dinode *di;
650 int ret = 0;
651
652 /* Check this is a "normal" inode, etc */
653 if (!test_bit(GIF_USER, &ip->i_flags) ||
654 (current->flags & PF_MEMALLOC))
655 return 0;
656 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
657 if (ret)
658 goto do_flush;
659 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
660 if (ret)
661 goto do_unlock;
662 ret = gfs2_meta_inode_buffer(ip, &bh);
663 if (ret == 0) {
664 di = (struct gfs2_dinode *)bh->b_data;
665 atime.tv_sec = be64_to_cpu(di->di_atime);
666 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
667 if (timespec_compare(&inode->i_atime, &atime) > 0) {
668 gfs2_trans_add_bh(ip->i_gl, bh, 1);
669 gfs2_dinode_out(ip, bh->b_data);
670 }
671 brelse(bh);
672 }
673 gfs2_trans_end(sdp);
674do_unlock:
675 gfs2_glock_dq_uninit(&gh);
676do_flush:
677 if (sync != 0)
678 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
679 return ret;
680}
681
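gfs2_write_inode() above only rewrites the dinode when the in-core atime is strictly newer than the on-disk copy, which is what the timespec_compare() test decides. The comparison in isolation:

#include <stdio.h>
#include <time.h>

static int timespec_newer(const struct timespec *a, const struct timespec *b)
{
        if (a->tv_sec != b->tv_sec)
                return a->tv_sec > b->tv_sec;
        return a->tv_nsec > b->tv_nsec;
}

int main(void)
{
        struct timespec disk = { 100, 0 }, core = { 100, 500 };

        if (timespec_newer(&core, &disk))
                printf("dinode needs writing (atime advanced)\n");
        return 0;
}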
682/**
683 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
684 * @sdp: the filesystem
685 *
686 * Returns: errno
687 */
688
689static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
690{
691 struct gfs2_holder t_gh;
692 int error;
693
694 flush_workqueue(gfs2_delete_workqueue);
695 gfs2_quota_sync(sdp);
696 gfs2_statfs_sync(sdp);
697
698 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
699 &t_gh);
700 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
701 return error;
702
703 gfs2_meta_syncfs(sdp);
704 gfs2_log_shutdown(sdp);
705
706 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
707
708 if (t_gh.gh_gl)
709 gfs2_glock_dq_uninit(&t_gh);
710
711 gfs2_quota_cleanup(sdp);
712
713 return error;
714}
715
716static int gfs2_umount_recovery_wait(void *word)
717{
718 schedule();
719 return 0;
720}
721
722/**
723 * gfs2_put_super - Unmount the filesystem
724 * @sb: The VFS superblock
725 *
726 */
727
728static void gfs2_put_super(struct super_block *sb)
729{
730 struct gfs2_sbd *sdp = sb->s_fs_info;
731 int error;
732 struct gfs2_jdesc *jd;
733
734 /* Unfreeze the filesystem, if we need to */
735
736 mutex_lock(&sdp->sd_freeze_lock);
737 if (sdp->sd_freeze_count)
738 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
739 mutex_unlock(&sdp->sd_freeze_lock);
740
741 /* No more recovery requests */
742 set_bit(SDF_NORECOVERY, &sdp->sd_flags);
743 smp_mb();
744
745 /* Wait on outstanding recovery */
746restart:
747 spin_lock(&sdp->sd_jindex_spin);
748 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
749 if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
750 continue;
751 spin_unlock(&sdp->sd_jindex_spin);
752 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
753 gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
754 goto restart;
755 }
756 spin_unlock(&sdp->sd_jindex_spin);
757
758 kthread_stop(sdp->sd_quotad_process);
759 kthread_stop(sdp->sd_logd_process);
760
761 if (!(sb->s_flags & MS_RDONLY)) {
762 error = gfs2_make_fs_ro(sdp);
763 if (error)
764 gfs2_io_error(sdp);
765 }
766 /* At this point, we're through modifying the disk */
767
768 /* Release stuff */
769
770 iput(sdp->sd_jindex);
771 iput(sdp->sd_inum_inode);
772 iput(sdp->sd_statfs_inode);
773 iput(sdp->sd_rindex);
774 iput(sdp->sd_quota_inode);
775
776 gfs2_glock_put(sdp->sd_rename_gl);
777 gfs2_glock_put(sdp->sd_trans_gl);
778
779 if (!sdp->sd_args.ar_spectator) {
780 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
781 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
782 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
783 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
784 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
785 iput(sdp->sd_ir_inode);
786 iput(sdp->sd_sc_inode);
787 iput(sdp->sd_qc_inode);
788 }
789
790 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
791 gfs2_clear_rgrpd(sdp);
792 gfs2_jindex_free(sdp);
793 /* Take apart glock structures and buffer lists */
794 gfs2_gl_hash_clear(sdp);
795 /* Unmount the locking protocol */
796 gfs2_lm_unmount(sdp);
797
798 /* At this point, we're through participating in the lockspace */
799 gfs2_sys_fs_del(sdp);
800}
801
802/**
803 * gfs2_sync_fs - sync the filesystem
804 * @sb: the superblock
805 *
806 * Flushes the log to disk.
807 */
808
809static int gfs2_sync_fs(struct super_block *sb, int wait)
810{
811 if (wait && sb->s_fs_info)
812 gfs2_log_flush(sb->s_fs_info, NULL);
813 return 0;
814}
815
816/**
817 * gfs2_freeze - prevent further writes to the filesystem
818 * @sb: the VFS structure for the filesystem
819 *
820 */
821
822static int gfs2_freeze(struct super_block *sb)
823{
824 struct gfs2_sbd *sdp = sb->s_fs_info;
825 int error;
826
827 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
828 return -EINVAL;
829
830 for (;;) {
831 error = gfs2_freeze_fs(sdp);
832 if (!error)
833 break;
834
835 switch (error) {
836 case -EBUSY:
837 fs_err(sdp, "waiting for recovery before freeze\n");
838 break;
839
840 default:
841 fs_err(sdp, "error freezing FS: %d\n", error);
842 break;
843 }
844
845 fs_err(sdp, "retrying...\n");
846 msleep(1000);
847 }
848 return 0;
849}
850
851/**
852 * gfs2_unfreeze - reallow writes to the filesystem
853 * @sb: the VFS structure for the filesystem
854 *
855 */
856
857static int gfs2_unfreeze(struct super_block *sb)
858{
859 gfs2_unfreeze_fs(sb->s_fs_info);
860 return 0;
861}
862
863/**
864 * statfs_slow_fill - fill in the sc for a given RG
865 * @rgd: the RG
866 * @sc: the sc structure
867 *
868 * Returns: 0 on success, -ESTALE if the LVB is invalid
869 */
870
871static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
872 struct gfs2_statfs_change_host *sc)
873{
874 gfs2_rgrp_verify(rgd);
875 sc->sc_total += rgd->rd_data;
876 sc->sc_free += rgd->rd_free;
877 sc->sc_dinodes += rgd->rd_dinodes;
878 return 0;
879}
880
881/**
882 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
883 * @sdp: the filesystem
884 * @sc: the sc info that will be returned
885 *
886 * Any error (other than a signal) will cause this routine to fall back
887 * to the synchronous version.
888 *
889 * FIXME: This really shouldn't busy wait like this.
890 *
891 * Returns: errno
892 */
893
894static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
895{
896 struct gfs2_holder ri_gh;
897 struct gfs2_rgrpd *rgd_next;
898 struct gfs2_holder *gha, *gh;
899 unsigned int slots = 64;
900 unsigned int x;
901 int done;
902 int error = 0, err;
903
904 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
905 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
906 if (!gha)
907 return -ENOMEM;
908
909 error = gfs2_rindex_hold(sdp, &ri_gh);
910 if (error)
911 goto out;
912
913 rgd_next = gfs2_rgrpd_get_first(sdp);
914
915 for (;;) {
916 done = 1;
917
918 for (x = 0; x < slots; x++) {
919 gh = gha + x;
920
921 if (gh->gh_gl && gfs2_glock_poll(gh)) {
922 err = gfs2_glock_wait(gh);
923 if (err) {
924 gfs2_holder_uninit(gh);
925 error = err;
926 } else {
927 if (!error)
928 error = statfs_slow_fill(
929 gh->gh_gl->gl_object, sc);
930 gfs2_glock_dq_uninit(gh);
931 }
932 }
933
934 if (gh->gh_gl)
935 done = 0;
936 else if (rgd_next && !error) {
937 error = gfs2_glock_nq_init(rgd_next->rd_gl,
938 LM_ST_SHARED,
939 GL_ASYNC,
940 gh);
941 rgd_next = gfs2_rgrpd_get_next(rgd_next);
942 done = 0;
943 }
944
945 if (signal_pending(current))
946 error = -ERESTARTSYS;
947 }
948
949 if (done)
950 break;
951
952 yield();
953 }
954
955 gfs2_glock_dq_uninit(&ri_gh);
956
957out:
958 kfree(gha);
959 return error;
960}
961
962/**
963 * gfs2_statfs_i - Do a statfs
964 * @sdp: the filesystem
965 * @sc: the sc structure
966 *
967 * Returns: errno
968 */
969
970static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
971{
972 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
973 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
974
975 spin_lock(&sdp->sd_statfs_spin);
976
977 *sc = *m_sc;
978 sc->sc_total += l_sc->sc_total;
979 sc->sc_free += l_sc->sc_free;
980 sc->sc_dinodes += l_sc->sc_dinodes;
981
982 spin_unlock(&sdp->sd_statfs_spin);
983
984 if (sc->sc_free < 0)
985 sc->sc_free = 0;
986 if (sc->sc_free > sc->sc_total)
987 sc->sc_free = sc->sc_total;
988 if (sc->sc_dinodes < 0)
989 sc->sc_dinodes = 0;
990
991 return 0;
992}
993
994/**
995 * gfs2_statfs - Gather and return stats about the filesystem
996 * @dentry: The dentry whose filesystem is being queried
997 * @buf: The kstatfs buffer to fill
998 *
999 * Returns: 0 on success or error code
1000 */
1001
1002static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1003{
1004 struct super_block *sb = dentry->d_inode->i_sb;
1005 struct gfs2_sbd *sdp = sb->s_fs_info;
1006 struct gfs2_statfs_change_host sc;
1007 int error;
1008
1009 if (gfs2_tune_get(sdp, gt_statfs_slow))
1010 error = gfs2_statfs_slow(sdp, &sc);
1011 else
1012 error = gfs2_statfs_i(sdp, &sc);
1013
1014 if (error)
1015 return error;
1016
1017 buf->f_type = GFS2_MAGIC;
1018 buf->f_bsize = sdp->sd_sb.sb_bsize;
1019 buf->f_blocks = sc.sc_total;
1020 buf->f_bfree = sc.sc_free;
1021 buf->f_bavail = sc.sc_free;
1022 buf->f_files = sc.sc_dinodes + sc.sc_free;
1023 buf->f_ffree = sc.sc_free;
1024 buf->f_namelen = GFS2_FNAMESIZE;
1025
1026 return 0;
1027}
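
The values filled in above surface directly through statfs(2); since GFS2 has no
fixed inode table, f_files is reported as sc_dinodes + sc_free and f_ffree simply
mirrors the free block count. A minimal userspace check, with a hypothetical
mount point:

#include <stdio.h>
#include <sys/vfs.h>

int main(void)
{
	struct statfs st;

	if (statfs("/mnt/gfs2", &st) != 0) {	/* illustrative mount point */
		perror("statfs");
		return 1;
	}
	/* f_type should read back as GFS2_MAGIC (0x01161970) */
	printf("bsize %ld blocks %ld free %ld files %ld\n",
	       (long)st.f_bsize, (long)st.f_blocks,
	       (long)st.f_bfree, (long)st.f_files);
	return 0;
}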
1028
1029/**
1030 * gfs2_remount_fs - called when the FS is remounted
1031 * @sb: the filesystem
1032 * @flags: the remount flags
1033 * @data: extra data passed in (not used right now)
1034 *
1035 * Returns: errno
1036 */
1037
1038static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1039{
1040 struct gfs2_sbd *sdp = sb->s_fs_info;
1041 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
1042 struct gfs2_tune *gt = &sdp->sd_tune;
1043 int error;
1044
1045 spin_lock(&gt->gt_spin);
1046 args.ar_commit = gt->gt_log_flush_secs;
1047 spin_unlock(&gt->gt_spin);
1048 error = gfs2_mount_args(sdp, &args, data);
1049 if (error)
1050 return error;
1051
1052 /* Not allowed to change locking details */
1053 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
1054 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
1055 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
1056 return -EINVAL;
1057
1058 /* Some flags must not be changed */
1059 if (args_neq(&args, &sdp->sd_args, spectator) ||
1060 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1061 args_neq(&args, &sdp->sd_args, localflocks) ||
1062 args_neq(&args, &sdp->sd_args, localcaching) ||
1063 args_neq(&args, &sdp->sd_args, meta))
1064 return -EINVAL;
1065
1066 if (sdp->sd_args.ar_spectator)
1067 *flags |= MS_RDONLY;
1068
1069 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
1070 if (*flags & MS_RDONLY)
1071 error = gfs2_make_fs_ro(sdp);
1072 else
1073 error = gfs2_make_fs_rw(sdp);
1074 if (error)
1075 return error;
1076 }
1077
1078 sdp->sd_args = args;
1079 if (sdp->sd_args.ar_posix_acl)
1080 sb->s_flags |= MS_POSIXACL;
1081 else
1082 sb->s_flags &= ~MS_POSIXACL;
1083 spin_lock(&gt->gt_spin);
1084 gt->gt_log_flush_secs = args.ar_commit;
1085 spin_unlock(&gt->gt_spin);
1086
1087 return 0;
1088}
1089
1090/**
1091 * gfs2_drop_inode - Drop an inode (test for remote unlink)
1092 * @inode: The inode to drop
1093 *
1094 * If we've received a callback on an iopen lock then it's because a
1095 * remote node tried to deallocate the inode but failed due to this node
1096 * still having the inode open. Here we mark the link count zero
1097 * since we know that it must have reached zero if the GLF_DEMOTE flag
1098 * is set on the iopen glock. If we didn't do a disk read since the
1099 * remote node removed the final link then we might otherwise miss
1100 * this event. This check ensures that this node will deallocate the
1101 * inode's blocks, or alternatively pass the baton on to another
1102 * node for later deallocation.
1103 */
1104
1105static void gfs2_drop_inode(struct inode *inode)
1106{
1107 struct gfs2_inode *ip = GFS2_I(inode);
1108
1109 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
1110 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1111 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1112 clear_nlink(inode);
1113 }
1114 generic_drop_inode(inode);
1115}
1116
1117/**
1118 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
1119 * @inode: The VFS inode
1120 *
1121 */
1122
1123static void gfs2_clear_inode(struct inode *inode)
1124{
1125 struct gfs2_inode *ip = GFS2_I(inode);
1126
1127 /* This tells us it's a "real" inode and not one which only
1128 * serves to contain an address space (see rgrp.c, meta_io.c)
1129 * which therefore doesn't have its own glocks.
1130 */
1131 if (test_bit(GIF_USER, &ip->i_flags)) {
1132 ip->i_gl->gl_object = NULL;
1133 gfs2_glock_put(ip->i_gl);
1134 ip->i_gl = NULL;
1135 if (ip->i_iopen_gh.gh_gl) {
1136 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1137 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1138 }
1139 }
1140}
1141
1142static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
1143{
1144 do {
1145 if (d1 == d2)
1146 return 1;
1147 d1 = d1->d_parent;
1148 } while (!IS_ROOT(d1));
1149 return 0;
1150}
1151
1152/**
1153 * gfs2_show_options - Show mount options for /proc/mounts
1154 * @s: seq_file structure
1155 * @mnt: vfsmount
1156 *
1157 * Returns: 0 on success or error code
1158 */
1159
1160static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1161{
1162 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
1163 struct gfs2_args *args = &sdp->sd_args;
1164 int lfsecs;
1165
1166 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
1167 seq_printf(s, ",meta");
1168 if (args->ar_lockproto[0])
1169 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
1170 if (args->ar_locktable[0])
1171 seq_printf(s, ",locktable=%s", args->ar_locktable);
1172 if (args->ar_hostdata[0])
1173 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1174 if (args->ar_spectator)
1175 seq_printf(s, ",spectator");
1176 if (args->ar_ignore_local_fs)
1177 seq_printf(s, ",ignore_local_fs");
1178 if (args->ar_localflocks)
1179 seq_printf(s, ",localflocks");
1180 if (args->ar_localcaching)
1181 seq_printf(s, ",localcaching");
1182 if (args->ar_debug)
1183 seq_printf(s, ",debug");
1184 if (args->ar_upgrade)
1185 seq_printf(s, ",upgrade");
1186 if (args->ar_posix_acl)
1187 seq_printf(s, ",acl");
1188 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1189 char *state;
1190 switch (args->ar_quota) {
1191 case GFS2_QUOTA_OFF:
1192 state = "off";
1193 break;
1194 case GFS2_QUOTA_ACCOUNT:
1195 state = "account";
1196 break;
1197 case GFS2_QUOTA_ON:
1198 state = "on";
1199 break;
1200 default:
1201 state = "unknown";
1202 break;
1203 }
1204 seq_printf(s, ",quota=%s", state);
1205 }
1206 if (args->ar_suiddir)
1207 seq_printf(s, ",suiddir");
1208 if (args->ar_data != GFS2_DATA_DEFAULT) {
1209 char *state;
1210 switch (args->ar_data) {
1211 case GFS2_DATA_WRITEBACK:
1212 state = "writeback";
1213 break;
1214 case GFS2_DATA_ORDERED:
1215 state = "ordered";
1216 break;
1217 default:
1218 state = "unknown";
1219 break;
1220 }
1221 seq_printf(s, ",data=%s", state);
1222 }
1223 if (args->ar_discard)
1224 seq_printf(s, ",discard");
1225 lfsecs = sdp->sd_tune.gt_log_flush_secs;
1226 if (lfsecs != 60)
1227 seq_printf(s, ",commit=%d", lfsecs);
1228 return 0;
1229}
1230
1231/*
1232 * We have to (at the moment) hold the inode's main lock to cover
1233 * the gap between unlocking the shared lock on the iopen lock and
1234 * taking the exclusive lock. I'd rather do a shared -> exclusive
1235 * conversion on the iopen lock, but we can change that later. This
1236 * is safe, just less efficient.
1237 */
1238
1239static void gfs2_delete_inode(struct inode *inode)
1240{
1241 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
1242 struct gfs2_inode *ip = GFS2_I(inode);
1243 struct gfs2_holder gh;
1244 int error;
1245
1246 if (!test_bit(GIF_USER, &ip->i_flags))
1247 goto out;
1248
1249 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1250 if (unlikely(error)) {
1251 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1252 goto out;
1253 }
1254
1255 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1256 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1257 error = gfs2_glock_nq(&ip->i_iopen_gh);
1258 if (error)
1259 goto out_truncate;
1260
1261 if (S_ISDIR(inode->i_mode) &&
1262 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
1263 error = gfs2_dir_exhash_dealloc(ip);
1264 if (error)
1265 goto out_unlock;
1266 }
1267
1268 if (ip->i_eattr) {
1269 error = gfs2_ea_dealloc(ip);
1270 if (error)
1271 goto out_unlock;
1272 }
1273
1274 if (!gfs2_is_stuffed(ip)) {
1275 error = gfs2_file_dealloc(ip);
1276 if (error)
1277 goto out_unlock;
1278 }
1279
1280 error = gfs2_dinode_dealloc(ip);
1281 if (error)
1282 goto out_unlock;
1283
1284out_truncate:
1285 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1286 if (error)
1287 goto out_unlock;
1288 /* Needs to be done before glock release & also in a transaction */
1289 truncate_inode_pages(&inode->i_data, 0);
1290 gfs2_trans_end(sdp);
1291
1292out_unlock:
1293 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
1294 gfs2_glock_dq(&ip->i_iopen_gh);
1295 gfs2_holder_uninit(&ip->i_iopen_gh);
1296 gfs2_glock_dq_uninit(&gh);
1297 if (error && error != GLR_TRYFAILED && error != -EROFS)
1298 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
1299out:
1300 truncate_inode_pages(&inode->i_data, 0);
1301 clear_inode(inode);
1302}
1303
1304static struct inode *gfs2_alloc_inode(struct super_block *sb)
1305{
1306 struct gfs2_inode *ip;
1307
1308 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
1309 if (ip) {
1310 ip->i_flags = 0;
1311 ip->i_gl = NULL;
1312 }
1313 return &ip->i_inode;
1314}
1315
1316static void gfs2_destroy_inode(struct inode *inode)
1317{
1318 kmem_cache_free(gfs2_inode_cachep, inode);
1319}
1320
1321const struct super_operations gfs2_super_ops = {
1322 .alloc_inode = gfs2_alloc_inode,
1323 .destroy_inode = gfs2_destroy_inode,
1324 .write_inode = gfs2_write_inode,
1325 .delete_inode = gfs2_delete_inode,
1326 .put_super = gfs2_put_super,
1327 .sync_fs = gfs2_sync_fs,
1328 .freeze_fs = gfs2_freeze,
1329 .unfreeze_fs = gfs2_unfreeze,
1330 .statfs = gfs2_statfs,
1331 .remount_fs = gfs2_remount_fs,
1332 .clear_inode = gfs2_clear_inode,
1333 .drop_inode = gfs2_drop_inode,
1334 .show_options = gfs2_show_options,
1335};
1336
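For orientation (not part of this diff): a table like gfs2_super_ops takes effect
when the mount path stores it in the superblock, after which the VFS dispatches
through it. A minimal sketch with an illustrative fill_super; the real gfs2 mount
code does far more:

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_magic = GFS2_MAGIC;
	sb->s_op = &gfs2_super_ops;	/* VFS now routes put_super, statfs,
					   sync_fs, etc. through this table */
	return 0;
}
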
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40d..22e0417ed996 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -40,6 +40,10 @@ extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			       s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+				  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+			  struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
 
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7655f5025fec..a7cbfbd340c7 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,6 +26,36 @@
 #include "util.h"
 #include "glops.h"
 
+struct gfs2_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static struct sysfs_ops gfs2_attr_ops = {
+	.show = gfs2_attr_show,
+	.store = gfs2_attr_store,
+};
+
+
+static struct kset *gfs2_kset;
+
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	return len;
 }
 
-struct gfs2_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
 
 #define GFS2_ATTR(name, mode, show, store) \
 static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
@@ -246,58 +271,11 @@ static struct attribute *gfs2_attrs[] = {
 	NULL,
 };
 
-static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *buf)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->show ? a->show(sdp, buf) : 0;
-}
-
-static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->store ? a->store(sdp, buf, len) : len;
-}
-
-static struct sysfs_ops gfs2_attr_ops = {
-	.show = gfs2_attr_show,
-	.store = gfs2_attr_store,
-};
-
 static struct kobj_type gfs2_ktype = {
 	.default_attrs = gfs2_attrs,
 	.sysfs_ops = &gfs2_attr_ops,
 };
 
-static struct kset *gfs2_kset;
-
-/*
- * display struct lm_lockstruct fields
- */
-
-struct lockstruct_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define LOCKSTRUCT_ATTR(name, fmt) \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
-{ \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
-} \
-static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
-
-LOCKSTRUCT_ATTR(jid, "%u\n");
-LOCKSTRUCT_ATTR(first, "%u\n");
-
-static struct attribute *lockstruct_attrs[] = {
-	&lockstruct_attr_jid.attr,
-	&lockstruct_attr_first.attr,
-	NULL,
-};
-
 /*
  * lock_module. Originally from lock_dlm
@@ -359,34 +337,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first_done);
 }
 
-static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%d\n", ls->ls_recover_jid);
-}
-
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
+	unsigned jid;
 	struct gfs2_jdesc *jd;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
 
+	rv = -ESHUTDOWN;
 	spin_lock(&sdp->sd_jindex_spin);
+	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+		goto out;
+	rv = -EBUSY;
+	if (sdp->sd_jdesc->jd_jid == jid)
+		goto out;
+	rv = -ENOENT;
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		jd->jd_dirty = 1;
+		rv = slow_work_enqueue(&jd->jd_work);
 		break;
 	}
+out:
 	spin_unlock(&sdp->sd_jindex_spin);
-}
-
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
-	gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
-	if (sdp->sd_recoverd_process)
-		wake_up_process(sdp->sd_recoverd_process);
-	return len;
+	return rv ? rv : len;
 }
 
 static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
@@ -401,31 +378,31 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
 
-struct gdlm_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *sdp, char *);
-	ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
-};
+static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+}
 
 #define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
 GDLM_ATTR(block, 0644, block_show, block_store);
 GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
 GDLM_ATTR(id, 0444, lkid_show, NULL);
-GDLM_ATTR(first, 0444, lkfirst_show, NULL);
-GDLM_ATTR(first_done, 0444, first_done_show, NULL);
-GDLM_ATTR(recover, 0644, recover_show, recover_store);
-GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(jid, 0444, jid_show, NULL);
+GDLM_ATTR(first, 0444, lkfirst_show, NULL);
+GDLM_ATTR(first_done, 0444, first_done_show, NULL);
+GDLM_ATTR(recover, 0600, NULL, recover_store);
+GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
 
 static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
 	&gdlm_attr_id.attr,
-	&lockstruct_attr_jid.attr,
+	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
 	&gdlm_attr_recover.attr,
@@ -435,53 +412,6 @@ static struct attribute *lock_module_attrs[] = {
 };
 
 /*
- * display struct gfs2_args fields
- */
-
-struct args_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define ARGS_ATTR(name, fmt) \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
-{ \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
-} \
-static struct args_attr args_attr_##name = __ATTR_RO(name)
-
-ARGS_ATTR(lockproto, "%s\n");
-ARGS_ATTR(locktable, "%s\n");
-ARGS_ATTR(hostdata, "%s\n");
-ARGS_ATTR(spectator, "%d\n");
-ARGS_ATTR(ignore_local_fs, "%d\n");
-ARGS_ATTR(localcaching, "%d\n");
-ARGS_ATTR(localflocks, "%d\n");
-ARGS_ATTR(debug, "%d\n");
-ARGS_ATTR(upgrade, "%d\n");
-ARGS_ATTR(posix_acl, "%d\n");
-ARGS_ATTR(quota, "%u\n");
-ARGS_ATTR(suiddir, "%d\n");
-ARGS_ATTR(data, "%d\n");
-
-static struct attribute *args_attrs[] = {
-	&args_attr_lockproto.attr,
-	&args_attr_locktable.attr,
-	&args_attr_hostdata.attr,
-	&args_attr_spectator.attr,
-	&args_attr_ignore_local_fs.attr,
-	&args_attr_localcaching.attr,
-	&args_attr_localflocks.attr,
-	&args_attr_debug.attr,
-	&args_attr_upgrade.attr,
-	&args_attr_posix_acl.attr,
-	&args_attr_quota.attr,
-	&args_attr_suiddir.attr,
-	&args_attr_data.attr,
-	NULL,
-};
-
-/*
  * get and set struct gfs2_tune fields
  */
 
@@ -531,14 +461,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
 	return len;
 }
 
-struct tune_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
-
 #define TUNE_ATTR_3(name, show, store) \
-static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
 
 #define TUNE_ATTR_2(name, store) \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
@@ -554,15 +478,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 } \
 TUNE_ATTR_2(name, name##_store)
 
-#define TUNE_ATTR_DAEMON(name, process) \
-static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
-{ \
-	ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
-	wake_up_process(sdp->sd_##process); \
-	return r; \
-} \
-TUNE_ATTR_2(name, name##_store)
-
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -574,8 +489,6 @@ TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
-TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
-TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
@@ -589,23 +502,11 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
-	&tune_attr_recoverd_secs.attr,
-	&tune_attr_logd_secs.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
 	NULL,
 };
 
-static struct attribute_group lockstruct_group = {
-	.name = "lockstruct",
-	.attrs = lockstruct_attrs,
-};
-
-static struct attribute_group args_group = {
-	.name = "args",
-	.attrs = args_attrs,
-};
-
 static struct attribute_group tune_group = {
 	.name = "tune",
 	.attrs = tune_attrs,
@@ -626,17 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
-	if (error)
-		goto fail_reg;
-
-	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
-	if (error)
-		goto fail_lockstruct;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
-		goto fail_args;
+		goto fail_reg;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
 	if (error)
@@ -647,10 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-fail_args:
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_lockstruct:
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
fail_reg:
 	kobject_put(&sdp->sd_kobj);
 fail:
@@ -661,8 +550,6 @@ fail:
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
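
With recover now a write-only (0600) attribute backed by recover_store(),
requesting recovery of a journal becomes a plain sysfs write. A hypothetical
userspace trigger; the lockspace name in the path is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* recover_store() parses a jid with sscanf() and enqueues that
	   journal's slow-work item; writing our own jid yields -EBUSY. */
	int fd = open("/sys/fs/gfs2/mycluster:gfs0/lock_module/recover",
		      O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)	/* ask for recovery of jid 1 */
		perror("write");
	close(fd);
	return 0;
}
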
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
new file mode 100644
index 000000000000..148d55c14171
--- /dev/null
+++ b/fs/gfs2/trace_gfs2.h
@@ -0,0 +1,407 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM gfs2
3
4#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_GFS2_H
6
7#include <linux/tracepoint.h>
8
9#include <linux/fs.h>
10#include <linux/buffer_head.h>
11#include <linux/dlmconstants.h>
12#include <linux/gfs2_ondisk.h>
13#include "incore.h"
14#include "glock.h"
15
16#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
17#define glock_trace_name(x) __print_symbolic(x, \
18 dlm_state_name(IV), \
19 dlm_state_name(NL), \
20 dlm_state_name(CR), \
21 dlm_state_name(CW), \
22 dlm_state_name(PR), \
23 dlm_state_name(PW), \
24 dlm_state_name(EX))
25
26#define block_state_name(x) __print_symbolic(x, \
27 { GFS2_BLKST_FREE, "free" }, \
28 { GFS2_BLKST_USED, "used" }, \
29 { GFS2_BLKST_DINODE, "dinode" }, \
30 { GFS2_BLKST_UNLINKED, "unlinked" })
31
32#define show_glock_flags(flags) __print_flags(flags, "", \
33 {(1UL << GLF_LOCK), "l" }, \
34 {(1UL << GLF_DEMOTE), "D" }, \
35 {(1UL << GLF_PENDING_DEMOTE), "d" }, \
36 {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \
37 {(1UL << GLF_DIRTY), "y" }, \
38 {(1UL << GLF_LFLUSH), "f" }, \
39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
40 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" })
43
44#ifndef NUMPTY
45#define NUMPTY
46static inline u8 glock_trace_state(unsigned int state)
47{
48 switch(state) {
49 case LM_ST_SHARED:
50 return DLM_LOCK_PR;
51 case LM_ST_DEFERRED:
52 return DLM_LOCK_CW;
53 case LM_ST_EXCLUSIVE:
54 return DLM_LOCK_EX;
55 }
56 return DLM_LOCK_NL;
57}
58#endif
59
60/* Section 1 - Locking
61 *
62 * Objectives:
63 * Latency: Remote demote request to state change
64 * Latency: Local lock request to state change
65 * Latency: State change to lock grant
66 * Correctness: Ordering of local lock state vs. I/O requests
67 * Correctness: Responses to remote demote requests
68 */
69
70/* General glock state change (DLM lock request completes) */
71TRACE_EVENT(gfs2_glock_state_change,
72
73 TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state),
74
75 TP_ARGS(gl, new_state),
76
77 TP_STRUCT__entry(
78 __field( dev_t, dev )
79 __field( u64, glnum )
80 __field( u32, gltype )
81 __field( u8, cur_state )
82 __field( u8, new_state )
83 __field( u8, dmt_state )
84 __field( u8, tgt_state )
85 __field( unsigned long, flags )
86 ),
87
88 TP_fast_assign(
89 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
90 __entry->glnum = gl->gl_name.ln_number;
91 __entry->gltype = gl->gl_name.ln_type;
92 __entry->cur_state = glock_trace_state(gl->gl_state);
93 __entry->new_state = glock_trace_state(new_state);
94 __entry->tgt_state = glock_trace_state(gl->gl_target);
95 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
96 __entry->flags = gl->gl_flags;
97 ),
98
99 TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
100 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
101 (unsigned long long)__entry->glnum,
102 glock_trace_name(__entry->cur_state),
103 glock_trace_name(__entry->new_state),
104 glock_trace_name(__entry->tgt_state),
105 glock_trace_name(__entry->dmt_state),
106 show_glock_flags(__entry->flags))
107);
108
109/* State change -> unlocked, glock is being deallocated */
110TRACE_EVENT(gfs2_glock_put,
111
112 TP_PROTO(const struct gfs2_glock *gl),
113
114 TP_ARGS(gl),
115
116 TP_STRUCT__entry(
117 __field( dev_t, dev )
118 __field( u64, glnum )
119 __field( u32, gltype )
120 __field( u8, cur_state )
121 __field( unsigned long, flags )
122 ),
123
124 TP_fast_assign(
125 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
126 __entry->gltype = gl->gl_name.ln_type;
127 __entry->glnum = gl->gl_name.ln_number;
128 __entry->cur_state = glock_trace_state(gl->gl_state);
129 __entry->flags = gl->gl_flags;
130 ),
131
132 TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
133 MAJOR(__entry->dev), MINOR(__entry->dev),
134 __entry->gltype, (unsigned long long)__entry->glnum,
135 glock_trace_name(__entry->cur_state),
136 glock_trace_name(DLM_LOCK_IV),
137 show_glock_flags(__entry->flags))
138
139);
140
141/* Callback (local or remote) requesting lock demotion */
142TRACE_EVENT(gfs2_demote_rq,
143
144 TP_PROTO(const struct gfs2_glock *gl),
145
146 TP_ARGS(gl),
147
148 TP_STRUCT__entry(
149 __field( dev_t, dev )
150 __field( u64, glnum )
151 __field( u32, gltype )
152 __field( u8, cur_state )
153 __field( u8, dmt_state )
154 __field( unsigned long, flags )
155 ),
156
157 TP_fast_assign(
158 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
159 __entry->gltype = gl->gl_name.ln_type;
160 __entry->glnum = gl->gl_name.ln_number;
161 __entry->cur_state = glock_trace_state(gl->gl_state);
162 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
163 __entry->flags = gl->gl_flags;
164 ),
165
166 TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
167 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
168 (unsigned long long)__entry->glnum,
169 glock_trace_name(__entry->cur_state),
170 glock_trace_name(__entry->dmt_state),
171 show_glock_flags(__entry->flags))
172
173);
174
175/* Promotion/grant of a glock */
176TRACE_EVENT(gfs2_promote,
177
178 TP_PROTO(const struct gfs2_holder *gh, int first),
179
180 TP_ARGS(gh, first),
181
182 TP_STRUCT__entry(
183 __field( dev_t, dev )
184 __field( u64, glnum )
185 __field( u32, gltype )
186 __field( int, first )
187 __field( u8, state )
188 ),
189
190 TP_fast_assign(
191 __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
192 __entry->glnum = gh->gh_gl->gl_name.ln_number;
193 __entry->gltype = gh->gh_gl->gl_name.ln_type;
194 __entry->first = first;
195 __entry->state = glock_trace_state(gh->gh_state);
196 ),
197
198 TP_printk("%u,%u glock %u:%llu promote %s %s",
199 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
200 (unsigned long long)__entry->glnum,
201 __entry->first ? "first": "other",
202 glock_trace_name(__entry->state))
203);
204
205/* Queue/dequeue a lock request */
206TRACE_EVENT(gfs2_glock_queue,
207
208 TP_PROTO(const struct gfs2_holder *gh, int queue),
209
210 TP_ARGS(gh, queue),
211
212 TP_STRUCT__entry(
213 __field( dev_t, dev )
214 __field( u64, glnum )
215 __field( u32, gltype )
216 __field( int, queue )
217 __field( u8, state )
218 ),
219
220 TP_fast_assign(
221 __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
222 __entry->glnum = gh->gh_gl->gl_name.ln_number;
223 __entry->gltype = gh->gh_gl->gl_name.ln_type;
224 __entry->queue = queue;
225 __entry->state = glock_trace_state(gh->gh_state);
226 ),
227
228 TP_printk("%u,%u glock %u:%llu %squeue %s",
229 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
230 (unsigned long long)__entry->glnum,
231 __entry->queue ? "" : "de",
232 glock_trace_name(__entry->state))
233);
234
235/* Section 2 - Log/journal
236 *
237 * Objectives:
238 * Latency: Log flush time
239 * Correctness: pin/unpin vs. disk I/O ordering
240 * Performance: Log usage stats
241 */
242
243/* Pin/unpin a block in the log */
244TRACE_EVENT(gfs2_pin,
245
246 TP_PROTO(const struct gfs2_bufdata *bd, int pin),
247
248 TP_ARGS(bd, pin),
249
250 TP_STRUCT__entry(
251 __field( dev_t, dev )
252 __field( int, pin )
253 __field( u32, len )
254 __field( sector_t, block )
255 __field( u64, ino )
256 ),
257
258 TP_fast_assign(
259 __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
260 __entry->pin = pin;
261 __entry->len = bd->bd_bh->b_size;
262 __entry->block = bd->bd_bh->b_blocknr;
263 __entry->ino = bd->bd_gl->gl_name.ln_number;
264 ),
265
266 TP_printk("%u,%u log %s %llu/%lu inode %llu",
267 MAJOR(__entry->dev), MINOR(__entry->dev),
268 __entry->pin ? "pin" : "unpin",
269 (unsigned long long)__entry->block,
270 (unsigned long)__entry->len,
271 (unsigned long long)__entry->ino)
272);
273
274/* Flushing the log */
275TRACE_EVENT(gfs2_log_flush,
276
277 TP_PROTO(const struct gfs2_sbd *sdp, int start),
278
279 TP_ARGS(sdp, start),
280
281 TP_STRUCT__entry(
282 __field( dev_t, dev )
283 __field( int, start )
284 __field( u64, log_seq )
285 ),
286
287 TP_fast_assign(
288 __entry->dev = sdp->sd_vfs->s_dev;
289 __entry->start = start;
290 __entry->log_seq = sdp->sd_log_sequence;
291 ),
292
293 TP_printk("%u,%u log flush %s %llu",
294 MAJOR(__entry->dev), MINOR(__entry->dev),
295 __entry->start ? "start" : "end",
296 (unsigned long long)__entry->log_seq)
297);
298
299/* Reserving/releasing blocks in the log */
300TRACE_EVENT(gfs2_log_blocks,
301
302 TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
303
304 TP_ARGS(sdp, blocks),
305
306 TP_STRUCT__entry(
307 __field( dev_t, dev )
308 __field( int, blocks )
309 ),
310
311 TP_fast_assign(
312 __entry->dev = sdp->sd_vfs->s_dev;
313 __entry->blocks = blocks;
314 ),
315
316 TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
317 MINOR(__entry->dev), __entry->blocks)
318);
319
320/* Section 3 - bmap
321 *
322 * Objectives:
323 * Latency: Bmap request time
324 * Performance: Block allocator tracing
325 * Correctness: Test of discard generation vs. blocks allocated
326 */
327
328/* Map an extent of blocks, possibly a new allocation */
329TRACE_EVENT(gfs2_bmap,
330
331 TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh,
332 sector_t lblock, int create, int errno),
333
334 TP_ARGS(ip, bh, lblock, create, errno),
335
336 TP_STRUCT__entry(
337 __field( dev_t, dev )
338 __field( sector_t, lblock )
339 __field( sector_t, pblock )
340 __field( u64, inum )
341 __field( unsigned long, state )
342 __field( u32, len )
343 __field( int, create )
344 __field( int, errno )
345 ),
346
347 TP_fast_assign(
348 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
349 __entry->lblock = lblock;
350 __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0;
351 __entry->inum = ip->i_no_addr;
352 __entry->state = bh->b_state;
353 __entry->len = bh->b_size;
354 __entry->create = create;
355 __entry->errno = errno;
356 ),
357
358 TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d",
359 MAJOR(__entry->dev), MINOR(__entry->dev),
360 (unsigned long long)__entry->inum,
361 (unsigned long long)__entry->lblock,
362 (unsigned long)__entry->len,
363 (unsigned long long)__entry->pblock,
364 __entry->state, __entry->create ? "create " : "nocreate",
365 __entry->errno)
366);
367
368/* Keep track of blocks as they are allocated/freed */
369TRACE_EVENT(gfs2_block_alloc,
370
371 TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len,
372 u8 block_state),
373
374 TP_ARGS(ip, block, len, block_state),
375
376 TP_STRUCT__entry(
377 __field( dev_t, dev )
378 __field( u64, start )
379 __field( u64, inum )
380 __field( u32, len )
381 __field( u8, block_state )
382 ),
383
384 TP_fast_assign(
385 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
386 __entry->start = block;
387 __entry->inum = ip->i_no_addr;
388 __entry->len = len;
389 __entry->block_state = block_state;
390 ),
391
392 TP_printk("%u,%u bmap %llu alloc %llu/%lu %s",
393 MAJOR(__entry->dev), MINOR(__entry->dev),
394 (unsigned long long)__entry->inum,
395 (unsigned long long)__entry->start,
396 (unsigned long)__entry->len,
397 block_state_name(__entry->block_state))
398);
399
400#endif /* _TRACE_GFS2_H */
401
402/* This part must be outside protection */
403#undef TRACE_INCLUDE_PATH
404#define TRACE_INCLUDE_PATH .
405#define TRACE_INCLUDE_FILE trace_gfs2
406#include <trace/define_trace.h>
407
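Once this header is wired into the build, the events appear under the standard
ftrace tree. A sketch of enabling the whole gfs2 event group from userspace,
assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	/* per-event enables also exist, e.g.
	   .../events/gfs2/gfs2_glock_state_change/enable */
	FILE *f = fopen("/sys/kernel/debug/tracing/events/gfs2/enable", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("1\n", f);	/* write "0" to disable again */
	fclose(f);
	return 0;
}
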
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 053752d4b27f..4ef0e9fa3549 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	BUG_ON(current->journal_info);
 	BUG_ON(blocks == 0 && revokes == 0);
 
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+		return -EROFS;
+
 	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
 	if (!tr)
 		return -ENOMEM;
@@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	if (error)
 		goto fail_holder_uninit;
 
-	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		tr->tr_t_gh.gh_flags |= GL_NOCACHE;
-		error = -EROFS;
-		goto fail_gunlock;
-	}
-
 	error = gfs2_log_reserve(sdp, tr->tr_reserved);
 	if (error)
 		goto fail_gunlock;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index a36bb749926d..f7fcbe49da72 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -49,11 +50,23 @@ MODULE_LICENSE("GPL");
  */
 static void hfs_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		return;
+
 	/* sync everything to the buffers */
+	if (!(sb->s_flags & MS_RDONLY))
+		hfs_mdb_commit(sb);
+	unlock_super(sb);
+}
+
+static int hfs_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
 	hfs_mdb_commit(sb);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+
+	return 0;
 }
 
 /*
@@ -65,9 +78,15 @@ static void hfs_write_super(struct super_block *sb)
  */
 static void hfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
+	if (sb->s_dirt)
+		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
+
+	unlock_kernel();
 }
 
 /*
@@ -164,6 +183,7 @@ static const struct super_operations hfs_super_operations = {
 	.clear_inode = hfs_clear_inode,
 	.put_super = hfs_put_super,
 	.write_super = hfs_write_super,
+	.sync_fs = hfs_sync_fs,
 	.statfs = hfs_statfs,
 	.remount_fs = hfs_remount,
 	.show_options = hfs_show_options,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f2a64020f42e..c0759fe0855b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/nls.h>
 
@@ -152,15 +153,14 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static void hfsplus_write_super(struct super_block *sb)
+static int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
+
+	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		/* warn? */
-		return;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -192,6 +192,16 @@ static void hfsplus_write_super(struct super_block *sb)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
+	unlock_super(sb);
+	return 0;
+}
+
+static void hfsplus_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		hfsplus_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static void hfsplus_put_super(struct super_block *sb)
@@ -199,6 +209,11 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
+
+	lock_kernel();
+
+	if (sb->s_dirt)
+		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
@@ -218,6 +233,8 @@ static void hfsplus_put_super(struct super_block *sb)
 	unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -279,6 +296,7 @@ static const struct super_operations hfsplus_sops = {
 	.clear_inode = hfsplus_clear_inode,
 	.put_super = hfsplus_put_super,
 	.write_super = hfsplus_write_super,
+	.sync_fs = hfsplus_sync_fs,
 	.statfs = hfsplus_statfs,
 	.remount_fs = hfsplus_remount,
 	.show_options = hfsplus_show_options,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe02ad4740e7..032604e5ef2c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -972,6 +972,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = HOSTFS_SUPER_MAGIC;
 	sb->s_op = &hostfs_sbops;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	/* NULL is printed as <NULL> by sprintf: avoid that. */
 	if (req_root == NULL)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 6916c41d7017..8865c94f55f6 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,6 +6,7 @@
  * directory VFS functions
  */
 
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 64ab52259204..3efabff00367 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,6 +6,7 @@
  * file VFS functions
  */
 
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 #define BLOCKS(size) (((size) + 511) >> 9)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index c2ea31bae313..701ca54c0867 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -13,7 +13,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 
 #include "hpfs.h"
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 39a1bfbea312..fe703ae46bc7 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,6 +6,7 @@
  * inode VFS functions
  */
 
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index b649232dde97..82b9c4ba9ed0 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,6 +6,7 @@
  * adding & removing files & directories
  */
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fc77965be841..f2feaa06bf26 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -99,11 +100,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+
+	lock_kernel();
+
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -393,6 +399,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	lock_kernel();
+	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
 	lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -425,9 +433,13 @@
 
 	replace_mount_options(s, new_opts);
 
+	unlock_super(s);
+	unlock_kernel();
 	return 0;
 
 out_err:
+	unlock_super(s);
+	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c1462d43e721..cb88dac8ccaa 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,6 +30,7 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -934,26 +935,28 @@ static int can_do_hugetlb_shm(void)
 	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+				struct user_struct **user)
 {
 	int error = -ENOMEM;
-	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
 	struct qstr quick_string;
-	struct user_struct *user = current_user();
 
+	*user = NULL;
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
 	if (!can_do_hugetlb_shm()) {
-		if (user_shm_lock(size, user)) {
-			unlock_shm = 1;
+		*user = current_user();
+		if (user_shm_lock(size, *user)) {
 			WARN_ONCE(1,
				"Using mlock ulimits for SHM_HUGETLB deprecated\n");
-		} else
+		} else {
+			*user = NULL;
 			return ERR_PTR(-EPERM);
+		}
 	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
@@ -986,6 +989,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 			&hugetlbfs_file_operations);
 	if (!file)
 		goto out_dentry; /* inode is already attached */
+	ima_counts_get(file);
 
 	return file;
 
@@ -994,8 +998,10 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	if (unlock_shm)
-		user_shm_unlock(size, user);
+	if (*user) {
+		user_shm_unlock(size, *user);
+		*user = NULL;
+	}
 	return ERR_PTR(error);
 }
 
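The reworked hugetlb_file_setup() hands the accounted user_struct back to the
caller, which now owns the user_shm_unlock() on teardown (on failure the
function has already dropped the accounting and reset *user). A hypothetical
caller sketch of that contract:

static int example_attach(size_t size)
{
	struct user_struct *user = NULL;
	struct file *file = hugetlb_file_setup("example", size, 0, &user);

	if (IS_ERR(file))
		return PTR_ERR(file);	/* accounting already unwound */

	/* ... use the file; the matching teardown is fput() followed by
	   user_shm_unlock() when user is non-NULL. */
	fput(file);
	if (user)
		user_shm_unlock(size, user);
	return 0;
}
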
diff --git a/fs/inode.c b/fs/inode.c
index bca0c618fdb3..ae7b67e48661 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,8 +22,10 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
+#include <linux/posix_acl.h>
 
 /*
  * This is needed for the following functions:
@@ -118,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
  * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
  */
-struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode)
 {
 	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
-
 	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
@@ -150,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->dirtied_when = 0;
 
 	if (security_inode_alloc(inode))
-		goto out_free_inode;
+		goto out;
 
 	/* allocate and initialize an i_integrity */
 	if (ima_inode_alloc(inode))
@@ -188,17 +189,20 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	}
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
+#ifdef CONFIG_FS_POSIX_ACL
+	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+#endif
 
-	return inode;
+#ifdef CONFIG_FSNOTIFY
+	inode->i_fsnotify_mask = 0;
+#endif
+
+	return 0;
 
 out_free_security:
 	security_inode_free(inode);
-out_free_inode:
-	if (inode->i_sb->s_op->destroy_inode)
-		inode->i_sb->s_op->destroy_inode(inode);
-	else
-		kmem_cache_free(inode_cachep, (inode));
-	return NULL;
+out:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(inode_init_always);
 
@@ -211,23 +215,43 @@ static struct inode *alloc_inode(struct super_block *sb)
 	else
 		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
-	if (inode)
-		return inode_init_always(sb, inode);
-	return NULL;
+	if (!inode)
+		return NULL;
+
+	if (unlikely(inode_init_always(sb, inode))) {
+		if (inode->i_sb->s_op->destroy_inode)
+			inode->i_sb->s_op->destroy_inode(inode);
+		else
+			kmem_cache_free(inode_cachep, inode);
+		return NULL;
+	}
+
+	return inode;
 }
 
-void destroy_inode(struct inode *inode)
+void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
 	security_inode_free(inode);
+	fsnotify_inode_delete(inode);
+#ifdef CONFIG_FS_POSIX_ACL
+	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_acl);
+	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_default_acl);
+#endif
+}
+EXPORT_SYMBOL(__destroy_inode);
+
+void destroy_inode(struct inode *inode)
+{
+	__destroy_inode(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
 		kmem_cache_free(inode_cachep, (inode));
 }
-EXPORT_SYMBOL(destroy_inode);
-
 
 /*
  * These are initializations that only need to be done
@@ -252,6 +276,9 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+#ifdef CONFIG_FSNOTIFY
+	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
@@ -398,6 +425,7 @@ int invalidate_inodes(struct super_block *sb)
 	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
+	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
403 431
@@ -655,12 +683,17 @@ void unlock_new_inode(struct inode *inode)
655 if (inode->i_mode & S_IFDIR) { 683 if (inode->i_mode & S_IFDIR) {
656 struct file_system_type *type = inode->i_sb->s_type; 684 struct file_system_type *type = inode->i_sb->s_type;
657 685
658 /* 686 /* Set new key only if filesystem hasn't already changed it */
659 * ensure nobody is actually holding i_mutex 687 if (!lockdep_match_class(&inode->i_mutex,
660 */ 688 &type->i_mutex_key)) {
661 mutex_destroy(&inode->i_mutex); 689 /*
662 mutex_init(&inode->i_mutex); 690 * ensure nobody is actually holding i_mutex
663 lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); 691 */
692 mutex_destroy(&inode->i_mutex);
693 mutex_init(&inode->i_mutex);
694 lockdep_set_class(&inode->i_mutex,
695 &type->i_mutex_dir_key);
696 }
664 } 697 }
665#endif 698#endif
666 /* 699 /*
@@ -1398,7 +1431,7 @@ EXPORT_SYMBOL(touch_atime);
1398 * for writeback. Note that this function is meant exclusively for 1431 * for writeback. Note that this function is meant exclusively for
1399 * usage in the file write path of filesystems, and filesystems may 1432 * usage in the file write path of filesystems, and filesystems may
1400 * choose to explicitly ignore update via this function with the 1433 * choose to explicitly ignore update via this function with the
1401 * S_NOCTIME inode flag, e.g. for network filesystem where these 1434 * S_NOCMTIME inode flag, e.g. for network filesystem where these
1402 * timestamps are handled by the server. 1435 * timestamps are handled by the server.
1403 */ 1436 */
1404 1437
@@ -1412,7 +1445,7 @@ void file_update_time(struct file *file)
1412 if (IS_NOCMTIME(inode)) 1445 if (IS_NOCMTIME(inode))
1413 return; 1446 return;
1414 1447
1415 err = mnt_want_write(file->f_path.mnt); 1448 err = mnt_want_write_file(file);
1416 if (err) 1449 if (err)
1417 return; 1450 return;
1418 1451
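[Editor's note] The fs/inode.c changes above split responsibilities: inode_init_always() now only initializes and returns an int, alloc_inode() owns cleanup when that fails, and destroy_inode() is separated into an exported __destroy_inode() (state teardown) plus the allocator-specific free. A compact userspace sketch of that init/teardown layering, with hypothetical names (obj, alloc_obj, __destroy_obj):

    #include <stdlib.h>

    struct obj { void *sec; };

    /* init-only: returns 0 or an error, never frees the object itself */
    static int obj_init_always(struct obj *o)
    {
        o->sec = malloc(16);
        return o->sec ? 0 : -1;
    }

    static struct obj *alloc_obj(void)
    {
        struct obj *o = malloc(sizeof(*o));
        if (!o)
            return NULL;
        if (obj_init_always(o)) {   /* the allocator undoes its own work */
            free(o);
            return NULL;
        }
        return o;
    }

    /* teardown common to every allocator... */
    static void __destroy_obj(struct obj *o) { free(o->sec); }

    /* ...then the default free; callers with their own allocator can call
     * __destroy_obj() and release the memory themselves */
    static void destroy_obj(struct obj *o)
    {
        __destroy_obj(o);
        free(o);
    }

    int main(void)
    {
        struct obj *o = alloc_obj();
        if (o)
            destroy_obj(o);
        return 0;
    }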
diff --git a/fs/internal.h b/fs/internal.h
index b4dac4fb6b61..d55ef562f0bb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
25 return sb == blockdev_superblock; 25 return sb == blockdev_superblock;
26} 26}
27 27
28extern int __sync_blockdev(struct block_device *bdev, int wait);
29
28#else 30#else
29static inline void bdev_cache_init(void) 31static inline void bdev_cache_init(void)
30{ 32{
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
34{ 36{
35 return 0; 37 return 0;
36} 38}
39
40static inline int __sync_blockdev(struct block_device *bdev, int wait)
41{
42 return 0;
43}
37#endif 44#endif
38 45
39/* 46/*
@@ -66,3 +73,13 @@ extern void __init mnt_init(void);
66 * fs_struct.c 73 * fs_struct.c
67 */ 74 */
68extern void chroot_fs_refs(struct path *, struct path *); 75extern void chroot_fs_refs(struct path *, struct path *);
76
77/*
78 * file_table.c
79 */
80extern void mark_files_ro(struct super_block *);
81
82/*
83 * super.c
84 */
85extern int do_remount_sb(struct super_block *, int, void *, int);
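[Editor's note] The fs/internal.h hunk uses the usual kernel header idiom: a real extern declaration under CONFIG_BLOCK and a static inline no-op stub otherwise, so callers need no #ifdefs of their own. A trivial standalone illustration of the idiom; the names are demo stand-ins, not the kernel symbols:

    #include <stdio.h>

    #define HAVE_BLOCK 1                /* flip to 0 to exercise the stub */

    #if HAVE_BLOCK
    int sync_blockdev_demo(int wait) { return wait ? 1 : 0; }
    #else
    static inline int sync_blockdev_demo(int wait) { (void)wait; return 0; }
    #endif

    int main(void) { printf("%d\n", sync_blockdev_demo(1)); return 0; }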
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 82d9c42b8bac..5612880fcbe7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/falloc.h>
18 19
19#include <asm/ioctls.h> 20#include <asm/ioctls.h>
20 21
@@ -70,9 +71,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
70 res = get_user(block, p); 71 res = get_user(block, p);
71 if (res) 72 if (res)
72 return res; 73 return res;
73 lock_kernel();
74 res = mapping->a_ops->bmap(mapping, block); 74 res = mapping->a_ops->bmap(mapping, block);
75 unlock_kernel();
76 return put_user(res, p); 75 return put_user(res, p);
77} 76}
78 77
@@ -405,6 +404,37 @@ EXPORT_SYMBOL(generic_block_fiemap);
405 404
406#endif /* CONFIG_BLOCK */ 405#endif /* CONFIG_BLOCK */
407 406
407/*
408 * This provides compatibility with legacy XFS pre-allocation ioctls
409 * which predate the fallocate syscall.
410 *
411 * Only the l_start, l_len and l_whence fields of the 'struct space_resv'
412 * are used here, rest are ignored.
413 */
414int ioctl_preallocate(struct file *filp, void __user *argp)
415{
416 struct inode *inode = filp->f_path.dentry->d_inode;
417 struct space_resv sr;
418
419 if (copy_from_user(&sr, argp, sizeof(sr)))
420 return -EFAULT;
421
422 switch (sr.l_whence) {
423 case SEEK_SET:
424 break;
425 case SEEK_CUR:
426 sr.l_start += filp->f_pos;
427 break;
428 case SEEK_END:
429 sr.l_start += i_size_read(inode);
430 break;
431 default:
432 return -EINVAL;
433 }
434
435 return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
436}
437
408static int file_ioctl(struct file *filp, unsigned int cmd, 438static int file_ioctl(struct file *filp, unsigned int cmd,
409 unsigned long arg) 439 unsigned long arg)
410{ 440{
@@ -414,12 +444,11 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
414 switch (cmd) { 444 switch (cmd) {
415 case FIBMAP: 445 case FIBMAP:
416 return ioctl_fibmap(filp, p); 446 return ioctl_fibmap(filp, p);
417 case FS_IOC_FIEMAP:
418 return ioctl_fiemap(filp, arg);
419 case FIGETBSZ:
420 return put_user(inode->i_sb->s_blocksize, p);
421 case FIONREAD: 447 case FIONREAD:
422 return put_user(i_size_read(inode) - filp->f_pos, p); 448 return put_user(i_size_read(inode) - filp->f_pos, p);
449 case FS_IOC_RESVSP:
450 case FS_IOC_RESVSP64:
451 return ioctl_preallocate(filp, p);
423 } 452 }
424 453
425 return vfs_ioctl(filp, cmd, arg); 454 return vfs_ioctl(filp, cmd, arg);
@@ -557,6 +586,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
557 error = ioctl_fsthaw(filp); 586 error = ioctl_fsthaw(filp);
558 break; 587 break;
559 588
589 case FS_IOC_FIEMAP:
590 return ioctl_fiemap(filp, arg);
591
592 case FIGETBSZ:
593 {
594 struct inode *inode = filp->f_path.dentry->d_inode;
595 int __user *p = (int __user *)arg;
596 return put_user(inode->i_sb->s_blocksize, p);
597 }
598
560 default: 599 default:
561 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 600 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
562 error = file_ioctl(filp, cmd, arg); 601 error = file_ioctl(filp, cmd, arg);
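[Editor's note] ioctl_preallocate() above routes the legacy XFS FS_IOC_RESVSP/FS_IOC_RESVSP64 ioctls to do_fallocate() with FALLOC_FL_KEEP_SIZE. A hedged userspace sketch of invoking it on a kernel carrying this patch; the struct layout and ioctl number are mirrored by hand from the patch's <linux/falloc.h>, since installed uapi headers may not expose them — verify against your own headers before relying on this:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    /* mirrored from the patch's <linux/falloc.h> */
    struct space_resv {
        int16_t  l_type;
        int16_t  l_whence;
        int64_t  l_start;
        int64_t  l_len;                 /* reservation length in bytes */
        int32_t  l_sysid;
        uint32_t l_pid;
        int32_t  l_pad[4];
    };
    #define FS_IOC_RESVSP _IOW('X', 40, struct space_resv)

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);
        struct space_resv sr = { .l_whence = SEEK_SET, .l_start = 0,
                                 .l_len = 1 << 20 };
        /* fails with EOPNOTSUPP if the filesystem lacks ->fallocate */
        if (fd < 0 || ioctl(fd, FS_IOC_RESVSP, &sr) < 0)
            perror("FS_IOC_RESVSP");
        if (fd >= 0)
            close(fd);
        return 0;
    }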
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 2f0dc5a14633..8ba5441063be 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -195,9 +195,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
195 * Do not report hidden files if so instructed, or associated 195 * Do not report hidden files if so instructed, or associated
196 * files unless instructed to do so 196 * files unless instructed to do so
197 */ 197 */
198 if ((sbi->s_hide == 'y' && 198 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
199 (de->flags[-sbi->s_high_sierra] & 1)) || 199 (!sbi->s_showassoc &&
200 (sbi->s_showassoc =='n' &&
201 (de->flags[-sbi->s_high_sierra] & 4))) { 200 (de->flags[-sbi->s_high_sierra] & 4))) {
202 filp->f_pos += de_len; 201 filp->f_pos += de_len;
203 continue; 202 continue;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index b4cbe9603c7d..85f96bc651c7 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
42static void isofs_put_super(struct super_block *sb) 42static void isofs_put_super(struct super_block *sb)
43{ 43{
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 44 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45
45#ifdef CONFIG_JOLIET 46#ifdef CONFIG_JOLIET
47 lock_kernel();
48
46 if (sbi->s_nls_iocharset) { 49 if (sbi->s_nls_iocharset) {
47 unload_nls(sbi->s_nls_iocharset); 50 unload_nls(sbi->s_nls_iocharset);
48 sbi->s_nls_iocharset = NULL; 51 sbi->s_nls_iocharset = NULL;
49 } 52 }
53
54 unlock_kernel();
50#endif 55#endif
51 56
52 kfree(sbi); 57 kfree(sbi);
@@ -136,13 +141,17 @@ static const struct dentry_operations isofs_dentry_ops[] = {
136}; 141};
137 142
138struct iso9660_options{ 143struct iso9660_options{
139 char map; 144 unsigned int rock:1;
140 char rock; 145 unsigned int joliet:1;
141 char joliet; 146 unsigned int cruft:1;
142 char cruft; 147 unsigned int hide:1;
143 char hide; 148 unsigned int showassoc:1;
144 char showassoc; 149 unsigned int nocompress:1;
145 char nocompress; 150 unsigned int overriderockperm:1;
151 unsigned int uid_set:1;
152 unsigned int gid_set:1;
153 unsigned int utf8:1;
154 unsigned char map;
146 unsigned char check; 155 unsigned char check;
147 unsigned int blocksize; 156 unsigned int blocksize;
148 mode_t fmode; 157 mode_t fmode;
@@ -150,7 +159,6 @@ struct iso9660_options{
150 gid_t gid; 159 gid_t gid;
151 uid_t uid; 160 uid_t uid;
152 char *iocharset; 161 char *iocharset;
153 unsigned char utf8;
154 /* LVE */ 162 /* LVE */
155 s32 session; 163 s32 session;
156 s32 sbsector; 164 s32 sbsector;
@@ -307,7 +315,7 @@ enum {
307 Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, 315 Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
308 Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, 316 Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
309 Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, 317 Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, 318 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
311}; 319};
312 320
313static const match_table_t tokens = { 321static const match_table_t tokens = {
@@ -335,6 +343,7 @@ static const match_table_t tokens = {
335 {Opt_gid, "gid=%u"}, 343 {Opt_gid, "gid=%u"},
336 {Opt_mode, "mode=%u"}, 344 {Opt_mode, "mode=%u"},
337 {Opt_dmode, "dmode=%u"}, 345 {Opt_dmode, "dmode=%u"},
346 {Opt_overriderockperm, "overriderockperm"},
338 {Opt_block, "block=%u"}, 347 {Opt_block, "block=%u"},
339 {Opt_ignore, "conv=binary"}, 348 {Opt_ignore, "conv=binary"},
340 {Opt_ignore, "conv=b"}, 349 {Opt_ignore, "conv=b"},
@@ -354,24 +363,22 @@ static int parse_options(char *options, struct iso9660_options *popt)
354 int option; 363 int option;
355 364
356 popt->map = 'n'; 365 popt->map = 'n';
357 popt->rock = 'y'; 366 popt->rock = 1;
358 popt->joliet = 'y'; 367 popt->joliet = 1;
359 popt->cruft = 'n'; 368 popt->cruft = 0;
360 popt->hide = 'n'; 369 popt->hide = 0;
361 popt->showassoc = 'n'; 370 popt->showassoc = 0;
362 popt->check = 'u'; /* unset */ 371 popt->check = 'u'; /* unset */
363 popt->nocompress = 0; 372 popt->nocompress = 0;
364 popt->blocksize = 1024; 373 popt->blocksize = 1024;
365 popt->fmode = popt->dmode = S_IRUGO | S_IXUGO; /* 374 popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
366 * r-x for all. The disc could 375 popt->uid_set = 0;
367 * be shared with DOS machines so 376 popt->gid_set = 0;
368 * virtually anything could be
369 * a valid executable.
370 */
371 popt->gid = 0; 377 popt->gid = 0;
372 popt->uid = 0; 378 popt->uid = 0;
373 popt->iocharset = NULL; 379 popt->iocharset = NULL;
374 popt->utf8 = 0; 380 popt->utf8 = 0;
381 popt->overriderockperm = 0;
375 popt->session=-1; 382 popt->session=-1;
376 popt->sbsector=-1; 383 popt->sbsector=-1;
377 if (!options) 384 if (!options)
@@ -388,20 +395,20 @@ static int parse_options(char *options, struct iso9660_options *popt)
388 token = match_token(p, tokens, args); 395 token = match_token(p, tokens, args);
389 switch (token) { 396 switch (token) {
390 case Opt_norock: 397 case Opt_norock:
391 popt->rock = 'n'; 398 popt->rock = 0;
392 break; 399 break;
393 case Opt_nojoliet: 400 case Opt_nojoliet:
394 popt->joliet = 'n'; 401 popt->joliet = 0;
395 break; 402 break;
396 case Opt_hide: 403 case Opt_hide:
397 popt->hide = 'y'; 404 popt->hide = 1;
398 break; 405 break;
399 case Opt_unhide: 406 case Opt_unhide:
400 case Opt_showassoc: 407 case Opt_showassoc:
401 popt->showassoc = 'y'; 408 popt->showassoc = 1;
402 break; 409 break;
403 case Opt_cruft: 410 case Opt_cruft:
404 popt->cruft = 'y'; 411 popt->cruft = 1;
405 break; 412 break;
406 case Opt_utf8: 413 case Opt_utf8:
407 popt->utf8 = 1; 414 popt->utf8 = 1;
@@ -445,11 +452,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
445 if (match_int(&args[0], &option)) 452 if (match_int(&args[0], &option))
446 return 0; 453 return 0;
447 popt->uid = option; 454 popt->uid = option;
455 popt->uid_set = 1;
448 break; 456 break;
449 case Opt_gid: 457 case Opt_gid:
450 if (match_int(&args[0], &option)) 458 if (match_int(&args[0], &option))
451 return 0; 459 return 0;
452 popt->gid = option; 460 popt->gid = option;
461 popt->gid_set = 1;
453 break; 462 break;
454 case Opt_mode: 463 case Opt_mode:
455 if (match_int(&args[0], &option)) 464 if (match_int(&args[0], &option))
@@ -461,6 +470,9 @@ static int parse_options(char *options, struct iso9660_options *popt)
461 return 0; 470 return 0;
462 popt->dmode = option; 471 popt->dmode = option;
463 break; 472 break;
473 case Opt_overriderockperm:
474 popt->overriderockperm = 1;
475 break;
464 case Opt_block: 476 case Opt_block:
465 if (match_int(&args[0], &option)) 477 if (match_int(&args[0], &option))
466 return 0; 478 return 0;
@@ -620,7 +632,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
620 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { 632 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
621 sec = (struct iso_supplementary_descriptor *)vdp; 633 sec = (struct iso_supplementary_descriptor *)vdp;
622 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { 634 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
623 if (opt.joliet == 'y') { 635 if (opt.joliet) {
624 if (sec->escape[2] == 0x40) 636 if (sec->escape[2] == 0x40)
625 joliet_level = 1; 637 joliet_level = 1;
626 else if (sec->escape[2] == 0x43) 638 else if (sec->escape[2] == 0x43)
@@ -645,7 +657,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
645 goto out_freebh; 657 goto out_freebh;
646 658
647 sbi->s_high_sierra = 1; 659 sbi->s_high_sierra = 1;
648 opt.rock = 'n'; 660 opt.rock = 0;
649 h_pri = (struct hs_primary_descriptor *)vdp; 661 h_pri = (struct hs_primary_descriptor *)vdp;
650 goto root_found; 662 goto root_found;
651 } 663 }
@@ -668,7 +680,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
668 680
669root_found: 681root_found:
670 682
671 if (joliet_level && (pri == NULL || opt.rock == 'n')) { 683 if (joliet_level && (pri == NULL || !opt.rock)) {
672 /* This is the case of Joliet with the norock mount flag. 684 /* This is the case of Joliet with the norock mount flag.
673 * A disc with both Joliet and Rock Ridge is handled later 685 * A disc with both Joliet and Rock Ridge is handled later
674 */ 686 */
@@ -797,22 +809,31 @@ root_found:
797 s->s_op = &isofs_sops; 809 s->s_op = &isofs_sops;
798 s->s_export_op = &isofs_export_ops; 810 s->s_export_op = &isofs_export_ops;
799 sbi->s_mapping = opt.map; 811 sbi->s_mapping = opt.map;
800 sbi->s_rock = (opt.rock == 'y' ? 2 : 0); 812 sbi->s_rock = (opt.rock ? 2 : 0);
801 sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ 813 sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
802 sbi->s_cruft = opt.cruft; 814 sbi->s_cruft = opt.cruft;
803 sbi->s_hide = opt.hide; 815 sbi->s_hide = opt.hide;
804 sbi->s_showassoc = opt.showassoc; 816 sbi->s_showassoc = opt.showassoc;
805 sbi->s_uid = opt.uid; 817 sbi->s_uid = opt.uid;
806 sbi->s_gid = opt.gid; 818 sbi->s_gid = opt.gid;
819 sbi->s_uid_set = opt.uid_set;
820 sbi->s_gid_set = opt.gid_set;
807 sbi->s_utf8 = opt.utf8; 821 sbi->s_utf8 = opt.utf8;
808 sbi->s_nocompress = opt.nocompress; 822 sbi->s_nocompress = opt.nocompress;
823 sbi->s_overriderockperm = opt.overriderockperm;
809 /* 824 /*
810 * It would be incredibly stupid to allow people to mark every file 825 * It would be incredibly stupid to allow people to mark every file
811 * on the disk as suid, so we merely allow them to set the default 826 * on the disk as suid, so we merely allow them to set the default
812 * permissions. 827 * permissions.
813 */ 828 */
814 sbi->s_fmode = opt.fmode & 0777; 829 if (opt.fmode != ISOFS_INVALID_MODE)
815 sbi->s_dmode = opt.dmode & 0777; 830 sbi->s_fmode = opt.fmode & 0777;
831 else
832 sbi->s_fmode = ISOFS_INVALID_MODE;
833 if (opt.dmode != ISOFS_INVALID_MODE)
834 sbi->s_dmode = opt.dmode & 0777;
835 else
836 sbi->s_dmode = ISOFS_INVALID_MODE;
816 837
817 /* 838 /*
818 * Read the root inode, which _may_ result in changing 839 * Read the root inode, which _may_ result in changing
@@ -1090,18 +1111,6 @@ static const struct address_space_operations isofs_aops = {
1090 .bmap = _isofs_bmap 1111 .bmap = _isofs_bmap
1091}; 1112};
1092 1113
1093static inline void test_and_set_uid(uid_t *p, uid_t value)
1094{
1095 if (value)
1096 *p = value;
1097}
1098
1099static inline void test_and_set_gid(gid_t *p, gid_t value)
1100{
1101 if (value)
1102 *p = value;
1103}
1104
1105static int isofs_read_level3_size(struct inode *inode) 1114static int isofs_read_level3_size(struct inode *inode)
1106{ 1115{
1107 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1116 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -1256,7 +1265,10 @@ static int isofs_read_inode(struct inode *inode)
1256 ei->i_file_format = isofs_file_normal; 1265 ei->i_file_format = isofs_file_normal;
1257 1266
1258 if (de->flags[-high_sierra] & 2) { 1267 if (de->flags[-high_sierra] & 2) {
1259 inode->i_mode = sbi->s_dmode | S_IFDIR; 1268 if (sbi->s_dmode != ISOFS_INVALID_MODE)
1269 inode->i_mode = S_IFDIR | sbi->s_dmode;
1270 else
1271 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
1260 inode->i_nlink = 1; /* 1272 inode->i_nlink = 1; /*
1261 * Set to 1. We know there are 2, but 1273 * Set to 1. We know there are 2, but
1262 * the find utility tries to optimize 1274 * the find utility tries to optimize
@@ -1265,8 +1277,16 @@ static int isofs_read_inode(struct inode *inode)
1265 * do it the hard way. 1277 * do it the hard way.
1266 */ 1278 */
1267 } else { 1279 } else {
1268 /* Everybody gets to read the file. */ 1280 if (sbi->s_fmode != ISOFS_INVALID_MODE) {
1269 inode->i_mode = sbi->s_fmode | S_IFREG; 1281 inode->i_mode = S_IFREG | sbi->s_fmode;
1282 } else {
1283 /*
1284 * Set default permissions: r-x for all. The disc
1285 * could be shared with DOS machines so virtually
1286 * anything could be a valid executable.
1287 */
1288 inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
1289 }
1270 inode->i_nlink = 1; 1290 inode->i_nlink = 1;
1271 } 1291 }
1272 inode->i_uid = sbi->s_uid; 1292 inode->i_uid = sbi->s_uid;
@@ -1295,7 +1315,7 @@ static int isofs_read_inode(struct inode *inode)
1295 * this CDROM was mounted with the cruft option. 1315 * this CDROM was mounted with the cruft option.
1296 */ 1316 */
1297 1317
1298 if (sbi->s_cruft == 'y') 1318 if (sbi->s_cruft)
1299 inode->i_size &= 0x00ffffff; 1319 inode->i_size &= 0x00ffffff;
1300 1320
1301 if (de->interleave[0]) { 1321 if (de->interleave[0]) {
@@ -1341,9 +1361,18 @@ static int isofs_read_inode(struct inode *inode)
1341 if (!high_sierra) { 1361 if (!high_sierra) {
1342 parse_rock_ridge_inode(de, inode); 1362 parse_rock_ridge_inode(de, inode);
1343 /* if we want uid/gid set, override the rock ridge setting */ 1363 /* if we want uid/gid set, override the rock ridge setting */
1344 test_and_set_uid(&inode->i_uid, sbi->s_uid); 1364 if (sbi->s_uid_set)
1345 test_and_set_gid(&inode->i_gid, sbi->s_gid); 1365 inode->i_uid = sbi->s_uid;
1366 if (sbi->s_gid_set)
1367 inode->i_gid = sbi->s_gid;
1346 } 1368 }
1369 /* Now set final access rights if overriding rock ridge setting */
1370 if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
1371 sbi->s_dmode != ISOFS_INVALID_MODE)
1372 inode->i_mode = S_IFDIR | sbi->s_dmode;
1373 if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
1374 sbi->s_fmode != ISOFS_INVALID_MODE)
1375 inode->i_mode = S_IFREG | sbi->s_fmode;
1347 1376
1348 /* Install the inode operations vector */ 1377 /* Install the inode operations vector */
1349 if (S_ISREG(inode->i_mode)) { 1378 if (S_ISREG(inode->i_mode)) {
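[Editor's note] The fmode=/dmode= handling above now uses the ISOFS_INVALID_MODE sentinel to distinguish "option not given" from "option explicitly given as 0", rather than unconditionally defaulting every file to r-x. A small standalone model of the sentinel logic (0555 stands in for S_IRUGO|S_IXUGO):

    #include <stdio.h>
    #include <sys/types.h>

    #define INVALID_MODE ((mode_t)-1)   /* stand-in for ISOFS_INVALID_MODE */

    /* If the user passed fmode=, honor it (masked to 0777); otherwise fall
     * back to r-x for all, the safe default for discs shared with DOS. */
    static mode_t effective_fmode(mode_t opt)
    {
        return (opt != INVALID_MODE) ? (opt & 0777) : (mode_t)0555;
    }

    int main(void)
    {
        printf("%o %o %o\n",
               (unsigned)effective_fmode(INVALID_MODE), /* 555: unset */
               (unsigned)effective_fmode(0640),         /* 640: explicit */
               (unsigned)effective_fmode(0));           /* 0: explicit zero */
        return 0;
    }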
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index ccbf72faf27a..7d33de84f52a 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -35,21 +35,20 @@ struct isofs_sb_info {
35 unsigned long s_log_zone_size; 35 unsigned long s_log_zone_size;
36 unsigned long s_max_size; 36 unsigned long s_max_size;
37 37
38 unsigned char s_high_sierra; /* A simple flag */
39 unsigned char s_mapping;
40 int s_rock_offset; /* offset of SUSP fields within SU area */ 38 int s_rock_offset; /* offset of SUSP fields within SU area */
41 unsigned char s_rock;
42 unsigned char s_joliet_level; 39 unsigned char s_joliet_level;
43 unsigned char s_utf8; 40 unsigned char s_mapping;
44 unsigned char s_cruft; /* Broken disks with high 41 unsigned int s_high_sierra:1;
45 byte of length containing 42 unsigned int s_rock:2;
46 junk */ 43 unsigned int s_utf8:1;
47 unsigned char s_unhide; 44 unsigned int s_cruft:1; /* Broken disks with high byte of length
48 unsigned char s_nosuid; 45 * containing junk */
49 unsigned char s_nodev; 46 unsigned int s_nocompress:1;
50 unsigned char s_nocompress; 47 unsigned int s_hide:1;
51 unsigned char s_hide; 48 unsigned int s_showassoc:1;
52 unsigned char s_showassoc; 49 unsigned int s_overriderockperm:1;
50 unsigned int s_uid_set:1;
51 unsigned int s_gid_set:1;
53 52
54 mode_t s_fmode; 53 mode_t s_fmode;
55 mode_t s_dmode; 54 mode_t s_dmode;
@@ -58,6 +57,8 @@ struct isofs_sb_info {
58 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
59}; 58};
60 59
60#define ISOFS_INVALID_MODE ((mode_t) -1)
61
61static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) 62static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
62{ 63{
63 return sb->s_fs_info; 64 return sb->s_fs_info;
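[Editor's note] Both iso9660_options and isofs_sb_info above trade one unsigned char per boolean flag for single-bit bitfields, which also makes the flags honest booleans instead of 'y'/'n' characters. A quick demonstration of the packing effect; exact sizes are ABI-dependent:

    #include <stdio.h>

    struct flags_chars {                /* old style: one byte per flag */
        unsigned char rock, joliet, cruft, hide, showassoc, nocompress;
    };

    struct flags_bits {                 /* new style: one bit per flag */
        unsigned int rock:1, joliet:1, cruft:1,
                     hide:1, showassoc:1, nocompress:1;
    };

    int main(void)
    {
        /* typically 6 vs 4 bytes on common ABIs */
        printf("%zu %zu\n", sizeof(struct flags_chars),
               sizeof(struct flags_bits));
        return 0;
    }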
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 92c14b850e9c..a048de81c093 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
37 return (op - ascii); 37 return (op - ascii);
38} 38}
39 39
40/* Convert big endian wide character string to utf8 */
41static int
42wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
43{
44 const __u8 *ip;
45 __u8 *op;
46 int size;
47 __u16 c;
48
49 op = s;
50 ip = pwcs;
51 while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
52 c = (*ip << 8) | ip[1];
53 if (c > 0x7f) {
54 size = utf8_wctomb(op, c, maxlen);
55 if (size == -1) {
56 /* Ignore character and move on */
57 maxlen--;
58 } else {
59 op += size;
60 maxlen -= size;
61 }
62 } else {
63 *op++ = (__u8) c;
64 }
65 ip += 2;
66 inlen--;
67 }
68 return (op - s);
69}
70
71int 40int
72get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) 41get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
73{ 42{
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
79 nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; 48 nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
80 49
81 if (utf8) { 50 if (utf8) {
82 len = wcsntombs_be(outname, de->name, 51 len = utf16s_to_utf8s((const wchar_t *) de->name,
83 de->name_len[0] >> 1, PAGE_SIZE); 52 de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
53 outname, PAGE_SIZE);
84 } else { 54 } else {
85 len = uni16_to_x8(outname, (__be16 *) de->name, 55 len = uni16_to_x8(outname, (__be16 *) de->name,
86 de->name_len[0] >> 1, nls); 56 de->name_len[0] >> 1, nls);
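[Editor's note] The hand-rolled wcsntombs_be() is dropped above in favor of the shared NLS helper utf16s_to_utf8s(), which, unlike the removed loop, can also combine surrogate pairs. For reference, a self-contained userspace converter doing roughly what the removed helper did — BMP-only UTF-16BE to UTF-8; this is an illustrative reimplementation, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    /* BMP-only UTF-16BE -> UTF-8; surrogate pairs are not combined,
     * mirroring the limitation of the removed wcsntombs_be() */
    static int utf16be_to_utf8(uint8_t *out, const uint8_t *in,
                               int inlen, int maxlen)
    {
        uint8_t *op = out;
        while (inlen-- > 0 && maxlen > 0) {
            uint16_t c = (uint16_t)((in[0] << 8) | in[1]);
            in += 2;
            if (c < 0x80) {
                *op++ = (uint8_t)c;                 maxlen -= 1;
            } else if (c < 0x800 && maxlen >= 2) {
                *op++ = (uint8_t)(0xc0 | (c >> 6));
                *op++ = (uint8_t)(0x80 | (c & 0x3f)); maxlen -= 2;
            } else if (maxlen >= 3) {
                *op++ = (uint8_t)(0xe0 | (c >> 12));
                *op++ = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
                *op++ = (uint8_t)(0x80 | (c & 0x3f)); maxlen -= 3;
            } else {
                break;                              /* out of room */
            }
        }
        return (int)(op - out);
    }

    int main(void)
    {
        const uint8_t name[] = { 0x00, 'A', 0x00, 0xe9 }; /* "Aé" in UTF-16BE */
        uint8_t buf[16];
        int n = utf16be_to_utf8(buf, name, 2, sizeof(buf));
        fwrite(buf, 1, (size_t)n, stdout);
        putchar('\n');
        return 0;
    }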
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 8299889a835e..eaa831311c9c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -142,9 +142,9 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
142 */ 142 */
143 match = 0; 143 match = 0;
144 if (dlen > 0 && 144 if (dlen > 0 &&
145 (sbi->s_hide =='n' || 145 (!sbi->s_hide ||
146 (!(de->flags[-sbi->s_high_sierra] & 1))) && 146 (!(de->flags[-sbi->s_high_sierra] & 1))) &&
147 (sbi->s_showassoc =='y' || 147 (sbi->s_showassoc ||
148 (!(de->flags[-sbi->s_high_sierra] & 4)))) { 148 (!(de->flags[-sbi->s_high_sierra] & 4)))) {
149 match = (isofs_cmp(dentry, dpnt, dlen) == 0); 149 match = (isofs_cmp(dentry, dpnt, dlen) == 0);
150 } 150 }
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
287 struct page *new_page; 287 struct page *new_page;
288 unsigned int new_offset; 288 unsigned int new_offset;
289 struct buffer_head *bh_in = jh2bh(jh_in); 289 struct buffer_head *bh_in = jh2bh(jh_in);
290 journal_t *journal = transaction->t_journal;
290 291
291 /* 292 /*
292 * The buffer really shouldn't be locked: only the current committing 293 * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
300 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 301 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
301 302
302 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 303 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
304 /* keep subsequent assertions sane */
305 new_bh->b_state = 0;
306 init_buffer(new_bh, NULL, NULL);
307 atomic_set(&new_bh->b_count, 1);
308 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
303 309
304 /* 310 /*
305 * If a new transaction has already done a buffer copy-out, then 311 * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
361 kunmap_atomic(mapped_data, KM_USER0); 367 kunmap_atomic(mapped_data, KM_USER0);
362 } 368 }
363 369
364 /* keep subsequent assertions sane */
365 new_bh->b_state = 0;
366 init_buffer(new_bh, NULL, NULL);
367 atomic_set(&new_bh->b_count, 1);
368 jbd_unlock_bh_state(bh_in);
369
370 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
371
372 set_bh_page(new_bh, new_page, new_offset); 370 set_bh_page(new_bh, new_page, new_offset);
373 new_jh->b_transaction = NULL; 371 new_jh->b_transaction = NULL;
374 new_bh->b_size = jh2bh(jh_in)->b_size; 372 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
385 * copying is moved to the transaction's shadow queue. 383 * copying is moved to the transaction's shadow queue.
386 */ 384 */
387 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 385 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
388 journal_file_buffer(jh_in, transaction, BJ_Shadow); 386 spin_lock(&journal->j_list_lock);
387 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
388 spin_unlock(&journal->j_list_lock);
389 jbd_unlock_bh_state(bh_in);
390
389 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 391 JBUFFER_TRACE(new_jh, "file as BJ_IO");
390 journal_file_buffer(new_jh, transaction, BJ_IO); 392 journal_file_buffer(new_jh, transaction, BJ_IO);
391 393
@@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal)
848 850
849 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
850 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
855 first, last);
856 journal_fail_superblock(journal);
857 return -EINVAL;
858 }
851 859
852 journal->j_first = first; 860 journal->j_first = first;
853 journal->j_last = last; 861 journal->j_last = last;
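[Editor's note] journal_reset() above now rejects a journal whose on-disk geometry leaves fewer than JFS_MIN_JOURNAL_BLOCKS usable blocks instead of misbehaving later on a corrupt superblock. The usable block count is last - first + 1, so first + JFS_MIN_JOURNAL_BLOCKS > last + 1 is exactly "usable < minimum", written so it also behaves sanely in unsigned arithmetic when corruption makes first > last. A tiny standalone version of the check, assuming the constant's usual value of 1024:

    #include <stdio.h>

    #define MIN_JOURNAL_BLOCKS 1024UL   /* JFS_MIN_JOURNAL_BLOCKS in jbd */

    static int journal_geometry_ok(unsigned long first, unsigned long last)
    {
        /* usable = last - first + 1; compare without the subtraction so a
         * corrupt sb with first > last can't wrap the unsigned math */
        return !(first + MIN_JOURNAL_BLOCKS > last + 1);
    }

    int main(void)
    {
        printf("%d %d\n",
               journal_geometry_ok(1, 1024),    /* 1024 blocks: ok */
               journal_geometry_ok(1, 1023));   /* 1023 blocks: too short */
        return 0;
    }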
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ed886e6db399..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
489 wake_up(&journal->j_wait_transaction_locked); 489 wake_up(&journal->j_wait_transaction_locked);
490} 490}
491 491
492/* 492static void warn_dirty_buffer(struct buffer_head *bh)
493 * Report any unexpected dirty buffers which turn up. Normally those
494 * indicate an error, but they can occur if the user is running (say)
495 * tune2fs to modify the live filesystem, so we need the option of
496 * continuing as gracefully as possible. #
497 *
498 * The caller should already hold the journal lock and
499 * j_list_lock spinlock: most callers will need those anyway
500 * in order to probe the buffer's journaling state safely.
501 */
502static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
503{ 493{
504 int jlist; 494 char b[BDEVNAME_SIZE];
505
506 /* If this buffer is one which might reasonably be dirty
507 * --- ie. data, or not part of this journal --- then
508 * we're OK to leave it alone, but otherwise we need to
509 * move the dirty bit to the journal's own internal
510 * JBDDirty bit. */
511 jlist = jh->b_jlist;
512
513 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
514 jlist == BJ_Shadow || jlist == BJ_Forget) {
515 struct buffer_head *bh = jh2bh(jh);
516 495
517 if (test_clear_buffer_dirty(bh)) 496 printk(KERN_WARNING
518 set_buffer_jbddirty(bh); 497 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
519 } 498 "There's a risk of filesystem corruption in case of system "
499 "crash.\n",
500 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
520} 501}
521 502
522/* 503/*
@@ -583,14 +564,16 @@ repeat:
583 if (jh->b_next_transaction) 564 if (jh->b_next_transaction)
584 J_ASSERT_JH(jh, jh->b_next_transaction == 565 J_ASSERT_JH(jh, jh->b_next_transaction ==
585 transaction); 566 transaction);
567 warn_dirty_buffer(bh);
586 } 568 }
587 /* 569 /*
588 * In any case we need to clean the dirty flag and we must 570 * In any case we need to clean the dirty flag and we must
589 * do it under the buffer lock to be sure we don't race 571 * do it under the buffer lock to be sure we don't race
590 * with running write-out. 572 * with running write-out.
591 */ 573 */
592 JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 574 JBUFFER_TRACE(jh, "Journalling dirty buffer");
593 jbd_unexpected_dirty_buffer(jh); 575 clear_buffer_dirty(bh);
576 set_buffer_jbddirty(bh);
594 } 577 }
595 578
596 unlock_buffer(bh); 579 unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
826 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 809 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
827 810
828 if (jh->b_transaction == NULL) { 811 if (jh->b_transaction == NULL) {
812 /*
813 * Previous journal_forget() could have left the buffer
814 * with jbddirty bit set because it was being committed. When
815 * the commit finished, we've filed the buffer for
816 * checkpointing and marked it dirty. Now we are reallocating
817 * the buffer so the transaction freeing it must have
818 * committed and so it's safe to clear the dirty bit.
819 */
820 clear_buffer_dirty(jh2bh(jh));
829 jh->b_transaction = transaction; 821 jh->b_transaction = transaction;
830 822
831 /* first access by this transaction */ 823 /* first access by this transaction */
@@ -1686,35 +1678,6 @@ out:
1686 return; 1678 return;
1687} 1679}
1688 1680
1689/*
1690 * journal_try_to_free_buffers() could race with journal_commit_transaction()
1691 * The latter might still hold the a count on buffers when inspecting
1692 * them on t_syncdata_list or t_locked_list.
1693 *
1694 * journal_try_to_free_buffers() will call this function to
1695 * wait for the current transaction to finish syncing data buffers, before
1696 * tryinf to free that buffer.
1697 *
1698 * Called with journal->j_state_lock held.
1699 */
1700static void journal_wait_for_transaction_sync_data(journal_t *journal)
1701{
1702 transaction_t *transaction = NULL;
1703 tid_t tid;
1704
1705 spin_lock(&journal->j_state_lock);
1706 transaction = journal->j_committing_transaction;
1707
1708 if (!transaction) {
1709 spin_unlock(&journal->j_state_lock);
1710 return;
1711 }
1712
1713 tid = transaction->t_tid;
1714 spin_unlock(&journal->j_state_lock);
1715 log_wait_commit(journal, tid);
1716}
1717
1718/** 1681/**
1719 * int journal_try_to_free_buffers() - try to free page buffers. 1682 * int journal_try_to_free_buffers() - try to free page buffers.
1720 * @journal: journal for operation 1683 * @journal: journal for operation
@@ -1786,25 +1749,6 @@ int journal_try_to_free_buffers(journal_t *journal,
1786 1749
1787 ret = try_to_free_buffers(page); 1750 ret = try_to_free_buffers(page);
1788 1751
1789 /*
1790 * There are a number of places where journal_try_to_free_buffers()
1791 * could race with journal_commit_transaction(), the later still
1792 * holds the reference to the buffers to free while processing them.
1793 * try_to_free_buffers() failed to free those buffers. Some of the
1794 * caller of releasepage() request page buffers to be dropped, otherwise
1795 * treat the fail-to-free as errors (such as generic_file_direct_IO())
1796 *
1797 * So, if the caller of try_to_release_page() wants the synchronous
1798 * behaviour(i.e make sure buffers are dropped upon return),
1799 * let's wait for the current transaction to finish flush of
1800 * dirty data buffers, then try to free those buffers again,
1801 * with the journal locked.
1802 */
1803 if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
1804 journal_wait_for_transaction_sync_data(journal);
1805 ret = try_to_free_buffers(page);
1806 }
1807
1808busy: 1752busy:
1809 return ret; 1753 return ret;
1810} 1754}
@@ -1830,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1830 1774
1831 if (jh->b_cp_transaction) { 1775 if (jh->b_cp_transaction) {
1832 JBUFFER_TRACE(jh, "on running+cp transaction"); 1776 JBUFFER_TRACE(jh, "on running+cp transaction");
1777 /*
1778 * We don't want to write the buffer anymore, clear the
1779 * bit so that we don't confuse checks in
1780 * __journal_file_buffer
1781 */
1782 clear_buffer_dirty(bh);
1833 __journal_file_buffer(jh, transaction, BJ_Forget); 1783 __journal_file_buffer(jh, transaction, BJ_Forget);
1834 clear_buffer_jbddirty(bh);
1835 may_free = 0; 1784 may_free = 0;
1836 } else { 1785 } else {
1837 JBUFFER_TRACE(jh, "on running transaction"); 1786 JBUFFER_TRACE(jh, "on running transaction");
@@ -2089,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
2089 if (jh->b_transaction && jh->b_jlist == jlist) 2038 if (jh->b_transaction && jh->b_jlist == jlist)
2090 return; 2039 return;
2091 2040
2092 /* The following list of buffer states needs to be consistent
2093 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
2094 * state. */
2095
2096 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 2041 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2097 jlist == BJ_Shadow || jlist == BJ_Forget) { 2042 jlist == BJ_Shadow || jlist == BJ_Forget) {
2043 /*
2044 * For metadata buffers, we track dirty bit in buffer_jbddirty
2045 * instead of buffer_dirty. We should not see a dirty bit set
2046 * here because we clear it in do_get_write_access but e.g.
2047 * tune2fs can modify the sb and set the dirty bit at any time
2048 * so we try to gracefully handle that.
2049 */
2050 if (buffer_dirty(bh))
2051 warn_dirty_buffer(bh);
2098 if (test_clear_buffer_dirty(bh) || 2052 if (test_clear_buffer_dirty(bh) ||
2099 test_clear_buffer_jbddirty(bh)) 2053 test_clear_buffer_jbddirty(bh))
2100 was_dirty = 1; 2054 was_dirty = 1;
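[Editor's note] The jbd changes above (mirrored for jbd2 further down) enforce one invariant: a buffer filed on a metadata-type journal list carries its dirtiness in the JBDDirty bit, never in the plain dirty bit, and a plain dirty bit seen there is worth a warning (e.g. tune2fs writing to a mounted filesystem). A toy model of the bit transfer; the flag names and function are hypothetical, not buffer_head APIs:

    #include <stdio.h>

    #define BH_DIRTY     (1u << 0)      /* the VM's notion of dirty */
    #define BH_JBDDIRTY  (1u << 1)      /* the journal's notion of dirty */

    /* file a metadata buffer on a journal list: move any plain dirty bit
     * over to jbddirty, warning because it should not have been set */
    static unsigned int file_metadata(unsigned int state)
    {
        if (state & BH_DIRTY) {
            fprintf(stderr, "warning: dirty metadata buffer\n");
            state = (state & ~BH_DIRTY) | BH_JBDDIRTY;
        }
        return state;
    }

    int main(void)
    {
        printf("%#x\n", file_metadata(BH_DIRTY));   /* prints 0x2 */
        return 0;
    }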
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 17159cacbd9e..5d70b3e6d49b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,9 +20,9 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
24#include <linux/errno.h> 23#include <linux/errno.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
25#include <trace/events/jbd2.h>
26 26
27/* 27/*
28 * Unlink a buffer from a transaction checkpoint list. 28 * Unlink a buffer from a transaction checkpoint list.
@@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
358 * journal straight away. 358 * journal straight away.
359 */ 359 */
360 result = jbd2_cleanup_journal_tail(journal); 360 result = jbd2_cleanup_journal_tail(journal);
361 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", 361 trace_jbd2_checkpoint(journal, result);
362 journal->j_devname, result);
363 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 362 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
364 if (result <= 0) 363 if (result <= 0)
365 return result; 364 return result;
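[Editor's note] The jbd2 hunks in this area replace the old trace_mark() markers with real tracepoints (trace_jbd2_checkpoint() and friends). Their definitions live in include/trace/events/jbd2.h, which is outside this diff; the following is a sketch of the likely shape of one such TRACE_EVENT, kernel-only and not standalone-compilable — consult the actual header for the authoritative definition:

    /* sketch only: the real definition is in include/trace/events/jbd2.h */
    TRACE_EVENT(jbd2_checkpoint,
        TP_PROTO(journal_t *journal, int result),
        TP_ARGS(journal, result),
        TP_STRUCT__entry(
            __field(dev_t, dev)
            __field(int,   result)
        ),
        TP_fast_assign(
            __entry->dev    = journal->j_fs_dev->bd_dev;
            __entry->result = result;
        ),
        /* jbd2_dev_to_name() is the helper added in fs/jbd2/journal.c below */
        TP_printk("dev %s result %d",
                  jbd2_dev_to_name(__entry->dev), __entry->result)
    );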
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0b7d3b8226fd..7b4088b2364d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,7 +16,6 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
20#include <linux/errno.h> 19#include <linux/errno.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22#include <linux/mm.h> 21#include <linux/mm.h>
@@ -26,6 +25,7 @@
26#include <linux/writeback.h> 25#include <linux/writeback.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/bio.h> 27#include <linux/bio.h>
28#include <trace/events/jbd2.h>
29 29
30/* 30/*
31 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal,
253 * block allocation with delalloc. We need to write 253 * block allocation with delalloc. We need to write
254 * only allocated blocks here. 254 * only allocated blocks here.
255 */ 255 */
256 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
256 err = journal_submit_inode_data_buffers(mapping); 257 err = journal_submit_inode_data_buffers(mapping);
257 if (!ret) 258 if (!ret)
258 ret = err; 259 ret = err;
@@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
394 commit_transaction = journal->j_running_transaction; 395 commit_transaction = journal->j_running_transaction;
395 J_ASSERT(commit_transaction->t_state == T_RUNNING); 396 J_ASSERT(commit_transaction->t_state == T_RUNNING);
396 397
397 trace_mark(jbd2_start_commit, "dev %s transaction %d", 398 trace_jbd2_start_commit(journal, commit_transaction);
398 journal->j_devname, commit_transaction->t_tid);
399 jbd_debug(1, "JBD: starting commit of transaction %d\n", 399 jbd_debug(1, "JBD: starting commit of transaction %d\n",
400 commit_transaction->t_tid); 400 commit_transaction->t_tid);
401 401
@@ -409,6 +409,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
409 */ 409 */
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction);
412 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.u.run.rs_wait = commit_transaction->t_max_wait;
413 stats.u.run.rs_locked = jiffies; 414 stats.u.run.rs_locked = jiffies;
414 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -484,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
484 */ 485 */
485 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
486 487
488 trace_jbd2_commit_flushing(journal, commit_transaction);
487 stats.u.run.rs_flushing = jiffies; 489 stats.u.run.rs_flushing = jiffies;
488 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
489 stats.u.run.rs_flushing); 491 stats.u.run.rs_flushing);
@@ -520,6 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
520 commit_transaction->t_state = T_COMMIT; 522 commit_transaction->t_state = T_COMMIT;
521 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
522 524
525 trace_jbd2_commit_logging(journal, commit_transaction);
523 stats.u.run.rs_logging = jiffies; 526 stats.u.run.rs_logging = jiffies;
524 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
525 stats.u.run.rs_logging); 528 stats.u.run.rs_logging);
@@ -1054,9 +1057,7 @@ restart_loop:
1054 if (journal->j_commit_callback) 1057 if (journal->j_commit_callback)
1055 journal->j_commit_callback(journal, commit_transaction); 1058 journal->j_commit_callback(journal, commit_transaction);
1056 1059
1057 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1060 trace_jbd2_end_commit(journal, commit_transaction);
1058 journal->j_devname, commit_transaction->t_tid,
1059 journal->j_tail_sequence);
1060 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1061 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1061 journal->j_commit_sequence, journal->j_tail_sequence); 1062 journal->j_commit_sequence, journal->j_tail_sequence);
1062 if (to_free) 1063 if (to_free)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 58144102bf25..e378cb383979 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -38,6 +38,10 @@
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h>
42
43#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h>
41 45
42#include <asm/uaccess.h> 46#include <asm/uaccess.h>
43#include <asm/page.h> 47#include <asm/page.h>
@@ -293,6 +297,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
293 unsigned int new_offset; 297 unsigned int new_offset;
294 struct buffer_head *bh_in = jh2bh(jh_in); 298 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers; 299 struct jbd2_buffer_trigger_type *triggers;
300 journal_t *journal = transaction->t_journal;
296 301
297 /* 302 /*
298 * The buffer really shouldn't be locked: only the current committing 303 * The buffer really shouldn't be locked: only the current committing
@@ -306,6 +311,11 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
306 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 311 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
307 312
308 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 313 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
314 /* keep subsequent assertions sane */
315 new_bh->b_state = 0;
316 init_buffer(new_bh, NULL, NULL);
317 atomic_set(&new_bh->b_count, 1);
318 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
309 319
310 /* 320 /*
311 * If a new transaction has already done a buffer copy-out, then 321 * If a new transaction has already done a buffer copy-out, then
@@ -384,14 +394,6 @@ repeat:
384 kunmap_atomic(mapped_data, KM_USER0); 394 kunmap_atomic(mapped_data, KM_USER0);
385 } 395 }
386 396
387 /* keep subsequent assertions sane */
388 new_bh->b_state = 0;
389 init_buffer(new_bh, NULL, NULL);
390 atomic_set(&new_bh->b_count, 1);
391 jbd_unlock_bh_state(bh_in);
392
393 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
394
395 set_bh_page(new_bh, new_page, new_offset); 397 set_bh_page(new_bh, new_page, new_offset);
396 new_jh->b_transaction = NULL; 398 new_jh->b_transaction = NULL;
397 new_bh->b_size = jh2bh(jh_in)->b_size; 399 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -408,7 +410,11 @@ repeat:
408 * copying is moved to the transaction's shadow queue. 410 * copying is moved to the transaction's shadow queue.
409 */ 411 */
410 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 412 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
411 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 413 spin_lock(&journal->j_list_lock);
414 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
415 spin_unlock(&journal->j_list_lock);
416 jbd_unlock_bh_state(bh_in);
417
412 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 418 JBUFFER_TRACE(new_jh, "file as BJ_IO");
413 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); 419 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
414 420
@@ -1781,7 +1787,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1781 * Journal abort has very specific semantics, which we describe 1787 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1788 * for journal abort.
1783 * 1789 *
1784 * Two internal function, which provide abort to te jbd layer 1790 * Two internal functions, which provide abort to the jbd layer
1785 * itself are here. 1791 * itself are here.
1786 */ 1792 */
1787 1793
@@ -1879,7 +1885,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
1879 * int jbd2_journal_errno () - returns the journal's error state. 1885 * int jbd2_journal_errno () - returns the journal's error state.
1880 * @journal: journal to examine. 1886 * @journal: journal to examine.
1881 * 1887 *
1882 * This is the errno numbet set with jbd2_journal_abort(), the last 1888 * This is the errno number set with jbd2_journal_abort(), the last
1883 * time the journal was mounted - if the journal was stopped 1889 * time the journal was mounted - if the journal was stopped
1884 * without calling abort this will be 0. 1890 * without calling abort this will be 0.
1885 * 1891 *
@@ -1903,7 +1909,7 @@ int jbd2_journal_errno(journal_t *journal)
1903 * int jbd2_journal_clear_err () - clears the journal's error state 1909 * int jbd2_journal_clear_err () - clears the journal's error state
1904 * @journal: journal to act on. 1910 * @journal: journal to act on.
1905 * 1911 *
1906 * An error must be cleared or Acked to take a FS out of readonly 1912 * An error must be cleared or acked to take a FS out of readonly
1907 * mode. 1913 * mode.
1908 */ 1914 */
1909int jbd2_journal_clear_err(journal_t *journal) 1915int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1929,7 @@ int jbd2_journal_clear_err(journal_t *journal)
1923 * void jbd2_journal_ack_err() - Ack journal err. 1929 * void jbd2_journal_ack_err() - Ack journal err.
1924 * @journal: journal to act on. 1930 * @journal: journal to act on.
1925 * 1931 *
1926 * An error must be cleared or Acked to take a FS out of readonly 1932 * An error must be cleared or acked to take a FS out of readonly
1927 * mode. 1933 * mode.
1928 */ 1934 */
1929void jbd2_journal_ack_err(journal_t *journal) 1935void jbd2_journal_ack_err(journal_t *journal)
@@ -2377,6 +2383,72 @@ static void __exit journal_exit(void)
2377 jbd2_journal_destroy_caches(); 2383 jbd2_journal_destroy_caches();
2378} 2384}
2379 2385
2386/*
2387 * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
2388 * tracing infrastructure to map a dev_t to a device name.
2389 *
2390 * The caller should use rcu_read_lock() in order to make sure the
2391 * device name stays valid until its done with it. We use
2392 * rcu_read_lock() as well to make sure we're safe in case the caller
2393 * gets sloppy, and because rcu_read_lock() is cheap and can be safely
2394 * nested.
2395 */
2396struct devname_cache {
2397 struct rcu_head rcu;
2398 dev_t device;
2399 char devname[BDEVNAME_SIZE];
2400};
2401#define CACHE_SIZE_BITS 6
2402static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
2403static DEFINE_SPINLOCK(devname_cache_lock);
2404
2405static void free_devcache(struct rcu_head *rcu)
2406{
2407 kfree(rcu);
2408}
2409
2410const char *jbd2_dev_to_name(dev_t device)
2411{
2412 int i = hash_32(device, CACHE_SIZE_BITS);
2413 char *ret;
2414 struct block_device *bd;
2415 static struct devname_cache *new_dev;
2416
2417 rcu_read_lock();
2418 if (devcache[i] && devcache[i]->device == device) {
2419 ret = devcache[i]->devname;
2420 rcu_read_unlock();
2421 return ret;
2422 }
2423 rcu_read_unlock();
2424
2425 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2426 if (!new_dev)
2427 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2428 spin_lock(&devname_cache_lock);
2429 if (devcache[i]) {
2430 if (devcache[i]->device == device) {
2431 kfree(new_dev);
2432 ret = devcache[i]->devname;
2433 spin_unlock(&devname_cache_lock);
2434 return ret;
2435 }
2436 call_rcu(&devcache[i]->rcu, free_devcache);
2437 }
2438 devcache[i] = new_dev;
2439 devcache[i]->device = device;
2440 bd = bdget(device);
2441 if (bd) {
2442 bdevname(bd, devcache[i]->devname);
2443 bdput(bd);
2444 } else
2445 __bdevname(device, devcache[i]->devname);
2446 ret = devcache[i]->devname;
2447 spin_unlock(&devname_cache_lock);
2448 return ret;
2449}
2450EXPORT_SYMBOL(jbd2_dev_to_name);
2451
2380MODULE_LICENSE("GPL"); 2452MODULE_LICENSE("GPL");
2381module_init(journal_init); 2453module_init(journal_init);
2382module_exit(journal_exit); 2454module_exit(journal_exit);
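[Editor's note] jbd2_dev_to_name() above returns a pointer into an RCU-managed cache slot, and its own comment requires the caller to hold rcu_read_lock() for as long as the string is used. A minimal kernel-side usage sketch (not standalone-compilable; the helper function name is hypothetical):

    /* sketch: consuming jbd2_dev_to_name() safely from kernel code */
    static void report_journal_dev(journal_t *journal)
    {
        rcu_read_lock();        /* keeps the cached name from being freed */
        printk(KERN_INFO "journal on %s\n",
               jbd2_dev_to_name(journal->j_fs_dev->bd_dev));
        rcu_read_unlock();
    }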
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 996ffda06bf3..6213ac728f30 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal)
499 wake_up(&journal->j_wait_transaction_locked); 499 wake_up(&journal->j_wait_transaction_locked);
500} 500}
501 501
502/* 502static void warn_dirty_buffer(struct buffer_head *bh)
503 * Report any unexpected dirty buffers which turn up. Normally those
504 * indicate an error, but they can occur if the user is running (say)
505 * tune2fs to modify the live filesystem, so we need the option of
506 * continuing as gracefully as possible. #
507 *
508 * The caller should already hold the journal lock and
509 * j_list_lock spinlock: most callers will need those anyway
510 * in order to probe the buffer's journaling state safely.
511 */
512static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
513{ 503{
514 int jlist; 504 char b[BDEVNAME_SIZE];
515
516 /* If this buffer is one which might reasonably be dirty
517 * --- ie. data, or not part of this journal --- then
518 * we're OK to leave it alone, but otherwise we need to
-	 * move the dirty bit to the journal's own internal
-	 * JBDDirty bit. */
-	jlist = jh->b_jlist;
-
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-	    jlist == BJ_Shadow || jlist == BJ_Forget) {
-		struct buffer_head *bh = jh2bh(jh);
-
-		if (test_clear_buffer_dirty(bh))
-			set_buffer_jbddirty(bh);
-	}
+
+	printk(KERN_WARNING
+	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
 /*
@@ -593,14 +574,16 @@ repeat:
 			if (jh->b_next_transaction)
 				J_ASSERT_JH(jh, jh->b_next_transaction ==
 							transaction);
+			warn_dirty_buffer(bh);
 		}
 		/*
 		 * In any case we need to clean the dirty flag and we must
 		 * do it under the buffer lock to be sure we don't race
 		 * with running write-out.
 		 */
-		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
-		jbd_unexpected_dirty_buffer(jh);
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
 	}
 
 	unlock_buffer(bh);
@@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
 
 	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous jbd2_journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
 		jh->b_transaction = transaction;
 
 		/* first access by this transaction */
@@ -1547,36 +1539,6 @@ out:
 	return;
 }
 
-/*
- * jbd2_journal_try_to_free_buffers() could race with
- * jbd2_journal_commit_transaction(). The later might still hold the
- * reference count to the buffers when inspecting them on
- * t_syncdata_list or t_locked_list.
- *
- * jbd2_journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * try to free that buffer.
- *
- * Called with journal->j_state_lock hold.
- */
-static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-	transaction_t *transaction;
-	tid_t tid;
-
-	spin_lock(&journal->j_state_lock);
-	transaction = journal->j_committing_transaction;
-
-	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
-		return;
-	}
-
-	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
-	jbd2_log_wait_commit(journal, tid);
-}
-
 /**
  * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
@@ -1649,25 +1611,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
 
 	ret = try_to_free_buffers(page);
 
-	/*
-	 * There are a number of places where jbd2_journal_try_to_free_buffers()
-	 * could race with jbd2_journal_commit_transaction(), the later still
-	 * holds the reference to the buffers to free while processing them.
-	 * try_to_free_buffers() failed to free those buffers. Some of the
-	 * caller of releasepage() request page buffers to be dropped, otherwise
-	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
-	 *
-	 * So, if the caller of try_to_release_page() wants the synchronous
-	 * behaviour(i.e make sure buffers are dropped upon return),
-	 * let's wait for the current transaction to finish flush of
-	 * dirty data buffers, then try to free those buffers again,
-	 * with the journal locked.
-	 */
-	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-		jbd2_journal_wait_for_transaction_sync_data(journal);
-		ret = try_to_free_buffers(page);
-	}
-
 busy:
 	return ret;
 }
@@ -1693,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
 		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
-		clear_buffer_jbddirty(bh);
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
@@ -1945,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	if (jh->b_transaction && jh->b_jlist == jlist)
 		return;
 
-	/* The following list of buffer states needs to be consistent
-	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-	 * state. */
-
 	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
 		if (test_clear_buffer_dirty(bh) ||
 		    test_clear_buffer_jbddirty(bh))
 			was_dirty = 1;
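The jbd2 hunks above all enforce one invariant: once a buffer is filed as journalled metadata, its writeback is driven by the journal-private JBDDirty bit rather than the VFS dirty bit, and an unexpected VFS dirty bit now triggers a warning instead of being silently absorbed. A minimal user-space model of that handoff — the struct and function names below are invented for illustration; real jbd2 uses atomic bitops on bh->b_state under the buffer lock:

#include <stdbool.h>
#include <stdio.h>

struct buf {
	bool dirty;		/* stands in for buffer_dirty() */
	bool jbddirty;		/* stands in for buffer_jbddirty() */
};

/* models the metadata branch of __jbd2_journal_file_buffer(): warn
 * about an unexpected VFS dirty bit (e.g. tune2fs wrote the sb),
 * then move it to the journal-private bit */
static void file_as_metadata(struct buf *b)
{
	if (b->dirty) {
		fprintf(stderr, "warning: spotted dirty metadata buffer\n");
		b->dirty = false;	/* test_clear_buffer_dirty() */
		b->jbddirty = true;	/* set_buffer_jbddirty() */
	}
}

int main(void)
{
	struct buf b = { .dirty = true };

	file_as_metadata(&b);
	printf("dirty=%d jbddirty=%d\n", b.dirty, b.jbddirty);
	return 0;
}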
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 043740dde20c..8fcb6239218e 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -156,48 +156,25 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
 	return ERR_PTR(-EINVAL);
 }
 
-static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = JFFS2_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != JFFS2_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-	return acl;
-}
-
-static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != JFFS2_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct posix_acl *acl;
 	char *value = NULL;
 	int rc, xprefix;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		acl = jffs2_iget_acl(inode, &f->i_acl_access);
-		if (acl != JFFS2_ACL_NOT_CACHED)
-			return acl;
 		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		acl = jffs2_iget_acl(inode, &f->i_acl_default);
-		if (acl != JFFS2_ACL_NOT_CACHED)
-			return acl;
 		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
 	if (rc > 0) {
@@ -215,16 +192,8 @@ static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 	}
 	if (value)
 		kfree(value);
-	if (!IS_ERR(acl)) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			jffs2_iset_acl(inode, &f->i_acl_access, acl);
-			break;
-		case ACL_TYPE_DEFAULT:
-			jffs2_iset_acl(inode, &f->i_acl_default, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
@@ -249,7 +218,6 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
 
 static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	int rc, xprefix;
 
 	if (S_ISLNK(inode->i_mode))
@@ -285,16 +253,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		return -EINVAL;
 	}
 	rc = __jffs2_set_acl(inode, xprefix, acl);
-	if (!rc) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			jffs2_iset_acl(inode, &f->i_acl_access, acl);
-			break;
-		case ACL_TYPE_DEFAULT:
-			jffs2_iset_acl(inode, &f->i_acl_default, acl);
-			break;
-		}
-	}
+	if (!rc)
+		set_cached_acl(inode, type, acl);
 	return rc;
 }
 
@@ -321,12 +281,10 @@ int jffs2_permission(struct inode *inode, int mask)
 
 int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct posix_acl *acl, *clone;
 	int rc;
 
-	f->i_acl_default = NULL;
-	f->i_acl_access = NULL;
+	cache_no_acl(inode);
 
 	if (S_ISLNK(*i_mode))
 		return 0;	/* Symlink always has no-ACL */
@@ -339,7 +297,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
-			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
 
 		clone = posix_acl_clone(acl, GFP_KERNEL);
 		if (!clone)
@@ -350,7 +308,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 			return rc;
 	}
 	if (rc > 0)
-		jffs2_iset_acl(inode, &f->i_acl_access, clone);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
 
 	posix_acl_release(clone);
 	}
@@ -359,17 +317,16 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 
 int jffs2_init_acl_post(struct inode *inode)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	int rc;
 
-	if (f->i_acl_default) {
-		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, f->i_acl_default);
+	if (inode->i_default_acl) {
+		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl);
 		if (rc)
 			return rc;
 	}
 
-	if (f->i_acl_access) {
-		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, f->i_acl_access);
+	if (inode->i_acl) {
+		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl);
 		if (rc)
 			return rc;
 	}
@@ -377,18 +334,6 @@ int jffs2_init_acl_post(struct inode *inode)
 	return 0;
 }
 
-void jffs2_clear_acl(struct jffs2_inode_info *f)
-{
-	if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
-		posix_acl_release(f->i_acl_access);
-		f->i_acl_access = JFFS2_ACL_NOT_CACHED;
-	}
-	if (f->i_acl_default && f->i_acl_default != JFFS2_ACL_NOT_CACHED) {
-		posix_acl_release(f->i_acl_default);
-		f->i_acl_default = JFFS2_ACL_NOT_CACHED;
-	}
-}
-
 int jffs2_acl_chmod(struct inode *inode)
 {
 	struct posix_acl *acl, *clone;
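This file (like fs/jfs/acl.c further down) is being converted to the generic VFS ACL cache: inode->i_acl and inode->i_default_acl replace per-filesystem copies, managed through get_cached_acl(), set_cached_acl(), forget_cached_acl() and cache_no_acl(). A single-slot sketch of the caching discipline — the sentinel value matches the VFS, but the helper bodies here are simplified illustrations (the kernel versions take inode->i_lock and reference-count the ACL):

#include <stddef.h>

struct posix_acl { int refcount; };

/* "not looked up yet" sentinel, as in the VFS cache */
#define ACL_NOT_CACHED ((struct posix_acl *)-1)

struct inode_model {
	struct posix_acl *i_acl;	/* ACL_TYPE_ACCESS slot */
};

static struct posix_acl *get_cached(struct inode_model *inode)
{
	return inode->i_acl;	/* NULL, the sentinel, or an ACL */
}

static void set_cached(struct inode_model *inode, struct posix_acl *acl)
{
	inode->i_acl = acl;	/* NULL caches "no ACL" positively */
}

static struct posix_acl *lookup_acl(struct inode_model *inode)
{
	struct posix_acl *acl = get_cached(inode);

	if (acl != ACL_NOT_CACHED)
		return acl;	/* hit, even when cached as NULL */
	acl = NULL;		/* ... read the ACL from the xattr here ... */
	set_cached(inode, acl);
	return acl;
}

Caching NULL positively is the point of the sentinel: "this inode has no ACL" is itself a cacheable answer, so only the first lookup pays for the xattr read.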
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 8ca058aed384..fc929f2a14f6 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,13 +26,10 @@ struct jffs2_acl_header {
 
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
-#define JFFS2_ACL_NOT_CACHED ((void *)-1)
-
 extern int jffs2_permission(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
-extern void jffs2_clear_acl(struct jffs2_inode_info *);
 
 extern struct xattr_handler jffs2_acl_access_xattr_handler;
 extern struct xattr_handler jffs2_acl_default_xattr_handler;
@@ -43,6 +40,5 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
 #define jffs2_acl_chmod(inode)			(0)
 #define jffs2_init_acl_pre(dir_i,inode,mode)	(0)
 #define jffs2_init_acl_post(inode)		(0)
-#define jffs2_clear_acl(f)
 
 #endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index a0244740b75a..b47679be118a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -270,19 +270,21 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 	D2({
 		int i=0;
 		struct jffs2_raw_node_ref *this;
-		printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n" KERN_DEBUG);
+		printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n");
 
 		this = ic->nodes;
 
+		printk(KERN_DEBUG);
 		while(this) {
-			printk( "0x%08x(%d)->", ref_offset(this), ref_flags(this));
+			printk(KERN_CONT "0x%08x(%d)->",
+			       ref_offset(this), ref_flags(this));
 			if (++i == 5) {
-				printk("\n" KERN_DEBUG);
+				printk(KERN_DEBUG);
 				i=0;
 			}
 			this = this->next_in_ino;
 		}
-		printk("\n");
+		printk(KERN_CONT "\n");
 	});
 
 	switch (ic->class) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf20581..23c947539864 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
 	kunmap(pg);
 
 	D2(printk(KERN_DEBUG "readpage finished\n"));
-	return 0;
+	return ret;
 }
 
 int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 249305d65d5b..3451a81b2142 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
+	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -399,24 +401,10 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	unlock_kernel();
 	return 0;
 }
 
-void jffs2_write_super (struct super_block *sb)
-{
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-	sb->s_dirt = 0;
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-	jffs2_garbage_collect_trigger(c);
-	jffs2_erase_pending_blocks(c, 0);
-	jffs2_flush_wbuf_gc(c, 0);
-}
-
-
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
    fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 4c41db91eaa4..c6923da98263 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -50,10 +50,6 @@ struct jffs2_inode_info {
 	uint16_t flags;
 	uint8_t usercompr;
 	struct inode vfs_inode;
-#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-	struct posix_acl *i_acl_access;
-	struct posix_acl *i_acl_default;
-#endif
 };
 
 #endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 5e194a5c8e29..a7f03b7ebcb3 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -56,10 +56,6 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 	f->target = NULL;
 	f->flags = 0;
 	f->usercompr = 0;
-#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
-	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
-#endif
 }
 
 
@@ -181,7 +177,6 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
-void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1fc1e92356ee..1a80301004b8 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1424,7 +1424,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
 	struct jffs2_full_dirent *fd, *fds;
 	int deleted;
 
-	jffs2_clear_acl(f);
 	jffs2_xattr_delete_inode(c, f->inocache);
 	mutex_lock(&f->sem);
 	deleted = f->inocache && !f->inocache->pino_nlink;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 1d437de1e9a8..696686cc206e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -130,9 +130,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 	if (jffs2_sum_active()) {
 		s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
 		if (!s) {
-			kfree(flashbuf);
 			JFFS2_WARNING("Can't allocate memory for summary\n");
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 	}
 
@@ -196,7 +196,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		if (c->nextblock) {
 			ret = file_dirty(c, c->nextblock);
 			if (ret)
-				return ret;
+				goto out;
 			/* deleting summary information of the old nextblock */
 			jffs2_sum_reset_collected(c->summary);
 		}
@@ -207,7 +207,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		} else {
 			ret = file_dirty(c, jeb);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		break;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 4c4e18c54a51..0035c021395a 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -53,10 +54,29 @@ static void jffs2_i_init_once(void *foo)
 	inode_init_once(&f->vfs_inode);
 }
 
+static void jffs2_write_super(struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+	lock_super(sb);
+	sb->s_dirt = 0;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+		jffs2_garbage_collect_trigger(c);
+		jffs2_erase_pending_blocks(c, 0);
+		jffs2_flush_wbuf_gc(c, 0);
+	}
+
+	unlock_super(sb);
+}
+
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
+	jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -174,6 +194,11 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
+	lock_kernel();
+
+	if (sb->s_dirt)
+		jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -192,6 +217,8 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
+	unlock_kernel();
+
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 06ca1b8d2054..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -31,27 +31,24 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 {
 	struct posix_acl *acl;
 	char *ea_name;
-	struct jfs_inode_info *ji = JFS_IP(inode);
-	struct posix_acl **p_acl;
 	int size;
 	char *value = NULL;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch(type) {
 	case ACL_TYPE_ACCESS:
 		ea_name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &ji->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		ea_name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &ji->i_default_acl;
 		break;
 	default:
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (*p_acl != JFS_ACL_NOT_CACHED)
-		return posix_acl_dup(*p_acl);
-
 	size = __jfs_getxattr(inode, ea_name, NULL, 0);
 
 	if (size > 0) {
@@ -62,17 +59,16 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 	}
 
 	if (size < 0) {
-		if (size == -ENODATA) {
-			*p_acl = NULL;
+		if (size == -ENODATA)
 			acl = NULL;
-		} else
+		else
 			acl = ERR_PTR(size);
 	} else {
 		acl = posix_acl_from_xattr(value, size);
-		if (!IS_ERR(acl))
-			*p_acl = posix_acl_dup(acl);
 	}
 	kfree(value);
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
@@ -80,8 +76,6 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 		       struct posix_acl *acl)
 {
 	char *ea_name;
-	struct jfs_inode_info *ji = JFS_IP(inode);
-	struct posix_acl **p_acl;
 	int rc;
 	int size = 0;
 	char *value = NULL;
@@ -92,11 +86,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 	switch(type) {
 	case ACL_TYPE_ACCESS:
 		ea_name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &ji->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		ea_name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &ji->i_default_acl;
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EACCES : 0;
 		break;
@@ -116,27 +108,24 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 out:
 	kfree(value);
 
-	if (!rc) {
-		if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED))
-			posix_acl_release(*p_acl);
-		*p_acl = posix_acl_dup(acl);
-	}
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+
 	return rc;
 }
 
 static int jfs_check_acl(struct inode *inode, int mask)
 {
-	struct jfs_inode_info *ji = JFS_IP(inode);
+	struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
 
-	if (ji->i_acl == JFS_ACL_NOT_CACHED) {
-		struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		posix_acl_release(acl);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
 	}
 
-	if (ji->i_acl)
-		return posix_acl_permission(inode, ji->i_acl, mask);
 	return -EAGAIN;
 }
 
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index bbbd5f202e37..41d6045dbeb0 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -391,6 +391,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
 	}
 	XADaddress(xp, xaddr);
 	XADlength(xp, xlen);
+	XADoffset(xp, prev);
 	/*
 	 * only preserve the abnr flag within the xad flags
 	 * of the returned hint.
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 346057218edc..0fc30407f039 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 
 		txAbort(tid, 0);
 		txEnd(tid);
+		mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
 
 		/* release the inode map lock */
 		IWRITE_UNLOCK(ipimap);
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 439901d205fe..1439f119ec83 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -74,10 +74,6 @@ struct jfs_inode_info {
 	/* xattr_sem allows us to access the xattrs without taking i_mutex */
 	struct rw_semaphore xattr_sem;
 	lid_t	xtlid;		/* lid of xtree lock on directory */
-#ifdef CONFIG_JFS_POSIX_ACL
-	struct posix_acl *i_acl;
-	struct posix_acl *i_default_acl;
-#endif
 	union {
 		struct {
 			xtpage_t _xtroot;	/* 288: xtree root */
@@ -107,8 +103,6 @@ struct jfs_inode_info {
 #define i_inline u.link._inline
 #define i_inline_ea u.link._inline_ea
 
-#define JFS_ACL_NOT_CACHED ((void *)-1)
-
 #define IREAD_LOCK(ip, subclass) \
 	down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf9479a..37e6dcda8fc8 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
 #include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -127,18 +128,6 @@ static void jfs_destroy_inode(struct inode *inode)
 		ji->active_ag = -1;
 	}
 	spin_unlock_irq(&ji->ag_lock);
-
-#ifdef CONFIG_JFS_POSIX_ACL
-	if (ji->i_acl != JFS_ACL_NOT_CACHED) {
-		posix_acl_release(ji->i_acl);
-		ji->i_acl = JFS_ACL_NOT_CACHED;
-	}
-	if (ji->i_default_acl != JFS_ACL_NOT_CACHED) {
-		posix_acl_release(ji->i_default_acl);
-		ji->i_default_acl = JFS_ACL_NOT_CACHED;
-	}
-#endif
-
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
@@ -183,6 +172,9 @@ static void jfs_put_super(struct super_block *sb)
 	int rc;
 
 	jfs_info("In jfs_put_super");
+
+	lock_kernel();
+
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -195,6 +187,8 @@ static void jfs_put_super(struct super_block *sb)
 	sbi->direct_inode = NULL;
 
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 enum {
@@ -370,19 +364,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
+	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
+	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
+			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc)
+		if (rc) {
+			unlock_kernel();
 			return rc;
+		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -393,23 +392,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		return jfs_mount_rw(sb, 1);
+		ret = jfs_mount_rw(sb, 1);
+		unlock_kernel();
+		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
+		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc)
+			if (rc) {
+				unlock_kernel();
 				return rc;
+			}
 			JFS_SBI(sb)->flag = flag;
-			return jfs_mount_rw(sb, 1);
+			ret = jfs_mount_rw(sb, 1);
+			unlock_kernel();
+			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
+	unlock_kernel();
 	return 0;
 }
 
@@ -720,8 +727,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
@@ -777,10 +786,6 @@ static void init_once(void *foo)
 	init_rwsem(&jfs_ip->xattr_sem);
 	spin_lock_init(&jfs_ip->ag_lock);
 	jfs_ip->active_ag = -1;
-#ifdef CONFIG_JFS_POSIX_ACL
-	jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
-	jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
-#endif
 	inode_init_once(&jfs_ip->vfs_inode);
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 61dfa8173ebc..fad364548bc9 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -727,10 +727,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		/*
 		 * We're changing the ACL.  Get rid of the cached one
 		 */
-		acl =JFS_IP(inode)->i_acl;
-		if (acl != JFS_ACL_NOT_CACHED)
-			posix_acl_release(acl);
-		JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED;
+		forget_cached_acl(inode, ACL_TYPE_ACCESS);
 
 		return 0;
 	} else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
@@ -746,10 +743,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		/*
 		 * We're changing the default ACL.  Get rid of the cached one
 		 */
-		acl =JFS_IP(inode)->i_default_acl;
-		if (acl && (acl != JFS_ACL_NOT_CACHED))
-			posix_acl_release(acl);
-		JFS_IP(inode)->i_default_acl = JFS_ACL_NOT_CACHED;
+		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
 
 		return 0;
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index 80046ddf5063..dcec3d3ea64f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -9,6 +9,8 @@
 #include <linux/vfs.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/uaccess.h>
 
@@ -215,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
-	s->s_maxbytes = ~0ULL;
+	s->s_maxbytes = MAX_LFS_FILESIZE;
 	s->s_blocksize = PAGE_SIZE;
 	s->s_blocksize_bits = PAGE_SHIFT;
 	s->s_magic = magic;
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
+int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0, /* metadata-only; caller takes care of data */
+	};
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode(inode, &wbc);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+EXPORT_SYMBOL(simple_fsync);
+
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
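simple_fsync() gives block filesystems with trivial metadata a stock ->fsync: flush the inode's dirty buffers, then write the inode itself only as strictly as the datasync flag requires. The minix conversion further down is the first user; a hypothetical filesystem would wire it up like this (the filesystem name is invented, the generic helpers are real 2.6.31-era symbols):

const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.write		= do_sync_write,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= simple_fsync,	/* replaces a hand-rolled sync */
};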
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index dd7957064a8c..4336adba952a 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/smp_lock.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -126,7 +127,6 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_lock	*lock = &argp->lock;
 
 	nlmclnt_next_cookie(&argp->cookie);
-	argp->state = nsm_local_state;
 	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh));
 	lock->caller = utsname()->nodename;
 	lock->oh.data = req->a_owner;
@@ -165,6 +165,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(call, fl);
 
+	lock_kernel();
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
 		if (fl->fl_type != F_UNLCK) {
 			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -178,6 +179,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 
 	fl->fl_ops->fl_release_private(fl);
 	fl->fl_ops = NULL;
+	unlock_kernel();
 
 	dprintk("lockd: clnt proc returns %d\n", status);
 	return status;
@@ -519,6 +521,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 
 	if (nsm_monitor(host) < 0)
 		goto out;
+	req->a_args.state = nsm_local_state;
 
 	fl->fl_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6d5d4a4169e5..7fce1b525849 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(nsm_lock);
 /*
  * Local NSM state
  */
-int	__read_mostly	nsm_local_state;
+u32	__read_mostly	nsm_local_state;
 int	__read_mostly	nsm_use_hostnames;
 
 static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
@@ -112,6 +112,7 @@ static struct rpc_clnt *nsm_create(void)
 		.program		= &nsm_program,
 		.version		= NSM_VERSION,
 		.authflavor		= RPC_AUTH_NULL,
+		.flags			= RPC_CLNT_CREATE_NOPING,
 	};
 
 	return rpc_create(&args);
@@ -184,13 +185,19 @@ int nsm_monitor(const struct nlm_host *host)
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
 	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
-	if (res.status != 0)
+	if (unlikely(res.status != 0))
 		status = -EIO;
-	if (status < 0)
+	if (unlikely(status < 0)) {
 		printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
-	else
-		nsm->sm_monitored = 1;
-	return status;
+		return status;
+	}
+
+	nsm->sm_monitored = 1;
+	if (unlikely(nsm_local_state != res.state)) {
+		nsm_local_state = res.state;
+		dprintk("lockd: NSM state changed to %d\n", nsm_local_state);
+	}
+	return 0;
 }
 
 /**
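These two lockd changes cooperate: nsm_local_state is now a u32 refreshed from every SM_MON reply, and the clntproc.c hunk above defers copying it into the lock arguments until nsm_monitor() has succeeded, so the first NLM request no longer carries a stale (typically zero) state. The resulting ordering, reduced to its core (simplified from nlmclnt_lock(), not a verbatim quote):

	if (nsm_monitor(host) < 0)
		goto out;			/* statd has answered first ... */
	req->a_args.state = nsm_local_state;	/* ... so this value is fresh */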
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 1725037374c5..bd173a6ca3b1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/in.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 83ee34203bd7..e577a78d7bac 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -326,6 +326,8 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
 {
 	if (call->a_args.lock.oh.data != call->a_owner)
 		kfree(call->a_args.lock.oh.data);
+
+	locks_release_private(&call->a_args.lock.fl);
 }
 
 /*
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3688e55901fc..e1d28ddd2169 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/in.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index ec3deea29e37..b6440f52178f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -151,7 +151,7 @@ static struct file_lock *locks_alloc_lock(void)
 	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
 
-static void locks_release_private(struct file_lock *fl)
+void locks_release_private(struct file_lock *fl)
 {
 	if (fl->fl_ops) {
 		if (fl->fl_ops->fl_release_private)
@@ -165,6 +165,7 @@ static void locks_release_private(struct file_lock *fl)
 	}
 
 }
+EXPORT_SYMBOL_GPL(locks_release_private);
 
 /* Free a lock which is not in use. */
 static void locks_free_lock(struct file_lock *fl)
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3aebe322271a..6ac693faae49 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -12,13 +12,14 @@
 /* bitmap.c contains the code that handles the inode and block bitmaps */
 
 #include "minix.h"
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 #include <linux/sched.h>
 
 static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
 
+static DEFINE_SPINLOCK(bitmap_lock);
+
 static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
 {
 	unsigned i, j, sum = 0;
@@ -69,11 +70,11 @@ void minix_free_block(struct inode *inode, unsigned long block)
 		return;
 	}
 	bh = sbi->s_zmap[zone];
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	if (!minix_test_and_clear_bit(bit, bh->b_data))
 		printk("minix_free_block (%s:%lu): bit already cleared\n",
 		       sb->s_id, block);
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
 	return;
 }
@@ -88,18 +89,18 @@ int minix_new_block(struct inode * inode)
 		struct buffer_head *bh = sbi->s_zmap[i];
 		int j;
 
-		lock_kernel();
+		spin_lock(&bitmap_lock);
 		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
 		if (j < bits_per_zone) {
 			minix_set_bit(j, bh->b_data);
-			unlock_kernel();
+			spin_unlock(&bitmap_lock);
 			mark_buffer_dirty(bh);
 			j += i * bits_per_zone + sbi->s_firstdatazone-1;
 			if (j < sbi->s_firstdatazone || j >= sbi->s_nzones)
 				break;
 			return j;
 		}
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 	}
 	return 0;
 }
@@ -211,10 +212,10 @@ void minix_free_inode(struct inode * inode)
 	minix_clear_inode(inode);	/* clear on-disk copy */
 
 	bh = sbi->s_imap[ino];
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	if (!minix_test_and_clear_bit(bit, bh->b_data))
 		printk("minix_free_inode: bit %lu already cleared\n", bit);
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
  out:
 	clear_inode(inode);		/* clear in-memory copy */
@@ -237,7 +238,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 	j = bits_per_zone;
 	bh = NULL;
 	*error = -ENOSPC;
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	for (i = 0; i < sbi->s_imap_blocks; i++) {
 		bh = sbi->s_imap[i];
 		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
@@ -245,17 +246,17 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 			break;
 	}
 	if (!bh || j >= bits_per_zone) {
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 		iput(inode);
 		return NULL;
 	}
 	if (minix_test_and_set_bit(j, bh->b_data)) {	/* shouldn't happen */
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 		printk("minix_new_inode: bit already set\n");
 		iput(inode);
 		return NULL;
 	}
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
 	j += i * bits_per_zone;
 	if (!j || j > sbi->s_ninodes) {
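The bitmap.c conversion swaps the big kernel lock for one file-local spinlock, which suffices because each critical section is a short, non-sleeping test-and-modify on an in-memory bitmap. The pattern, as a hypothetical helper (the helper name is invented; DEFINE_SPINLOCK, find_first_zero_bit and __set_bit are the real kernel primitives):

#include <linux/bitops.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(bitmap_lock);

/* claim the first clear bit, or return -1 if the map is full */
static int claim_first_free_bit(unsigned long *map, unsigned long nbits)
{
	unsigned long bit;

	spin_lock(&bitmap_lock);
	bit = find_first_zero_bit(map, nbits);
	if (bit < nbits)
		__set_bit(bit, map);	/* non-atomic is fine under the lock */
	spin_unlock(&bitmap_lock);

	return bit < nbits ? (int)bit : -1;
}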
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d4946c4c90e2..d407e7a0b6fe 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -11,7 +11,6 @@
11#include "minix.h" 11#include "minix.h"
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/smp_lock.h>
15#include <linux/swap.h> 14#include <linux/swap.h>
16 15
17typedef struct minix_dir_entry minix_dirent; 16typedef struct minix_dir_entry minix_dirent;
@@ -20,9 +19,10 @@ typedef struct minix3_dir_entry minix3_dirent;
20static int minix_readdir(struct file *, void *, filldir_t); 19static int minix_readdir(struct file *, void *, filldir_t);
21 20
22const struct file_operations minix_dir_operations = { 21const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = minix_sync_file, 25 .fsync = simple_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -102,8 +102,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
102 char *name; 102 char *name;
103 __u32 inumber; 103 __u32 inumber;
104 104
105 lock_kernel();
106
107 pos = (pos + chunk_size-1) & ~(chunk_size-1); 105 pos = (pos + chunk_size-1) & ~(chunk_size-1);
108 if (pos >= inode->i_size) 106 if (pos >= inode->i_size)
109 goto done; 107 goto done;
@@ -146,7 +144,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
146 144
147done: 145done:
148 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; 146 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
149 unlock_kernel();
150 return 0; 147 return 0;
151} 148}
152 149
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 17765f697e50..3eec3e607a87 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -6,15 +6,12 @@
  * minix regular file handling primitives
  */
 
-#include <linux/buffer_head.h>		/* for fsync_inode_buffers() */
 #include "minix.h"
 
 /*
  * We have mostly NULLs here: the current defaults are OK for
  * the minix filesystem.
  */
-int minix_sync_file(struct file *, struct dentry *, int);
-
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= minix_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
 	.getattr	= minix_getattr,
 };
-
-int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-
-	err |= minix_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index daad3c2740db..74ea82d72164 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -48,8 +48,6 @@ static void minix_put_super(struct super_block *sb)
 	kfree(sbi->s_imap);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	return;
 }
 
 static struct kmem_cache * minix_inode_cachep;
@@ -554,38 +552,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static struct buffer_head *minix_update_inode(struct inode *inode)
-{
-	if (INODE_VERSION(inode) == MINIX_V1)
-		return V1_minix_update_inode(inode);
-	else
-		return V2_minix_update_inode(inode);
-}
-
-static int minix_write_inode(struct inode * inode, int wait)
-{
-	brelse(minix_update_inode(inode));
-	return 0;
-}
-
-int minix_sync_inode(struct inode * inode)
+static int minix_write_inode(struct inode *inode, int wait)
 {
 	int err = 0;
 	struct buffer_head *bh;
 
-	bh = minix_update_inode(inode);
-	if (bh && buffer_dirty(bh))
-	{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		bh = V1_minix_update_inode(inode);
+	else
+		bh = V2_minix_update_inode(inode);
+	if (!bh)
+		return -EIO;
+	if (wait && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-		{
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
 				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
+			err = -EIO;
 		}
 	}
-	else if (!bh)
-		err = -1;
 	brelse (bh);
 	return err;
 }
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e6a0b193bea4..9dcf95b42116 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -1,3 +1,6 @@
+#ifndef FS_MINIX_H
+#define FS_MINIX_H
+
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/minix_fs.h>
@@ -57,7 +60,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
-extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -72,7 +74,6 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
-extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
@@ -88,3 +89,5 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
 {
 	return list_entry(inode, struct minix_inode_info, vfs_inode);
 }
+
+#endif /* FS_MINIX_H */
diff --git a/fs/mpage.c b/fs/mpage.c
index 680ba60863ff..42381bd6543b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
 			&map_bh, &first_logical_block, get_block);
 	if (bio)
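The mpage change matters because map_bh lives on the stack: clear_buffer_mapped() clears only the BH_Mapped bit, leaving the remaining b_state bits and b_size as whatever garbage the stack held, which do_mpage_readpage() later consults. Zeroing both fields gives it a fully defined starting state. Reduced to a model (field names mirror buffer_head, the rest is illustrative):

struct bh_model {
	unsigned long b_state;	/* bit flags, one of which plays BH_Mapped */
	unsigned long b_size;
};

static void reader_setup(struct bh_model *map_bh)
{
	/* the old pattern, map_bh->b_state &= ~MAPPED_BIT, cleared one
	 * bit and left the rest of b_state and all of b_size undefined;
	 * instead: */
	map_bh->b_state = 0;	/* every state bit known-clear */
	map_bh->b_size = 0;	/* no stale mapping length */
}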
diff --git a/fs/namei.c b/fs/namei.c
index 967c3db92724..f3c5b278895a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
552 return result; 552 return result;
553} 553}
554 554
555static __always_inline void set_root(struct nameidata *nd)
556{
557 if (!nd->root.mnt) {
558 struct fs_struct *fs = current->fs;
559 read_lock(&fs->lock);
560 nd->root = fs->root;
561 path_get(&nd->root);
562 read_unlock(&fs->lock);
563 }
564}
565
555static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 566static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
556{ 567{
557 int res = 0; 568 int res = 0;
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
560 goto fail; 571 goto fail;
561 572
562 if (*link == '/') { 573 if (*link == '/') {
563 struct fs_struct *fs = current->fs; 574 set_root(nd);
564
565 path_put(&nd->path); 575 path_put(&nd->path);
566 576 nd->path = nd->root;
567 read_lock(&fs->lock); 577 path_get(&nd->root);
568 nd->path = fs->root;
569 path_get(&fs->root);
570 read_unlock(&fs->lock);
571 } 578 }
572 579
573 res = link_path_walk(link, nd); 580 res = link_path_walk(link, nd);
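
set_root() caches current->fs->root into the nameidata exactly once per walk, taking fs->lock and a path reference only on first use; every later absolute symlink reuses nd->root without touching fs->lock again. A rough sketch of the lifecycle this creates (assembled from the hunks, not a verbatim excerpt):

    /* Sketch: the cached root is populated lazily and dropped once. */
    static void cached_root_lifecycle(struct nameidata *nd)
    {
            nd->root.mnt = NULL;    /* path_init(): nothing cached yet */

            set_root(nd);           /* first absolute symlink: lock,
                                     * copy fs->root, take a reference */
            set_root(nd);           /* no-op: nd->root.mnt already set */

            if (nd->root.mnt) {     /* end of the walk */
                    path_put(&nd->root);
                    nd->root.mnt = NULL;
            }
    }
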
@@ -668,23 +675,23 @@ loop:
668 return err; 675 return err;
669} 676}
670 677
671int follow_up(struct vfsmount **mnt, struct dentry **dentry) 678int follow_up(struct path *path)
672{ 679{
673 struct vfsmount *parent; 680 struct vfsmount *parent;
674 struct dentry *mountpoint; 681 struct dentry *mountpoint;
675 spin_lock(&vfsmount_lock); 682 spin_lock(&vfsmount_lock);
676 parent=(*mnt)->mnt_parent; 683 parent = path->mnt->mnt_parent;
677 if (parent == *mnt) { 684 if (parent == path->mnt) {
678 spin_unlock(&vfsmount_lock); 685 spin_unlock(&vfsmount_lock);
679 return 0; 686 return 0;
680 } 687 }
681 mntget(parent); 688 mntget(parent);
682 mountpoint=dget((*mnt)->mnt_mountpoint); 689 mountpoint = dget(path->mnt->mnt_mountpoint);
683 spin_unlock(&vfsmount_lock); 690 spin_unlock(&vfsmount_lock);
684 dput(*dentry); 691 dput(path->dentry);
685 *dentry = mountpoint; 692 path->dentry = mountpoint;
686 mntput(*mnt); 693 mntput(path->mnt);
687 *mnt = parent; 694 path->mnt = parent;
688 return 1; 695 return 1;
689} 696}
690 697
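
follow_up() (and, below, follow_mount() and follow_down()) now takes a struct path instead of a vfsmount/dentry pointer pair, so the two halves are updated together and callers can no longer let them drift apart. Caller-side, the conversion looks roughly like this (locals are illustrative):

    /* Before: follow_up(&mnt, &dentry) mutated two loose pointers.
     * After: the pair travels as one object. */
    static void climb_to_topmost(struct path *path)
    {
            while (follow_up(path))
                    ;       /* each success ascends one mount level */
    }
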
@@ -695,7 +702,7 @@ static int __follow_mount(struct path *path)
695{ 702{
696 int res = 0; 703 int res = 0;
697 while (d_mountpoint(path->dentry)) { 704 while (d_mountpoint(path->dentry)) {
698 struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); 705 struct vfsmount *mounted = lookup_mnt(path);
699 if (!mounted) 706 if (!mounted)
700 break; 707 break;
701 dput(path->dentry); 708 dput(path->dentry);
@@ -708,32 +715,32 @@ static int __follow_mount(struct path *path)
708 return res; 715 return res;
709} 716}
710 717
711static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) 718static void follow_mount(struct path *path)
712{ 719{
713 while (d_mountpoint(*dentry)) { 720 while (d_mountpoint(path->dentry)) {
714 struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); 721 struct vfsmount *mounted = lookup_mnt(path);
715 if (!mounted) 722 if (!mounted)
716 break; 723 break;
717 dput(*dentry); 724 dput(path->dentry);
718 mntput(*mnt); 725 mntput(path->mnt);
719 *mnt = mounted; 726 path->mnt = mounted;
720 *dentry = dget(mounted->mnt_root); 727 path->dentry = dget(mounted->mnt_root);
721 } 728 }
722} 729}
723 730
724/* no need for dcache_lock, as serialization is taken care of in 731/* no need for dcache_lock, as serialization is taken care of in

725 * namespace.c 732 * namespace.c
726 */ 733 */
727int follow_down(struct vfsmount **mnt, struct dentry **dentry) 734int follow_down(struct path *path)
728{ 735{
729 struct vfsmount *mounted; 736 struct vfsmount *mounted;
730 737
731 mounted = lookup_mnt(*mnt, *dentry); 738 mounted = lookup_mnt(path);
732 if (mounted) { 739 if (mounted) {
733 dput(*dentry); 740 dput(path->dentry);
734 mntput(*mnt); 741 mntput(path->mnt);
735 *mnt = mounted; 742 path->mnt = mounted;
736 *dentry = dget(mounted->mnt_root); 743 path->dentry = dget(mounted->mnt_root);
737 return 1; 744 return 1;
738 } 745 }
739 return 0; 746 return 0;
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
741 748
742static __always_inline void follow_dotdot(struct nameidata *nd) 749static __always_inline void follow_dotdot(struct nameidata *nd)
743{ 750{
744 struct fs_struct *fs = current->fs; 751 set_root(nd);
745 752
746 while(1) { 753 while(1) {
747 struct vfsmount *parent; 754 struct vfsmount *parent;
748 struct dentry *old = nd->path.dentry; 755 struct dentry *old = nd->path.dentry;
749 756
750 read_lock(&fs->lock); 757 if (nd->path.dentry == nd->root.dentry &&
751 if (nd->path.dentry == fs->root.dentry && 758 nd->path.mnt == nd->root.mnt) {
752 nd->path.mnt == fs->root.mnt) {
753 read_unlock(&fs->lock);
754 break; 759 break;
755 } 760 }
756 read_unlock(&fs->lock);
757 spin_lock(&dcache_lock); 761 spin_lock(&dcache_lock);
758 if (nd->path.dentry != nd->path.mnt->mnt_root) { 762 if (nd->path.dentry != nd->path.mnt->mnt_root) {
759 nd->path.dentry = dget(nd->path.dentry->d_parent); 763 nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -775,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
775 mntput(nd->path.mnt); 779 mntput(nd->path.mnt);
776 nd->path.mnt = parent; 780 nd->path.mnt = parent;
777 } 781 }
778 follow_mount(&nd->path.mnt, &nd->path.dentry); 782 follow_mount(&nd->path);
779} 783}
780 784
781/* 785/*
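
With the root cached in the nameidata, follow_dotdot() compares against nd->root directly instead of taking fs->lock around every loop iteration; the cached path holds its own reference, so it cannot change for the duration of the walk. The termination test reduces to a plain comparison (sketch):

    /* ".." stops at the walk's root; no fs->lock needed because
     * nd->root is private to this lookup. */
    static bool at_walk_root(const struct nameidata *nd)
    {
            return nd->path.dentry == nd->root.dentry &&
                   nd->path.mnt == nd->root.mnt;
    }
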
@@ -853,7 +857,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 err = inode_permission(nd->path.dentry->d_inode, 857 err = inode_permission(nd->path.dentry->d_inode,
854 MAY_EXEC); 858 MAY_EXEC);
855 if (!err) 859 if (!err)
856 err = ima_path_check(&nd->path, MAY_EXEC); 860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
857 if (err) 862 if (err)
858 break; 863 break;
859 864
@@ -1016,25 +1021,23 @@ static int path_walk(const char *name, struct nameidata *nd)
1016 return link_path_walk(name, nd); 1021 return link_path_walk(name, nd);
1017} 1022}
1018 1023
1019/* Returns 0 and nd will be valid on success; returns an error otherwise. */ 1024static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1020static int do_path_lookup(int dfd, const char *name,
1021 unsigned int flags, struct nameidata *nd)
1022{ 1025{
1023 int retval = 0; 1026 int retval = 0;
1024 int fput_needed; 1027 int fput_needed;
1025 struct file *file; 1028 struct file *file;
1026 struct fs_struct *fs = current->fs;
1027 1029
1028 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1030 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1029 nd->flags = flags; 1031 nd->flags = flags;
1030 nd->depth = 0; 1032 nd->depth = 0;
1033 nd->root.mnt = NULL;
1031 1034
1032 if (*name=='/') { 1035 if (*name=='/') {
1033 read_lock(&fs->lock); 1036 set_root(nd);
1034 nd->path = fs->root; 1037 nd->path = nd->root;
1035 path_get(&fs->root); 1038 path_get(&nd->root);
1036 read_unlock(&fs->lock);
1037 } else if (dfd == AT_FDCWD) { 1039 } else if (dfd == AT_FDCWD) {
1040 struct fs_struct *fs = current->fs;
1038 read_lock(&fs->lock); 1041 read_lock(&fs->lock);
1039 nd->path = fs->pwd; 1042 nd->path = fs->pwd;
1040 path_get(&fs->pwd); 1043 path_get(&fs->pwd);
@@ -1062,17 +1065,29 @@ static int do_path_lookup(int dfd, const char *name,
1062 1065
1063 fput_light(file, fput_needed); 1066 fput_light(file, fput_needed);
1064 } 1067 }
1068 return 0;
1065 1069
1066 retval = path_walk(name, nd); 1070fput_fail:
1071 fput_light(file, fput_needed);
1072out_fail:
1073 return retval;
1074}
1075
1076/* Returns 0 and nd will be valid on success; returns an error otherwise. */
1077static int do_path_lookup(int dfd, const char *name,
1078 unsigned int flags, struct nameidata *nd)
1079{
1080 int retval = path_init(dfd, name, flags, nd);
1081 if (!retval)
1082 retval = path_walk(name, nd);
1067 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1083 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1068 nd->path.dentry->d_inode)) 1084 nd->path.dentry->d_inode))
1069 audit_inode(name, nd->path.dentry); 1085 audit_inode(name, nd->path.dentry);
1070out_fail: 1086 if (nd->root.mnt) {
1087 path_put(&nd->root);
1088 nd->root.mnt = NULL;
1089 }
1071 return retval; 1090 return retval;
1072
1073fput_fail:
1074 fput_light(file, fput_needed);
1075 goto out_fail;
1076} 1091}
1077 1092
1078int path_lookup(const char *name, unsigned int flags, 1093int path_lookup(const char *name, unsigned int flags,
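
Splitting do_path_lookup() into path_init() plus path_walk() lets do_filp_open() (further down) pick a starting directory once and then drive the walk itself. Note that nd->root is dropped only after path_walk() returns, since the walk may populate it via set_root() when it meets an absolute symlink. The composition, condensed (not verbatim):

    static int lookup_shape(int dfd, const char *name,
                            unsigned int flags, struct nameidata *nd)
    {
            int retval = path_init(dfd, name, flags, nd);   /* pick start dir */

            if (!retval)
                    retval = path_walk(name, nd);   /* may set nd->root */
            if (nd->root.mnt) {                     /* drop the cached root */
                    path_put(&nd->root);
                    nd->root.mnt = NULL;
            }
            return retval;
    }
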
@@ -1112,14 +1127,18 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1112 nd->path.dentry = dentry; 1127 nd->path.dentry = dentry;
1113 nd->path.mnt = mnt; 1128 nd->path.mnt = mnt;
1114 path_get(&nd->path); 1129 path_get(&nd->path);
1130 nd->root = nd->path;
1131 path_get(&nd->root);
1115 1132
1116 retval = path_walk(name, nd); 1133 retval = path_walk(name, nd);
1117 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1134 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1118 nd->path.dentry->d_inode)) 1135 nd->path.dentry->d_inode))
1119 audit_inode(name, nd->path.dentry); 1136 audit_inode(name, nd->path.dentry);
1120 1137
1121 return retval; 1138 path_put(&nd->root);
1139 nd->root.mnt = NULL;
1122 1140
1141 return retval;
1123} 1142}
1124 1143
1125/** 1144/**
@@ -1515,7 +1534,8 @@ int may_open(struct path *path, int acc_mode, int flag)
1515 return error; 1534 return error;
1516 1535
1517 error = ima_path_check(path, 1536 error = ima_path_check(path,
1518 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
1538 IMA_COUNT_UPDATE);
1519 if (error) 1539 if (error)
1520 return error; 1540 return error;
1521 /* 1541 /*
@@ -1674,9 +1694,17 @@ struct file *do_filp_open(int dfd, const char *pathname,
1674 /* 1694 /*
1675 * Create - we need to know the parent. 1695 * Create - we need to know the parent.
1676 */ 1696 */
1677 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 1697 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
1678 if (error) 1698 if (error)
1679 return ERR_PTR(error); 1699 return ERR_PTR(error);
1700 error = path_walk(pathname, &nd);
1701 if (error) {
1702 if (nd.root.mnt)
1703 path_put(&nd.root);
1704 return ERR_PTR(error);
1705 }
1706 if (unlikely(!audit_dummy_context()))
1707 audit_inode(pathname, nd.path.dentry);
1680 1708
1681 /* 1709 /*
1682 * We have the parent and last component. First of all, check 1710 * We have the parent and last component. First of all, check
@@ -1733,7 +1761,13 @@ do_last:
1733 goto exit; 1761 goto exit;
1734 } 1762 }
1735 filp = nameidata_to_filp(&nd, open_flag); 1763 filp = nameidata_to_filp(&nd, open_flag);
1764 if (IS_ERR(filp))
1765 ima_counts_put(&nd.path,
1766 acc_mode & (MAY_READ | MAY_WRITE |
1767 MAY_EXEC));
1736 mnt_drop_write(nd.path.mnt); 1768 mnt_drop_write(nd.path.mnt);
1769 if (nd.root.mnt)
1770 path_put(&nd.root);
1737 return filp; 1771 return filp;
1738 } 1772 }
1739 1773
@@ -1787,6 +1821,9 @@ ok:
1787 goto exit; 1821 goto exit;
1788 } 1822 }
1789 filp = nameidata_to_filp(&nd, open_flag); 1823 filp = nameidata_to_filp(&nd, open_flag);
1824 if (IS_ERR(filp))
1825 ima_counts_put(&nd.path,
1826 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
1790 /* 1827 /*
1791 * It is now safe to drop the mnt write 1828 * It is now safe to drop the mnt write
1792 * because the filp has had a write taken 1829 * because the filp has had a write taken
@@ -1794,6 +1831,8 @@ ok:
1794 */ 1831 */
1795 if (will_write) 1832 if (will_write)
1796 mnt_drop_write(nd.path.mnt); 1833 mnt_drop_write(nd.path.mnt);
1834 if (nd.root.mnt)
1835 path_put(&nd.root);
1797 return filp; 1836 return filp;
1798 1837
1799exit_mutex_unlock: 1838exit_mutex_unlock:
@@ -1804,6 +1843,8 @@ exit:
1804 if (!IS_ERR(nd.intent.open.file)) 1843 if (!IS_ERR(nd.intent.open.file))
1805 release_open_intent(&nd); 1844 release_open_intent(&nd);
1806exit_parent: 1845exit_parent:
1846 if (nd.root.mnt)
1847 path_put(&nd.root);
1807 path_put(&nd.path); 1848 path_put(&nd.path);
1808 return ERR_PTR(error); 1849 return ERR_PTR(error);
1809 1850
@@ -1832,6 +1873,8 @@ do_link:
1832 * with "intent.open". 1873 * with "intent.open".
1833 */ 1874 */
1834 release_open_intent(&nd); 1875 release_open_intent(&nd);
1876 if (nd.root.mnt)
1877 path_put(&nd.root);
1835 return ERR_PTR(error); 1878 return ERR_PTR(error);
1836 } 1879 }
1837 nd.flags &= ~LOOKUP_PARENT; 1880 nd.flags &= ~LOOKUP_PARENT;
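
The do_filp_open() hunks enforce two symmetries on every exit path: nd.root, if the walk cached it, is released before returning, and a failed nameidata_to_filp() is balanced with ima_counts_put() so the counters taken by ima_path_check() in may_open() do not leak. A sketch of the pairing (condensed, helper name ours):

    /* Whatever may_open()'s ima_path_check() counted must be returned
     * if no struct file materializes. */
    static struct file *finish_open_sketch(struct nameidata *nd,
                                           int open_flag, int acc_mode)
    {
            struct file *filp = nameidata_to_filp(nd, open_flag);

            if (IS_ERR(filp))
                    ima_counts_put(&nd->path,
                            acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
            if (nd->root.mnt)       /* always drop the cached root */
                    path_put(&nd->root);
            return filp;
    }
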
diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -22,6 +22,7 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/mnt_namespace.h> 23#include <linux/mnt_namespace.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/nsproxy.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include <linux/ramfs.h> 28#include <linux/ramfs.h>
@@ -42,6 +43,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
42static int event; 43static int event;
43static DEFINE_IDA(mnt_id_ida); 44static DEFINE_IDA(mnt_id_ida);
44static DEFINE_IDA(mnt_group_ida); 45static DEFINE_IDA(mnt_group_ida);
46static int mnt_id_start = 0;
47static int mnt_group_start = 1;
45 48
46static struct list_head *mount_hashtable __read_mostly; 49static struct list_head *mount_hashtable __read_mostly;
47static struct kmem_cache *mnt_cache __read_mostly; 50static struct kmem_cache *mnt_cache __read_mostly;
@@ -69,7 +72,9 @@ static int mnt_alloc_id(struct vfsmount *mnt)
69retry: 72retry:
70 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 73 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
71 spin_lock(&vfsmount_lock); 74 spin_lock(&vfsmount_lock);
72 res = ida_get_new(&mnt_id_ida, &mnt->mnt_id); 75 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
76 if (!res)
77 mnt_id_start = mnt->mnt_id + 1;
73 spin_unlock(&vfsmount_lock); 78 spin_unlock(&vfsmount_lock);
74 if (res == -EAGAIN) 79 if (res == -EAGAIN)
75 goto retry; 80 goto retry;
@@ -79,8 +84,11 @@ retry:
79 84
80static void mnt_free_id(struct vfsmount *mnt) 85static void mnt_free_id(struct vfsmount *mnt)
81{ 86{
87 int id = mnt->mnt_id;
82 spin_lock(&vfsmount_lock); 88 spin_lock(&vfsmount_lock);
83 ida_remove(&mnt_id_ida, mnt->mnt_id); 89 ida_remove(&mnt_id_ida, id);
90 if (mnt_id_start > id)
91 mnt_id_start = id;
84 spin_unlock(&vfsmount_lock); 92 spin_unlock(&vfsmount_lock);
85} 93}
86 94
@@ -91,10 +99,18 @@ static void mnt_free_id(struct vfsmount *mnt)
91 */ 99 */
92static int mnt_alloc_group_id(struct vfsmount *mnt) 100static int mnt_alloc_group_id(struct vfsmount *mnt)
93{ 101{
102 int res;
103
94 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) 104 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
95 return -ENOMEM; 105 return -ENOMEM;
96 106
97 return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id); 107 res = ida_get_new_above(&mnt_group_ida,
108 mnt_group_start,
109 &mnt->mnt_group_id);
110 if (!res)
111 mnt_group_start = mnt->mnt_group_id + 1;
112
113 return res;
98} 114}
99 115
100/* 116/*
@@ -102,7 +118,10 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)
102 */ 118 */
103void mnt_release_group_id(struct vfsmount *mnt) 119void mnt_release_group_id(struct vfsmount *mnt)
104{ 120{
105 ida_remove(&mnt_group_ida, mnt->mnt_group_id); 121 int id = mnt->mnt_group_id;
122 ida_remove(&mnt_group_ida, id);
123 if (mnt_group_start > id)
124 mnt_group_start = id;
106 mnt->mnt_group_id = 0; 125 mnt->mnt_group_id = 0;
107} 126}
108 127
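
Both allocators now feed a moving start hint into ida_get_new_above(): a successful allocation advances the hint past the ID just handed out, and freeing a lower ID rewinds it, so mount IDs stay dense and small rather than growing monotonically. The pattern, extracted into hypothetical helpers (caller provides the locking, as the hunks do with vfsmount_lock):

    static int hinted_ida_get(struct ida *ida, int *hint, int *out)
    {
            int res = ida_get_new_above(ida, *hint, out);

            if (!res)
                    *hint = *out + 1;       /* next search starts past us */
            return res;
    }

    static void hinted_ida_put(struct ida *ida, int *hint, int id)
    {
            ida_remove(ida, id);
            if (*hint > id)
                    *hint = id;             /* reuse freed low IDs first */
    }
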
@@ -131,10 +150,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
131 INIT_LIST_HEAD(&mnt->mnt_share); 150 INIT_LIST_HEAD(&mnt->mnt_share);
132 INIT_LIST_HEAD(&mnt->mnt_slave_list); 151 INIT_LIST_HEAD(&mnt->mnt_slave_list);
133 INIT_LIST_HEAD(&mnt->mnt_slave); 152 INIT_LIST_HEAD(&mnt->mnt_slave);
134 atomic_set(&mnt->__mnt_writers, 0); 153#ifdef CONFIG_SMP
154 mnt->mnt_writers = alloc_percpu(int);
155 if (!mnt->mnt_writers)
156 goto out_free_devname;
157#else
158 mnt->mnt_writers = 0;
159#endif
135 } 160 }
136 return mnt; 161 return mnt;
137 162
163#ifdef CONFIG_SMP
164out_free_devname:
165 kfree(mnt->mnt_devname);
166#endif
138out_free_id: 167out_free_id:
139 mnt_free_id(mnt); 168 mnt_free_id(mnt);
140out_free_cache: 169out_free_cache:
@@ -171,65 +200,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
171} 200}
172EXPORT_SYMBOL_GPL(__mnt_is_readonly); 201EXPORT_SYMBOL_GPL(__mnt_is_readonly);
173 202
174struct mnt_writer { 203static inline void inc_mnt_writers(struct vfsmount *mnt)
175 /* 204{
176 * If holding multiple instances of this lock, they 205#ifdef CONFIG_SMP
177 * must be ordered by cpu number. 206 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
178 */ 207#else
179 spinlock_t lock; 208 mnt->mnt_writers++;
180 struct lock_class_key lock_class; /* compiles out with !lockdep */ 209#endif
181 unsigned long count; 210}
182 struct vfsmount *mnt;
183} ____cacheline_aligned_in_smp;
184static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
185 211
186static int __init init_mnt_writers(void) 212static inline void dec_mnt_writers(struct vfsmount *mnt)
187{ 213{
188 int cpu; 214#ifdef CONFIG_SMP
189 for_each_possible_cpu(cpu) { 215 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
190 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); 216#else
191 spin_lock_init(&writer->lock); 217 mnt->mnt_writers--;
192 lockdep_set_class(&writer->lock, &writer->lock_class); 218#endif
193 writer->count = 0;
194 }
195 return 0;
196} 219}
197fs_initcall(init_mnt_writers);
198 220
199static void unlock_mnt_writers(void) 221static unsigned int count_mnt_writers(struct vfsmount *mnt)
200{ 222{
223#ifdef CONFIG_SMP
224 unsigned int count = 0;
201 int cpu; 225 int cpu;
202 struct mnt_writer *cpu_writer;
203 226
204 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
205 cpu_writer = &per_cpu(mnt_writers, cpu); 228 count += *per_cpu_ptr(mnt->mnt_writers, cpu);
206 spin_unlock(&cpu_writer->lock);
207 } 229 }
208}
209 230
210static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) 231 return count;
211{ 232#else
212 if (!cpu_writer->mnt) 233 return mnt->mnt_writers;
213 return; 234#endif
214 /*
215 * This is in case anyone ever leaves an invalid,
216 * old ->mnt and a count of 0.
217 */
218 if (!cpu_writer->count)
219 return;
220 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
221 cpu_writer->count = 0;
222}
223 /*
224 * must hold cpu_writer->lock
225 */
226static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
227 struct vfsmount *mnt)
228{
229 if (cpu_writer->mnt == mnt)
230 return;
231 __clear_mnt_count(cpu_writer);
232 cpu_writer->mnt = mnt;
233} 235}
234 236
235/* 237/*
@@ -253,74 +255,74 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
253int mnt_want_write(struct vfsmount *mnt) 255int mnt_want_write(struct vfsmount *mnt)
254{ 256{
255 int ret = 0; 257 int ret = 0;
256 struct mnt_writer *cpu_writer;
257 258
258 cpu_writer = &get_cpu_var(mnt_writers); 259 preempt_disable();
259 spin_lock(&cpu_writer->lock); 260 inc_mnt_writers(mnt);
261 /*
262 * The store from inc_mnt_writers must be visible before we enter the
263 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
264 * incremented count after it has set MNT_WRITE_HOLD.
265 */
266 smp_mb();
267 while (mnt->mnt_flags & MNT_WRITE_HOLD)
268 cpu_relax();
269 /*
270 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
271 * be set to match its requirements. So we must not load that until
272 * MNT_WRITE_HOLD is cleared.
273 */
274 smp_rmb();
260 if (__mnt_is_readonly(mnt)) { 275 if (__mnt_is_readonly(mnt)) {
276 dec_mnt_writers(mnt);
261 ret = -EROFS; 277 ret = -EROFS;
262 goto out; 278 goto out;
263 } 279 }
264 use_cpu_writer_for_mount(cpu_writer, mnt);
265 cpu_writer->count++;
266out: 280out:
267 spin_unlock(&cpu_writer->lock); 281 preempt_enable();
268 put_cpu_var(mnt_writers);
269 return ret; 282 return ret;
270} 283}
271EXPORT_SYMBOL_GPL(mnt_want_write); 284EXPORT_SYMBOL_GPL(mnt_want_write);
272 285
273static void lock_mnt_writers(void) 286/**
274{ 287 * mnt_clone_write - get write access to a mount
275 int cpu; 288 * @mnt: the mount on which to take a write
276 struct mnt_writer *cpu_writer; 289 *
277 290 * This is effectively like mnt_want_write, except
278 for_each_possible_cpu(cpu) { 291 * it must only be used to take an extra write reference
279 cpu_writer = &per_cpu(mnt_writers, cpu); 292 * on a mountpoint that we already know has a write reference
280 spin_lock(&cpu_writer->lock); 293 * on it. This allows some optimisation.
281 __clear_mnt_count(cpu_writer); 294 *
282 cpu_writer->mnt = NULL; 295 * After finished, mnt_drop_write must be called as usual to
283 } 296 * drop the reference.
297 */
298int mnt_clone_write(struct vfsmount *mnt)
299{
300 /* superblock may be r/o */
301 if (__mnt_is_readonly(mnt))
302 return -EROFS;
303 preempt_disable();
304 inc_mnt_writers(mnt);
305 preempt_enable();
306 return 0;
284} 307}
308EXPORT_SYMBOL_GPL(mnt_clone_write);
285 309
286/* 310/**
287 * These per-cpu write counts are not guaranteed to have 311 * mnt_want_write_file - get write access to a file's mount
288 * matched increments and decrements on any given cpu. 312 * @file: the file whose mount to take a write on
289 * A file open()ed for write on one cpu and close()d on 313 *
290 * another cpu will imbalance this count. Make sure it 314 * This is like mnt_want_write, but it takes a file and can
291 * does not get too far out of whack. 315 * do some optimisations if the file is open for write already
292 */ 316 */
293static void handle_write_count_underflow(struct vfsmount *mnt) 317int mnt_want_write_file(struct file *file)
294{ 318{
295 if (atomic_read(&mnt->__mnt_writers) >= 319 struct inode *inode = file->f_dentry->d_inode;
296 MNT_WRITER_UNDERFLOW_LIMIT) 320 if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
297 return; 321 return mnt_want_write(file->f_path.mnt);
298 /* 322 else
299 * It isn't necessary to hold all of the locks 323 return mnt_clone_write(file->f_path.mnt);
300 * at the same time, but doing it this way makes
301 * us share a lot more code.
302 */
303 lock_mnt_writers();
304 /*
305 * vfsmount_lock is for mnt_flags.
306 */
307 spin_lock(&vfsmount_lock);
308 /*
309 * If coalescing the per-cpu writer counts did not
310 * get us back to a positive writer count, we have
311 * a bug.
312 */
313 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
314 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
315 WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
316 "count: %d\n",
317 mnt, atomic_read(&mnt->__mnt_writers));
318 /* use the flag to keep the dmesg spam down */
319 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
320 }
321 spin_unlock(&vfsmount_lock);
322 unlock_mnt_writers();
323} 324}
325EXPORT_SYMBOL_GPL(mnt_want_write_file);
324 326
325/** 327/**
326 * mnt_drop_write - give up write access to a mount 328 * mnt_drop_write - give up write access to a mount
@@ -332,37 +334,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
332 */ 334 */
333void mnt_drop_write(struct vfsmount *mnt) 335void mnt_drop_write(struct vfsmount *mnt)
334{ 336{
335 int must_check_underflow = 0; 337 preempt_disable();
336 struct mnt_writer *cpu_writer; 338 dec_mnt_writers(mnt);
337 339 preempt_enable();
338 cpu_writer = &get_cpu_var(mnt_writers);
339 spin_lock(&cpu_writer->lock);
340
341 use_cpu_writer_for_mount(cpu_writer, mnt);
342 if (cpu_writer->count > 0) {
343 cpu_writer->count--;
344 } else {
345 must_check_underflow = 1;
346 atomic_dec(&mnt->__mnt_writers);
347 }
348
349 spin_unlock(&cpu_writer->lock);
350 /*
351 * Logically, we could call this each time,
352 * but the __mnt_writers cacheline tends to
353 * be cold, and makes this expensive.
354 */
355 if (must_check_underflow)
356 handle_write_count_underflow(mnt);
357 /*
358 * This could be done right after the spinlock
359 * is taken because the spinlock keeps us on
360 * the cpu, and disables preemption. However,
361 * putting it here bounds the amount that
362 * __mnt_writers can underflow. Without it,
363 * we could theoretically wrap __mnt_writers.
364 */
365 put_cpu_var(mnt_writers);
366} 340}
367EXPORT_SYMBOL_GPL(mnt_drop_write); 341EXPORT_SYMBOL_GPL(mnt_drop_write);
368 342
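
The fast path above is lock-free, so its correctness rests entirely on barrier pairing with mnt_make_readonly() (next hunk): the writer publishes its per-CPU increment before testing MNT_WRITE_HOLD, and must not read MNT_READONLY until the hold bit is clear. A restatement of mnt_want_write(), annotated only to show which barrier pairs with which:

    static int want_write_annotated(struct vfsmount *mnt)
    {
            int ret = 0;

            preempt_disable();
            inc_mnt_writers(mnt);           /* publish intent...          */
            smp_mb();                       /* ...pairs with the slowpath
                                             * smp_mb() after it sets
                                             * MNT_WRITE_HOLD             */
            while (mnt->mnt_flags & MNT_WRITE_HOLD)
                    cpu_relax();            /* slowpath is summing counts */
            smp_rmb();                      /* pairs with the slowpath
                                             * smp_wmb() before it clears
                                             * MNT_WRITE_HOLD             */
            if (__mnt_is_readonly(mnt)) {
                    dec_mnt_writers(mnt);   /* lost the race: back out    */
                    ret = -EROFS;
            }
            preempt_enable();
            return ret;
    }
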
@@ -370,24 +344,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
370{ 344{
371 int ret = 0; 345 int ret = 0;
372 346
373 lock_mnt_writers(); 347 spin_lock(&vfsmount_lock);
348 mnt->mnt_flags |= MNT_WRITE_HOLD;
374 /* 349 /*
375 * With all the locks held, this value is stable 350 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
351 * should be visible before we do.
376 */ 352 */
377 if (atomic_read(&mnt->__mnt_writers) > 0) { 353 smp_mb();
378 ret = -EBUSY; 354
379 goto out;
380 }
381 /* 355 /*
382 * nobody can do a successful mnt_want_write() with all 356 * With writers on hold, if this value is zero, then there are
383 * of the counts in MNT_DENIED_WRITE and the locks held. 357 * definitely no active writers (although held writers may subsequently
358 * increment the count, they'll have to wait, and decrement it after
359 * seeing MNT_READONLY).
360 *
361 * It is OK to have counter incremented on one CPU and decremented on
362 * another: the sum will add up correctly. The danger lies in the
363 * summation itself: if we read one CPU's counter before its increment
364 * lands, but read another CPU's counter after the matching decrement
365 * has already landed there, we would see more decrements than we should.
366 * MNT_WRITE_HOLD protects against this scenario, because
367 * mnt_want_write first increments count, then smp_mb, then spins on
368 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
369 * we're counting up here.
384 */ 370 */
385 spin_lock(&vfsmount_lock); 371 if (count_mnt_writers(mnt) > 0)
386 if (!ret) 372 ret = -EBUSY;
373 else
387 mnt->mnt_flags |= MNT_READONLY; 374 mnt->mnt_flags |= MNT_READONLY;
375 /*
376 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
377 * that become unheld will see MNT_READONLY.
378 */
379 smp_wmb();
380 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
388 spin_unlock(&vfsmount_lock); 381 spin_unlock(&vfsmount_lock);
389out:
390 unlock_mnt_writers();
391 return ret; 382 return ret;
392} 383}
393 384
@@ -410,6 +401,9 @@ void free_vfsmnt(struct vfsmount *mnt)
410{ 401{
411 kfree(mnt->mnt_devname); 402 kfree(mnt->mnt_devname);
412 mnt_free_id(mnt); 403 mnt_free_id(mnt);
404#ifdef CONFIG_SMP
405 free_percpu(mnt->mnt_writers);
406#endif
413 kmem_cache_free(mnt_cache, mnt); 407 kmem_cache_free(mnt_cache, mnt);
414} 408}
415 409
@@ -442,11 +436,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
442 * lookup_mnt increments the ref count before returning 436 * lookup_mnt increments the ref count before returning
443 * the vfsmount struct. 437 * the vfsmount struct.
444 */ 438 */
445struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 439struct vfsmount *lookup_mnt(struct path *path)
446{ 440{
447 struct vfsmount *child_mnt; 441 struct vfsmount *child_mnt;
448 spin_lock(&vfsmount_lock); 442 spin_lock(&vfsmount_lock);
449 if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) 443 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
450 mntget(child_mnt); 444 mntget(child_mnt);
451 spin_unlock(&vfsmount_lock); 445 spin_unlock(&vfsmount_lock);
452 return child_mnt; 446 return child_mnt;
@@ -604,38 +598,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
604 598
605static inline void __mntput(struct vfsmount *mnt) 599static inline void __mntput(struct vfsmount *mnt)
606{ 600{
607 int cpu;
608 struct super_block *sb = mnt->mnt_sb; 601 struct super_block *sb = mnt->mnt_sb;
609 /* 602 /*
610 * We don't have to hold all of the locks at the
611 * same time here because we know that we're the
612 * last reference to mnt and that no new writers
613 * can come in.
614 */
615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 spin_lock(&cpu_writer->lock);
618 if (cpu_writer->mnt != mnt) {
619 spin_unlock(&cpu_writer->lock);
620 continue;
621 }
622 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
623 cpu_writer->count = 0;
624 /*
625 * Might as well do this so that no one
626 * ever sees the pointer and expects
627 * it to be valid.
628 */
629 cpu_writer->mnt = NULL;
630 spin_unlock(&cpu_writer->lock);
631 }
632 /*
633 * This probably indicates that somebody messed 603 * This probably indicates that somebody messed
634 * up a mnt_want/drop_write() pair. If this 604 * up a mnt_want/drop_write() pair. If this
635 * happens, the filesystem was probably unable 605 * happens, the filesystem was probably unable
636 * to make r/w->r/o transitions. 606 * to make r/w->r/o transitions.
637 */ 607 */
638 WARN_ON(atomic_read(&mnt->__mnt_writers)); 608 /*
609 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
610 * provides barriers, so count_mnt_writers() below is safe. AV
611 */
612 WARN_ON(count_mnt_writers(mnt));
639 dput(mnt->mnt_root); 613 dput(mnt->mnt_root);
640 free_vfsmnt(mnt); 614 free_vfsmnt(mnt);
641 deactivate_super(sb); 615 deactivate_super(sb);
@@ -1106,11 +1080,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
1106 * we just try to remount it readonly. 1080 * we just try to remount it readonly.
1107 */ 1081 */
1108 down_write(&sb->s_umount); 1082 down_write(&sb->s_umount);
1109 if (!(sb->s_flags & MS_RDONLY)) { 1083 if (!(sb->s_flags & MS_RDONLY))
1110 lock_kernel();
1111 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1084 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1112 unlock_kernel();
1113 }
1114 up_write(&sb->s_umount); 1085 up_write(&sb->s_umount);
1115 return retval; 1086 return retval;
1116 } 1087 }
@@ -1253,11 +1224,11 @@ Enomem:
1253 return NULL; 1224 return NULL;
1254} 1225}
1255 1226
1256struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) 1227struct vfsmount *collect_mounts(struct path *path)
1257{ 1228{
1258 struct vfsmount *tree; 1229 struct vfsmount *tree;
1259 down_write(&namespace_sem); 1230 down_write(&namespace_sem);
1260 tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); 1231 tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
1261 up_write(&namespace_sem); 1232 up_write(&namespace_sem);
1262 return tree; 1233 return tree;
1263} 1234}
@@ -1430,7 +1401,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1430 goto out_unlock; 1401 goto out_unlock;
1431 1402
1432 err = -ENOENT; 1403 err = -ENOENT;
1433 if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) 1404 if (!d_unlinked(path->dentry))
1434 err = attach_recursive_mnt(mnt, path, NULL); 1405 err = attach_recursive_mnt(mnt, path, NULL);
1435out_unlock: 1406out_unlock:
1436 mutex_unlock(&path->dentry->d_inode->i_mutex); 1407 mutex_unlock(&path->dentry->d_inode->i_mutex);
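
Three call sites in this file (here, do_move_mount(), and pivot_root() below) converge on a single predicate, d_unlinked(), rather than open-coding the d_unhashed()/IS_ROOT() combination with inconsistent polarity at each site. For reference, the helper amounts to (from the companion dcache change in this series):

    static inline int d_unlinked(struct dentry *dentry)
    {
            return d_unhashed(dentry) && !IS_ROOT(dentry);
    }
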
@@ -1601,7 +1572,7 @@ static int do_move_mount(struct path *path, char *old_name)
1601 1572
1602 down_write(&namespace_sem); 1573 down_write(&namespace_sem);
1603 while (d_mountpoint(path->dentry) && 1574 while (d_mountpoint(path->dentry) &&
1604 follow_down(&path->mnt, &path->dentry)) 1575 follow_down(path))
1605 ; 1576 ;
1606 err = -EINVAL; 1577 err = -EINVAL;
1607 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1578 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1612,7 +1583,7 @@ static int do_move_mount(struct path *path, char *old_name)
1612 if (IS_DEADDIR(path->dentry->d_inode)) 1583 if (IS_DEADDIR(path->dentry->d_inode))
1613 goto out1; 1584 goto out1;
1614 1585
1615 if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) 1586 if (d_unlinked(path->dentry))
1616 goto out1; 1587 goto out1;
1617 1588
1618 err = -EINVAL; 1589 err = -EINVAL;
@@ -1676,7 +1647,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
1676 if (!capable(CAP_SYS_ADMIN)) 1647 if (!capable(CAP_SYS_ADMIN))
1677 return -EPERM; 1648 return -EPERM;
1678 1649
1650 lock_kernel();
1679 mnt = do_kern_mount(type, flags, name, data); 1651 mnt = do_kern_mount(type, flags, name, data);
1652 unlock_kernel();
1680 if (IS_ERR(mnt)) 1653 if (IS_ERR(mnt))
1681 return PTR_ERR(mnt); 1654 return PTR_ERR(mnt);
1682 1655
@@ -1695,10 +1668,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1695 down_write(&namespace_sem); 1668 down_write(&namespace_sem);
1696 /* Something was mounted here while we slept */ 1669 /* Something was mounted here while we slept */
1697 while (d_mountpoint(path->dentry) && 1670 while (d_mountpoint(path->dentry) &&
1698 follow_down(&path->mnt, &path->dentry)) 1671 follow_down(path))
1699 ; 1672 ;
1700 err = -EINVAL; 1673 err = -EINVAL;
1701 if (!check_mnt(path->mnt)) 1674 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1702 goto unlock; 1675 goto unlock;
1703 1676
1704 /* Refuse the same filesystem on the same mount point */ 1677 /* Refuse the same filesystem on the same mount point */
@@ -1984,6 +1957,21 @@ dput_out:
1984 return retval; 1957 return retval;
1985} 1958}
1986 1959
1960static struct mnt_namespace *alloc_mnt_ns(void)
1961{
1962 struct mnt_namespace *new_ns;
1963
1964 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
1965 if (!new_ns)
1966 return ERR_PTR(-ENOMEM);
1967 atomic_set(&new_ns->count, 1);
1968 new_ns->root = NULL;
1969 INIT_LIST_HEAD(&new_ns->list);
1970 init_waitqueue_head(&new_ns->poll);
1971 new_ns->event = 0;
1972 return new_ns;
1973}
1974
1987/* 1975/*
1988 * Allocate a new namespace structure and populate it with contents 1976 * Allocate a new namespace structure and populate it with contents
1989 * copied from the namespace of the passed in task structure. 1977 * copied from the namespace of the passed in task structure.
@@ -1995,14 +1983,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1995 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 1983 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
1996 struct vfsmount *p, *q; 1984 struct vfsmount *p, *q;
1997 1985
1998 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 1986 new_ns = alloc_mnt_ns();
1999 if (!new_ns) 1987 if (IS_ERR(new_ns))
2000 return ERR_PTR(-ENOMEM); 1988 return new_ns;
2001
2002 atomic_set(&new_ns->count, 1);
2003 INIT_LIST_HEAD(&new_ns->list);
2004 init_waitqueue_head(&new_ns->poll);
2005 new_ns->event = 0;
2006 1989
2007 down_write(&namespace_sem); 1990 down_write(&namespace_sem);
2008 /* First pass: copy the tree topology */ 1991 /* First pass: copy the tree topology */
@@ -2066,6 +2049,24 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2066 return new_ns; 2049 return new_ns;
2067} 2050}
2068 2051
2052/**
2053 * create_mnt_ns - creates a private namespace and adds a root filesystem
2054 * @mnt: pointer to the new root filesystem mountpoint
2055 */
2056struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2057{
2058 struct mnt_namespace *new_ns;
2059
2060 new_ns = alloc_mnt_ns();
2061 if (!IS_ERR(new_ns)) {
2062 mnt->mnt_ns = new_ns;
2063 new_ns->root = mnt;
2064 list_add(&new_ns->list, &new_ns->root->mnt_list);
2065 }
2066 return new_ns;
2067}
2068EXPORT_SYMBOL(create_mnt_ns);
2069
2069SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2070SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2070 char __user *, type, unsigned long, flags, void __user *, data) 2071 char __user *, type, unsigned long, flags, void __user *, data)
2071{ 2072{
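
alloc_mnt_ns() factors the bare allocation out of dup_mnt_ns(), and create_mnt_ns() builds on it to wrap a single mount into a fresh, private namespace; it is exported because kernel-internal mounters need it. A hypothetical caller, sketched:

    /* Illustrative only: mount a filesystem and give it its own
     * namespace in one step. */
    static struct mnt_namespace *make_private_ns(const char *fstype)
    {
            struct vfsmount *mnt = do_kern_mount(fstype, 0, fstype, NULL);

            if (IS_ERR(mnt))
                    return ERR_CAST(mnt);
            return create_mnt_ns(mnt);      /* the ns now owns the mount */
    }
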
@@ -2092,10 +2093,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2092 if (retval < 0) 2093 if (retval < 0)
2093 goto out3; 2094 goto out3;
2094 2095
2095 lock_kernel();
2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page, 2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
2097 flags, (void *)data_page); 2097 flags, (void *)data_page);
2098 unlock_kernel();
2099 free_page(data_page); 2098 free_page(data_page);
2100 2099
2101out3: 2100out3:
@@ -2175,9 +2174,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2175 error = -ENOENT; 2174 error = -ENOENT;
2176 if (IS_DEADDIR(new.dentry->d_inode)) 2175 if (IS_DEADDIR(new.dentry->d_inode))
2177 goto out2; 2176 goto out2;
2178 if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) 2177 if (d_unlinked(new.dentry))
2179 goto out2; 2178 goto out2;
2180 if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) 2179 if (d_unlinked(old.dentry))
2181 goto out2; 2180 goto out2;
2182 error = -EBUSY; 2181 error = -EBUSY;
2183 if (new.mnt == root.mnt || 2182 if (new.mnt == root.mnt ||
@@ -2243,16 +2242,9 @@ static void __init init_mount_tree(void)
2243 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2242 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2244 if (IS_ERR(mnt)) 2243 if (IS_ERR(mnt))
2245 panic("Can't create rootfs"); 2244 panic("Can't create rootfs");
2246 ns = kmalloc(sizeof(*ns), GFP_KERNEL); 2245 ns = create_mnt_ns(mnt);
2247 if (!ns) 2246 if (IS_ERR(ns))
2248 panic("Can't allocate initial namespace"); 2247 panic("Can't allocate initial namespace");
2249 atomic_set(&ns->count, 1);
2250 INIT_LIST_HEAD(&ns->list);
2251 init_waitqueue_head(&ns->poll);
2252 ns->event = 0;
2253 list_add(&mnt->mnt_list, &ns->list);
2254 ns->root = mnt;
2255 mnt->mnt_ns = ns;
2256 2248
2257 init_task.nsproxy->mnt_ns = ns; 2249 init_task.nsproxy->mnt_ns = ns;
2258 get_mnt_ns(ns); 2250 get_mnt_ns(ns);
@@ -2295,10 +2287,14 @@ void __init mnt_init(void)
2295 init_mount_tree(); 2287 init_mount_tree();
2296} 2288}
2297 2289
2298void __put_mnt_ns(struct mnt_namespace *ns) 2290void put_mnt_ns(struct mnt_namespace *ns)
2299{ 2291{
2300 struct vfsmount *root = ns->root; 2292 struct vfsmount *root;
2301 LIST_HEAD(umount_list); 2293 LIST_HEAD(umount_list);
2294
2295 if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
2296 return;
2297 root = ns->root;
2302 ns->root = NULL; 2298 ns->root = NULL;
2303 spin_unlock(&vfsmount_lock); 2299 spin_unlock(&vfsmount_lock);
2304 down_write(&namespace_sem); 2300 down_write(&namespace_sem);
@@ -2309,3 +2305,4 @@ void __put_mnt_ns(struct mnt_namespace *ns)
2309 release_mounts(&umount_list); 2305 release_mounts(&umount_list);
2310 kfree(ns); 2306 kfree(ns);
2311} 2307}
2308EXPORT_SYMBOL(put_mnt_ns);
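
put_mnt_ns() absorbs the old __put_mnt_ns() entry point by using atomic_dec_and_lock(): the spinlock is acquired only on the final 1 -> 0 transition, atomically with the decrement, so a concurrent lookup under vfsmount_lock can never revive a namespace that teardown has already claimed. The general shape of the idiom (types hypothetical):

    struct some_obj {                       /* stand-in for mnt_namespace */
            atomic_t count;
            void *published;
    };

    static void put_ref(struct some_obj *o, spinlock_t *lock)
    {
            if (!atomic_dec_and_lock(&o->count, lock))
                    return;                 /* not the last reference */
            /* count is zero and 'lock' is held: unpublish safely */
            o->published = NULL;
            spin_unlock(lock);
            kfree(o);
    }
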
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d642f0e5b365..b99ce205b1bd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb)
736{ 736{
737 struct ncp_server *server = NCP_SBP(sb); 737 struct ncp_server *server = NCP_SBP(sb);
738 738
739 lock_kernel();
740
739 ncp_lock_server(server); 741 ncp_lock_server(server);
740 ncp_disconnect(server); 742 ncp_disconnect(server);
741 ncp_unlock_server(server); 743 ncp_unlock_server(server);
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb)
769 vfree(server->packet); 771 vfree(server->packet);
770 sb->s_fs_info = NULL; 772 sb->s_fs_info = NULL;
771 kfree(server); 773 kfree(server);
774
775 unlock_kernel();
772} 776}
773 777
774static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) 778static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 97645f112114..0ec6237a5970 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
1113 1113
1114 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { 1114 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
1115 int k; 1115 int k;
1116 unicode_t u;
1116 1117
1117 k = utf8_mbtowc(&ec, iname, iname_end - iname); 1118 k = utf8_to_utf32(iname, iname_end - iname, &u);
1118 if (k < 0) 1119 if (k < 0 || u > MAX_WCHAR_T)
1119 return -EINVAL; 1120 return -EINVAL;
1120 iname += k; 1121 iname += k;
1122 ec = u;
1121 } else { 1123 } else {
1122 if (*iname == NCP_ESC) { 1124 if (*iname == NCP_ESC) {
1123 int k; 1125 int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
1214 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { 1216 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
1215 int k; 1217 int k;
1216 1218
1217 k = utf8_wctomb(iname, ec, iname_end - iname); 1219 k = utf32_to_utf8(ec, iname, iname_end - iname);
1218 if (k < 0) { 1220 if (k < 0) {
1219 err = -ENAMETOOLONG; 1221 err = -ENAMETOOLONG;
1220 goto quit; 1222 goto quit;
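
The old utf8_mbtowc()/utf8_wctomb() pair decoded into a 16-bit wchar_t and so could not represent code points beyond the BMP; utf8_to_utf32()/utf32_to_utf8() decode a full unicode_t and push the range check to the caller, which is exactly the u > MAX_WCHAR_T test added above. The caller-side contract, sketched with an illustrative helper:

    /* Decode one UTF-8 sequence into a 16-bit unit, rejecting code
     * points that do not fit. */
    static int decode_one_wchar(const unsigned char *s, int len, __u16 *out)
    {
            unicode_t u;
            int k = utf8_to_utf32(s, len, &u);

            if (k < 0 || u > MAX_WCHAR_T)
                    return -EINVAL; /* malformed, or beyond the BMP */
            *out = u;               /* safe: fits in 16 bits */
            return k;               /* bytes consumed */
    }
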
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index e67f3ec07736..2a77bc25d5af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,6 @@
1config NFS_FS 1config NFS_FS
2 tristate "NFS client support" 2 tristate "NFS client support"
3 depends on INET 3 depends on INET && FILE_LOCKING
4 select LOCKD 4 select LOCKD
5 select SUNRPC 5 select SUNRPC
6 select NFS_ACL_SUPPORT if NFS_V3_ACL 6 select NFS_ACL_SUPPORT if NFS_V3_ACL
@@ -74,6 +74,15 @@ config NFS_V4
74 74
75 If unsure, say N. 75 If unsure, say N.
76 76
77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
79 depends on NFS_V4 && EXPERIMENTAL
80 help
81 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
83
84 Unless you're an NFS developer, say N.
85
77config ROOT_NFS 86config ROOT_NFS
78 bool "Root file system on NFS" 87 bool "Root file system on NFS"
79 depends on NFS_FS=y && IP_PNP 88 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a886e692ddd0..7f604c7941fb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,6 +17,9 @@
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h> 19#include <linux/sunrpc/svcauth_gss.h>
20#if defined(CONFIG_NFS_V4_1)
21#include <linux/sunrpc/bc_xprt.h>
22#endif
20 23
21#include <net/inet_sock.h> 24#include <net/inet_sock.h>
22 25
@@ -28,11 +31,12 @@
28 31
29struct nfs_callback_data { 32struct nfs_callback_data {
30 unsigned int users; 33 unsigned int users;
34 struct svc_serv *serv;
31 struct svc_rqst *rqst; 35 struct svc_rqst *rqst;
32 struct task_struct *task; 36 struct task_struct *task;
33}; 37};
34 38
35static struct nfs_callback_data nfs_callback_info; 39static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
36static DEFINE_MUTEX(nfs_callback_mutex); 40static DEFINE_MUTEX(nfs_callback_mutex);
37static struct svc_program nfs4_callback_program; 41static struct svc_program nfs4_callback_program;
38 42
@@ -56,10 +60,10 @@ module_param_call(callback_tcpport, param_set_port, param_get_int,
56 &nfs_callback_set_tcpport, 0644); 60 &nfs_callback_set_tcpport, 0644);
57 61
58/* 62/*
59 * This is the callback kernel thread. 63 * This is the NFSv4 callback kernel thread.
60 */ 64 */
61static int 65static int
62nfs_callback_svc(void *vrqstp) 66nfs4_callback_svc(void *vrqstp)
63{ 67{
64 int err, preverr = 0; 68 int err, preverr = 0;
65 struct svc_rqst *rqstp = vrqstp; 69 struct svc_rqst *rqstp = vrqstp;
@@ -97,20 +101,12 @@ nfs_callback_svc(void *vrqstp)
97} 101}
98 102
99/* 103/*
100 * Bring up the callback thread if it is not already up. 104 * Prepare to bring up the NFSv4 callback service
101 */ 105 */
102int nfs_callback_up(void) 106struct svc_rqst *
107nfs4_callback_up(struct svc_serv *serv)
103{ 108{
104 struct svc_serv *serv = NULL; 109 int ret;
105 int ret = 0;
106
107 mutex_lock(&nfs_callback_mutex);
108 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
109 goto out;
110 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
111 ret = -ENOMEM;
112 if (!serv)
113 goto out_err;
114 110
115 ret = svc_create_xprt(serv, "tcp", PF_INET, 111 ret = svc_create_xprt(serv, "tcp", PF_INET,
116 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 112 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
@@ -127,27 +123,174 @@ int nfs_callback_up(void)
127 nfs_callback_tcpport6 = ret; 123 nfs_callback_tcpport6 = ret;
128 dprintk("NFS: Callback listener port = %u (af %u)\n", 124 dprintk("NFS: Callback listener port = %u (af %u)\n",
129 nfs_callback_tcpport6, PF_INET6); 125 nfs_callback_tcpport6, PF_INET6);
130 } else if (ret != -EAFNOSUPPORT) 126 } else if (ret == -EAFNOSUPPORT)
127 ret = 0;
128 else
131 goto out_err; 129 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ 130#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
133 131
134 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 132 return svc_prepare_thread(serv, &serv->sv_pools[0]);
135 if (IS_ERR(nfs_callback_info.rqst)) { 133
136 ret = PTR_ERR(nfs_callback_info.rqst); 134out_err:
137 nfs_callback_info.rqst = NULL; 135 if (ret == 0)
136 ret = -ENOMEM;
137 return ERR_PTR(ret);
138}
139
140#if defined(CONFIG_NFS_V4_1)
141/*
142 * The callback service for NFSv4.1 callbacks
143 */
144static int
145nfs41_callback_svc(void *vrqstp)
146{
147 struct svc_rqst *rqstp = vrqstp;
148 struct svc_serv *serv = rqstp->rq_server;
149 struct rpc_rqst *req;
150 int error;
151 DEFINE_WAIT(wq);
152
153 set_freezable();
154
155 /*
156 * FIXME: do we really need to run this under the BKL? If so, please
157 * add a comment about what it's intended to protect.
158 */
159 lock_kernel();
160 while (!kthread_should_stop()) {
161 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
162 spin_lock_bh(&serv->sv_cb_lock);
163 if (!list_empty(&serv->sv_cb_list)) {
164 req = list_first_entry(&serv->sv_cb_list,
165 struct rpc_rqst, rq_bc_list);
166 list_del(&req->rq_bc_list);
167 spin_unlock_bh(&serv->sv_cb_lock);
168 dprintk("Invoking bc_svc_process()\n");
169 error = bc_svc_process(serv, req, rqstp);
170 dprintk("bc_svc_process() returned w/ error code= %d\n",
171 error);
172 } else {
173 spin_unlock_bh(&serv->sv_cb_lock);
174 schedule();
175 }
176 finish_wait(&serv->sv_cb_waitq, &wq);
177 }
178 unlock_kernel();
179 return 0;
180}
181
182/*
183 * Bring up the NFSv4.1 callback service
184 */
185struct svc_rqst *
186nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
187{
188 struct svc_xprt *bc_xprt;
189 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM);
190
191 dprintk("--> %s\n", __func__);
192 /* Create a svc_sock for the service */
193 bc_xprt = svc_sock_create(serv, xprt->prot);
194 if (!bc_xprt)
195 goto out;
196
197 /*
198 * Save the svc_serv in the transport so that it can
199 * be referenced when the session backchannel is initialized
200 */
201 serv->bc_xprt = bc_xprt;
202 xprt->bc_serv = serv;
203
204 INIT_LIST_HEAD(&serv->sv_cb_list);
205 spin_lock_init(&serv->sv_cb_lock);
206 init_waitqueue_head(&serv->sv_cb_waitq);
207 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
208 if (IS_ERR(rqstp))
209 svc_sock_destroy(bc_xprt);
210out:
211 dprintk("--> %s return %p\n", __func__, rqstp);
212 return rqstp;
213}
214
215static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
216 struct svc_serv *serv, struct rpc_xprt *xprt,
217 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
218{
219 if (minorversion) {
220 *rqstpp = nfs41_callback_up(serv, xprt);
221 *callback_svc = nfs41_callback_svc;
222 }
223 return minorversion;
224}
225
226static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
227 struct nfs_callback_data *cb_info)
228{
229 if (minorversion)
230 xprt->bc_serv = cb_info->serv;
231}
232#else
233static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
234 struct svc_serv *serv, struct rpc_xprt *xprt,
235 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
236{
237 return 0;
238}
239
240static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
241 struct nfs_callback_data *cb_info)
242{
243}
244#endif /* CONFIG_NFS_V4_1 */
245
246/*
247 * Bring up the callback thread if it is not already up.
248 */
249int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
250{
251 struct svc_serv *serv = NULL;
252 struct svc_rqst *rqstp;
253 int (*callback_svc)(void *vrqstp);
254 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
255 char svc_name[12];
256 int ret = 0;
257 int minorversion_setup;
258
259 mutex_lock(&nfs_callback_mutex);
260 if (cb_info->users++ || cb_info->task != NULL) {
261 nfs_callback_bc_serv(minorversion, xprt, cb_info);
262 goto out;
263 }
264 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
265 if (!serv) {
266 ret = -ENOMEM;
267 goto out_err;
268 }
269
270 minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion,
271 serv, xprt, &rqstp, &callback_svc);
272 if (!minorversion_setup) {
273 /* v4.0 callback setup */
274 rqstp = nfs4_callback_up(serv);
275 callback_svc = nfs4_callback_svc;
276 }
277
278 if (IS_ERR(rqstp)) {
279 ret = PTR_ERR(rqstp);
138 goto out_err; 280 goto out_err;
139 } 281 }
140 282
141 svc_sock_update_bufs(serv); 283 svc_sock_update_bufs(serv);
142 284
143 nfs_callback_info.task = kthread_run(nfs_callback_svc, 285 sprintf(svc_name, "nfsv4.%u-svc", minorversion);
144 nfs_callback_info.rqst, 286 cb_info->serv = serv;
145 "nfsv4-svc"); 287 cb_info->rqst = rqstp;
146 if (IS_ERR(nfs_callback_info.task)) { 288 cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
147 ret = PTR_ERR(nfs_callback_info.task); 289 if (IS_ERR(cb_info->task)) {
148 svc_exit_thread(nfs_callback_info.rqst); 290 ret = PTR_ERR(cb_info->task);
149 nfs_callback_info.rqst = NULL; 291 svc_exit_thread(cb_info->rqst);
150 nfs_callback_info.task = NULL; 292 cb_info->rqst = NULL;
293 cb_info->task = NULL;
151 goto out_err; 294 goto out_err;
152 } 295 }
153out: 296out:
@@ -164,22 +307,25 @@ out:
164out_err: 307out_err:
165 dprintk("NFS: Couldn't create callback socket or server thread; " 308 dprintk("NFS: Couldn't create callback socket or server thread; "
166 "err = %d\n", ret); 309 "err = %d\n", ret);
167 nfs_callback_info.users--; 310 cb_info->users--;
168 goto out; 311 goto out;
169} 312}
170 313
171/* 314/*
172 * Kill the callback thread if it's no longer being used. 315 * Kill the callback thread if it's no longer being used.
173 */ 316 */
174void nfs_callback_down(void) 317void nfs_callback_down(int minorversion)
175{ 318{
319 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
320
176 mutex_lock(&nfs_callback_mutex); 321 mutex_lock(&nfs_callback_mutex);
177 nfs_callback_info.users--; 322 cb_info->users--;
178 if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) { 323 if (cb_info->users == 0 && cb_info->task != NULL) {
179 kthread_stop(nfs_callback_info.task); 324 kthread_stop(cb_info->task);
180 svc_exit_thread(nfs_callback_info.rqst); 325 svc_exit_thread(cb_info->rqst);
181 nfs_callback_info.rqst = NULL; 326 cb_info->serv = NULL;
182 nfs_callback_info.task = NULL; 327 cb_info->rqst = NULL;
328 cb_info->task = NULL;
183 } 329 }
184 mutex_unlock(&nfs_callback_mutex); 330 mutex_unlock(&nfs_callback_mutex);
185} 331}
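
nfs_callback_up() now keys everything off the minor version: one nfs_callback_data slot per minor version, refcounted under nfs_callback_mutex, with the thread function and transport setup chosen by nfs_minorversion_callback_svc_setup() (a stub returning 0 when CONFIG_NFS_V4_1 is off). The selection reduces to roughly this (condensed; helper name ours):

    static struct svc_rqst *callback_up_for(u32 minorversion,
                                            struct svc_serv *serv,
                                            struct rpc_xprt *xprt,
                                            int (**svc_fn)(void *))
    {
            if (minorversion) {             /* v4.1: shared backchannel */
                    *svc_fn = nfs41_callback_svc;
                    return nfs41_callback_up(serv, xprt);
            }
            *svc_fn = nfs4_callback_svc;    /* v4.0: own TCP listener */
            return nfs4_callback_up(serv);
    }
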
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index e110e286a262..07baa8254ca1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -20,13 +20,24 @@ enum nfs4_callback_procnum {
20enum nfs4_callback_opnum { 20enum nfs4_callback_opnum {
21 OP_CB_GETATTR = 3, 21 OP_CB_GETATTR = 3,
22 OP_CB_RECALL = 4, 22 OP_CB_RECALL = 4,
23/* Callback operations new to NFSv4.1 */
24 OP_CB_LAYOUTRECALL = 5,
25 OP_CB_NOTIFY = 6,
26 OP_CB_PUSH_DELEG = 7,
27 OP_CB_RECALL_ANY = 8,
28 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
29 OP_CB_RECALL_SLOT = 10,
30 OP_CB_SEQUENCE = 11,
31 OP_CB_WANTS_CANCELLED = 12,
32 OP_CB_NOTIFY_LOCK = 13,
33 OP_CB_NOTIFY_DEVICEID = 14,
23 OP_CB_ILLEGAL = 10044, 34 OP_CB_ILLEGAL = 10044,
24}; 35};
25 36
26struct cb_compound_hdr_arg { 37struct cb_compound_hdr_arg {
27 unsigned int taglen; 38 unsigned int taglen;
28 const char *tag; 39 const char *tag;
29 unsigned int callback_ident; 40 unsigned int minorversion;
30 unsigned nops; 41 unsigned nops;
31}; 42};
32 43
@@ -59,16 +70,59 @@ struct cb_recallargs {
59 uint32_t truncate; 70 uint32_t truncate;
60}; 71};
61 72
73#if defined(CONFIG_NFS_V4_1)
74
75struct referring_call {
76 uint32_t rc_sequenceid;
77 uint32_t rc_slotid;
78};
79
80struct referring_call_list {
81 struct nfs4_sessionid rcl_sessionid;
82 uint32_t rcl_nrefcalls;
83 struct referring_call *rcl_refcalls;
84};
85
86struct cb_sequenceargs {
87 struct sockaddr *csa_addr;
88 struct nfs4_sessionid csa_sessionid;
89 uint32_t csa_sequenceid;
90 uint32_t csa_slotid;
91 uint32_t csa_highestslotid;
92 uint32_t csa_cachethis;
93 uint32_t csa_nrclists;
94 struct referring_call_list *csa_rclists;
95};
96
97struct cb_sequenceres {
98 __be32 csr_status;
99 struct nfs4_sessionid csr_sessionid;
100 uint32_t csr_sequenceid;
101 uint32_t csr_slotid;
102 uint32_t csr_highestslotid;
103 uint32_t csr_target_highestslotid;
104};
105
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res);
108
109#endif /* CONFIG_NFS_V4_1 */
110
62extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 111extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
63extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 112extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
64 113
65#ifdef CONFIG_NFS_V4 114#ifdef CONFIG_NFS_V4
66extern int nfs_callback_up(void); 115extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
67extern void nfs_callback_down(void); 116extern void nfs_callback_down(int minorversion);
68#else 117#endif /* CONFIG_NFS_V4 */
69#define nfs_callback_up() (0) 118
70#define nfs_callback_down() do {} while(0) 119/*
71#endif 120 * nfs41: Callbacks are expected to not cause substantial latency,
121 * so we limit their concurrency to 1 by setting up the maximum number
122 * of slots for the backchannel.
123 */
124#define NFS41_BC_MIN_CALLBACKS 1
125#define NFS41_BC_MAX_CALLBACKS 1
72 126
73extern unsigned int nfs_callback_set_tcpport; 127extern unsigned int nfs_callback_set_tcpport;
74extern unsigned short nfs_callback_tcpport; 128extern unsigned short nfs_callback_tcpport;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f7e83e23cf9f..b7da1f54da68 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -101,3 +101,130 @@ out:
101 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 101 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
102 return res; 102 return res;
103} 103}
104
105#if defined(CONFIG_NFS_V4_1)
106
107/*
108 * Validate the sequenceID sent by the server.
109 * Return success if the sequenceID is one more than what we last saw on
110 * this slot, accounting for wraparound. Increments the slot's sequence.
111 *
112 * We don't yet implement a duplicate request cache, so at this time
113 * we will log replays, and process them as if we had not seen them before,
114 * but we don't bump the sequence in the slot. Not too worried about it,
115 * since we only currently implement idempotent callbacks anyway.
116 *
117 * We have a single slot backchannel at this time, so we don't bother
118 * checking the used_slots bit array on the table. The lower layer guarantees
119 * a single outstanding callback request at a time.
120 */
121static int
122validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
123{
124 struct nfs4_slot *slot;
125
126 dprintk("%s enter. slotid %d seqid %d\n",
127 __func__, slotid, seqid);
128
129 if (slotid > NFS41_BC_MAX_CALLBACKS)
130 return htonl(NFS4ERR_BADSLOT);
131
132 slot = tbl->slots + slotid;
133 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
134
135 /* Normal */
136 if (likely(seqid == slot->seq_nr + 1)) {
137 slot->seq_nr++;
138 return htonl(NFS4_OK);
139 }
140
141 /* Replay */
142 if (seqid == slot->seq_nr) {
143 dprintk("%s seqid %d is a replay - no DRC available\n",
144 __func__, seqid);
145 return htonl(NFS4_OK);
146 }
147
148 /* Wraparound */
149 if (seqid == 1 && (slot->seq_nr + 1) == 0) {
150 slot->seq_nr = 1;
151 return htonl(NFS4_OK);
152 }
153
154 /* Misordered request */
155 return htonl(NFS4ERR_SEQ_MISORDERED);
156}
157
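
Concretely, with slot->seq_nr holding the last sequence ID accepted on the slot, validate_seqid() above resolves as follows (worked examples, not exhaustive):

    /*
     * slot->seq_nr = 4,          seqid = 5 -> NFS4_OK, slot becomes 5
     * slot->seq_nr = 5,          seqid = 5 -> NFS4_OK (replay; no DRC,
     *                                         slot unchanged)
     * slot->seq_nr = 0xffffffff, seqid = 1 -> NFS4_OK (wraparound,
     *                                         slot becomes 1)
     * slot->seq_nr = 5,          seqid = 7 -> NFS4ERR_SEQ_MISORDERED
     */
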
158/*
159 * Returns a pointer to a held 'struct nfs_client' that matches the server's
160 * address, major version number, and session ID. It is the caller's
161 * responsibility to release the returned reference.
162 *
163 * Returns NULL if there are no connections with sessions, or if no session
164 * matches the one of interest.
165 */
166static struct nfs_client *find_client_with_session(
167 const struct sockaddr *addr, u32 nfsversion,
168 struct nfs4_sessionid *sessionid)
169{
170 struct nfs_client *clp;
171
172 clp = nfs_find_client(addr, 4);
173 if (clp == NULL)
174 return NULL;
175
176 do {
177 struct nfs_client *prev = clp;
178
179 if (clp->cl_session != NULL) {
180 if (memcmp(clp->cl_session->sess_id.data,
181 sessionid->data,
182 NFS4_MAX_SESSIONID_LEN) == 0) {
183 /* Returns a held reference to clp */
184 return clp;
185 }
186 }
187 clp = nfs_find_client_next(prev);
188 nfs_put_client(prev);
189 } while (clp != NULL);
190
191 return NULL;
192}
193
194/* FIXME: referring calls should be processed */
195unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
196 struct cb_sequenceres *res)
197{
198 struct nfs_client *clp;
199 int i, status;
200
201 for (i = 0; i < args->csa_nrclists; i++)
202 kfree(args->csa_rclists[i].rcl_refcalls);
203 kfree(args->csa_rclists);
204
205 status = htonl(NFS4ERR_BADSESSION);
206 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
207 if (clp == NULL)
208 goto out;
209
210 status = validate_seqid(&clp->cl_session->bc_slot_table,
211 args->csa_slotid, args->csa_sequenceid);
212 if (status)
213 goto out_putclient;
214
215 memcpy(&res->csr_sessionid, &args->csa_sessionid,
216 sizeof(res->csr_sessionid));
217 res->csr_sequenceid = args->csa_sequenceid;
218 res->csr_slotid = args->csa_slotid;
219 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
220 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
221
222out_putclient:
223 nfs_put_client(clp);
224out:
225 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
226 res->csr_status = status;
227 return res->csr_status;
228}
229
230#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index dd0ef34b5845..e5a2dac5f715 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -20,6 +20,11 @@
20 2 + 2 + 3 + 3) 20 2 + 2 + 3 + 3)
21#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 21#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
22 22
23#if defined(CONFIG_NFS_V4_1)
24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
25 4 + 1 + 3)
26#endif /* CONFIG_NFS_V4_1 */
27
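Reply sizes here are counted in 4-byte XDR words: 4 words for the 16-byte session ID plus one word each for sequenceid, slotid, highestslotid and target_highestslotid. A small check of that arithmetic (the op-header word count below is an assumption for the demo, not the actual value of CB_OP_HDR_RES_MAXSZ):

    #include <stdio.h>

    #define XDR_UNIT        4
    #define OP_HDR_WORDS    2   /* assumed opcode + status header */
    #define SEQ_RES_WORDS   (OP_HDR_WORDS + 4 + 1 + 3)

    int main(void)
    {
        printf("CB_SEQUENCE reply max: %d words = %d bytes\n",
               SEQ_RES_WORDS, SEQ_RES_WORDS * XDR_UNIT);
        return 0;
    }
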
23#define NFSDBG_FACILITY NFSDBG_CALLBACK 28#define NFSDBG_FACILITY NFSDBG_CALLBACK
24 29
25typedef __be32 (*callback_process_op_t)(void *, void *); 30typedef __be32 (*callback_process_op_t)(void *, void *);
@@ -132,7 +137,6 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
132static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) 137static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
133{ 138{
134 __be32 *p; 139 __be32 *p;
135 unsigned int minor_version;
136 __be32 status; 140 __be32 status;
137 141
138 status = decode_string(xdr, &hdr->taglen, &hdr->tag); 142 status = decode_string(xdr, &hdr->taglen, &hdr->tag);
@@ -147,15 +151,19 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
147 p = read_buf(xdr, 12); 151 p = read_buf(xdr, 12);
148 if (unlikely(p == NULL)) 152 if (unlikely(p == NULL))
149 return htonl(NFS4ERR_RESOURCE); 153 return htonl(NFS4ERR_RESOURCE);
150 minor_version = ntohl(*p++); 154 hdr->minorversion = ntohl(*p++);
151 /* Check minor version is zero. */ 155 /* Check minor version is zero or one. */
152 if (minor_version != 0) { 156 if (hdr->minorversion <= 1) {
153 printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n", 157 p++; /* skip callback_ident */
154 __func__, minor_version); 158 } else {
159 printk(KERN_WARNING "%s: NFSv4 server callback with "
160 "illegal minor version %u!\n",
161 __func__, hdr->minorversion);
155 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 162 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
156 } 163 }
157 hdr->callback_ident = ntohl(*p++);
158 hdr->nops = ntohl(*p); 164 hdr->nops = ntohl(*p);
165 dprintk("%s: minorversion %d nops %d\n", __func__,
166 hdr->minorversion, hdr->nops);
159 return 0; 167 return 0;
160} 168}
161 169
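The 12 bytes read by decode_compound_hdr_arg() carry three big-endian words: minorversion, callback_ident and nops. The change above keeps the callback_ident word in the stream for both minor versions but only v4.0 semantics ever use it. A simplified stand-alone model of that decode:

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    struct hdr { uint32_t minorversion, nops; };

    /* Decode minorversion / callback_ident / nops from three big-endian
     * words, skipping callback_ident as the patched kernel does. */
    static int decode_hdr(const uint32_t wire[3], struct hdr *h)
    {
        h->minorversion = ntohl(wire[0]);
        if (h->minorversion > 1)
            return -1;          /* NFS4ERR_MINOR_VERS_MISMATCH */
        /* wire[1] is callback_ident: meaningful to v4.0, skipped here */
        h->nops = ntohl(wire[2]);
        return 0;
    }

    int main(void)
    {
        uint32_t wire[3] = { htonl(1), htonl(0xdead), htonl(2) };
        struct hdr h;

        if (decode_hdr(wire, &h) == 0)
            printf("minorversion %u nops %u\n", h.minorversion, h.nops);
        return 0;
    }
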
@@ -204,6 +212,122 @@ out:
204 return status; 212 return status;
205} 213}
206 214
215#if defined(CONFIG_NFS_V4_1)
216
217static unsigned decode_sessionid(struct xdr_stream *xdr,
218 struct nfs4_sessionid *sid)
219{
220 uint32_t *p;
221 int len = NFS4_MAX_SESSIONID_LEN;
222
223 p = read_buf(xdr, len);
224 if (unlikely(p == NULL))
225 return htonl(NFS4ERR_RESOURCE);
226
227 memcpy(sid->data, p, len);
228 return 0;
229}
230
231static unsigned decode_rc_list(struct xdr_stream *xdr,
232 struct referring_call_list *rc_list)
233{
234 uint32_t *p;
235 int i;
236 unsigned status;
237
238 status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
239 if (status)
240 goto out;
241
242 status = htonl(NFS4ERR_RESOURCE);
243 p = read_buf(xdr, sizeof(uint32_t));
244 if (unlikely(p == NULL))
245 goto out;
246
247 rc_list->rcl_nrefcalls = ntohl(*p++);
248 if (rc_list->rcl_nrefcalls) {
249 p = read_buf(xdr,
250 rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
251 if (unlikely(p == NULL))
252 goto out;
253 rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls *
254 sizeof(*rc_list->rcl_refcalls),
255 GFP_KERNEL);
256 if (unlikely(rc_list->rcl_refcalls == NULL))
257 goto out;
258 for (i = 0; i < rc_list->rcl_nrefcalls; i++) {
259 rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
260 rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
261 }
262 }
263 status = 0;
264
265out:
266 return status;
267}
268
269static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp,
270 struct xdr_stream *xdr,
271 struct cb_sequenceargs *args)
272{
273 uint32_t *p;
274 int i;
275 unsigned status;
276
277 status = decode_sessionid(xdr, &args->csa_sessionid);
278 if (status)
279 goto out;
280
281 status = htonl(NFS4ERR_RESOURCE);
282 p = read_buf(xdr, 5 * sizeof(uint32_t));
283 if (unlikely(p == NULL))
284 goto out;
285
286 args->csa_addr = svc_addr(rqstp);
287 args->csa_sequenceid = ntohl(*p++);
288 args->csa_slotid = ntohl(*p++);
289 args->csa_highestslotid = ntohl(*p++);
290 args->csa_cachethis = ntohl(*p++);
291 args->csa_nrclists = ntohl(*p++);
292 args->csa_rclists = NULL;
293 if (args->csa_nrclists) {
294 args->csa_rclists = kmalloc(args->csa_nrclists *
295 sizeof(*args->csa_rclists),
296 GFP_KERNEL);
297 if (unlikely(args->csa_rclists == NULL))
298 goto out;
299
300 for (i = 0; i < args->csa_nrclists; i++) {
301 status = decode_rc_list(xdr, &args->csa_rclists[i]);
302 if (status)
303 goto out_free;
304 }
305 }
306 status = 0;
307
308 dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
309 "highestslotid %u cachethis %d nrclists %u\n",
310 __func__,
311 ((u32 *)&args->csa_sessionid)[0],
312 ((u32 *)&args->csa_sessionid)[1],
313 ((u32 *)&args->csa_sessionid)[2],
314 ((u32 *)&args->csa_sessionid)[3],
315 args->csa_sequenceid, args->csa_slotid,
316 args->csa_highestslotid, args->csa_cachethis,
317 args->csa_nrclists);
318out:
319 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
320 return status;
321
322out_free:
323 for (i = 0; i < args->csa_nrclists; i++)
324 kfree(args->csa_rclists[i].rcl_refcalls);
325 kfree(args->csa_rclists);
326 goto out;
327}
328
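decode_cb_sequence_args() shows the usual unwind pattern for decoding an array of variable-length records: a mid-loop failure must free every sublist built so far plus the outer array. A compact user-space model (all names invented; the kernel variant above walks the full csa_nrclists range instead):

    #include <stdlib.h>

    struct list { int *items; };

    /* Stands in for decode_rc_list(); nonzero on failure. */
    static int fill_one(struct list *l)
    {
        l->items = malloc(4 * sizeof(int));
        return l->items == NULL;
    }

    static struct list *decode_lists(int n)
    {
        struct list *v = malloc(n * sizeof(*v));
        int i;

        if (v == NULL)
            return NULL;
        for (i = 0; i < n; i++) {
            if (fill_one(&v[i]))
                goto out_free;
        }
        return v;

    out_free:
        while (i-- > 0)             /* free only the sublists already built */
            free(v[i].items);
        free(v);
        return NULL;
    }

    int main(void)
    {
        struct list *v = decode_lists(3);
        int i;

        if (v != NULL) {
            for (i = 0; i < 3; i++)
                free(v[i].items);
            free(v);
        }
        return 0;
    }
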
329#endif /* CONFIG_NFS_V4_1 */
330
207static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 331static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
208{ 332{
209 __be32 *p; 333 __be32 *p;
@@ -353,31 +477,134 @@ out:
353 return status; 477 return status;
354} 478}
355 479
356static __be32 process_op(struct svc_rqst *rqstp, 480#if defined(CONFIG_NFS_V4_1)
481
482static unsigned encode_sessionid(struct xdr_stream *xdr,
483 const struct nfs4_sessionid *sid)
484{
485 uint32_t *p;
486 int len = NFS4_MAX_SESSIONID_LEN;
487
488 p = xdr_reserve_space(xdr, len);
489 if (unlikely(p == NULL))
490 return htonl(NFS4ERR_RESOURCE);
491
492 memcpy(p, sid, len);
493 return 0;
494}
495
496static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp,
497 struct xdr_stream *xdr,
498 const struct cb_sequenceres *res)
499{
500 uint32_t *p;
501 unsigned status = res->csr_status;
502
503 if (unlikely(status != 0))
504 goto out;
505
506 encode_sessionid(xdr, &res->csr_sessionid);
507
508 p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
509 if (unlikely(p == NULL))
510 return htonl(NFS4ERR_RESOURCE);
511
512 *p++ = htonl(res->csr_sequenceid);
513 *p++ = htonl(res->csr_slotid);
514 *p++ = htonl(res->csr_highestslotid);
515 *p++ = htonl(res->csr_target_highestslotid);
516out:
517 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
518 return status;
519}
520
521static __be32
522preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
523{
524 if (op_nr == OP_CB_SEQUENCE) {
525 if (nop != 0)
526 return htonl(NFS4ERR_SEQUENCE_POS);
527 } else {
528 if (nop == 0)
529 return htonl(NFS4ERR_OP_NOT_IN_SESSION);
530 }
531
532 switch (op_nr) {
533 case OP_CB_GETATTR:
534 case OP_CB_RECALL:
535 case OP_CB_SEQUENCE:
536 *op = &callback_ops[op_nr];
537 break;
538
539 case OP_CB_LAYOUTRECALL:
540 case OP_CB_NOTIFY_DEVICEID:
541 case OP_CB_NOTIFY:
542 case OP_CB_PUSH_DELEG:
543 case OP_CB_RECALL_ANY:
544 case OP_CB_RECALLABLE_OBJ_AVAIL:
545 case OP_CB_RECALL_SLOT:
546 case OP_CB_WANTS_CANCELLED:
547 case OP_CB_NOTIFY_LOCK:
548 return htonl(NFS4ERR_NOTSUPP);
549
550 default:
551 return htonl(NFS4ERR_OP_ILLEGAL);
552 }
553
554 return htonl(NFS_OK);
555}
556
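The position rule enforced above comes from the v4.1 sessions design: CB_SEQUENCE must be the first operation of a compound and may not appear later. Reduced to a pure predicate (a sketch, not the kernel's return-code plumbing):

    #include <assert.h>

    /* Returns nonzero when (position, opcode) violates the sessions rule. */
    static int bad_op_position(int position, int is_sequence)
    {
        if (is_sequence)
            return position != 0;   /* NFS4ERR_SEQUENCE_POS when not first */
        return position == 0;       /* NFS4ERR_OP_NOT_IN_SESSION when first */
    }

    int main(void)
    {
        assert(!bad_op_position(0, 1)); /* CB_SEQUENCE leading the compound: ok */
        assert(bad_op_position(2, 1));  /* CB_SEQUENCE in the middle: error */
        assert(bad_op_position(0, 0));  /* any other op first: error */
        return 0;
    }
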
557#else /* CONFIG_NFS_V4_1 */
558
559static __be32
560preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
561{
562 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
563}
564
565#endif /* CONFIG_NFS_V4_1 */
566
567static __be32
568preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
569{
570 switch (op_nr) {
571 case OP_CB_GETATTR:
572 case OP_CB_RECALL:
573 *op = &callback_ops[op_nr];
574 break;
575 default:
576 return htonl(NFS4ERR_OP_ILLEGAL);
577 }
578
579 return htonl(NFS_OK);
580}
581
582static __be32 process_op(uint32_t minorversion, int nop,
583 struct svc_rqst *rqstp,
357 struct xdr_stream *xdr_in, void *argp, 584 struct xdr_stream *xdr_in, void *argp,
358 struct xdr_stream *xdr_out, void *resp) 585 struct xdr_stream *xdr_out, void *resp)
359{ 586{
360 struct callback_op *op = &callback_ops[0]; 587 struct callback_op *op = &callback_ops[0];
361 unsigned int op_nr = OP_CB_ILLEGAL; 588 unsigned int op_nr = OP_CB_ILLEGAL;
362 __be32 status = 0; 589 __be32 status;
363 long maxlen; 590 long maxlen;
364 __be32 res; 591 __be32 res;
365 592
366 dprintk("%s: start\n", __func__); 593 dprintk("%s: start\n", __func__);
367 status = decode_op_hdr(xdr_in, &op_nr); 594 status = decode_op_hdr(xdr_in, &op_nr);
368 if (likely(status == 0)) { 595 if (unlikely(status)) {
369 switch (op_nr) { 596 status = htonl(NFS4ERR_OP_ILLEGAL);
370 case OP_CB_GETATTR: 597 goto out;
371 case OP_CB_RECALL:
372 op = &callback_ops[op_nr];
373 break;
374 default:
375 op_nr = OP_CB_ILLEGAL;
376 op = &callback_ops[0];
377 status = htonl(NFS4ERR_OP_ILLEGAL);
378 }
379 } 598 }
380 599
600 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
601 __func__, minorversion, nop, op_nr);
602
603 status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
604 preprocess_nfs4_op(op_nr, &op);
605 if (status == htonl(NFS4ERR_OP_ILLEGAL))
606 op_nr = OP_CB_ILLEGAL;
607out:
381 maxlen = xdr_out->end - xdr_out->p; 608 maxlen = xdr_out->end - xdr_out->p;
382 if (maxlen > 0 && maxlen < PAGE_SIZE) { 609 if (maxlen > 0 && maxlen < PAGE_SIZE) {
383 if (likely(status == 0 && op->decode_args != NULL)) 610 if (likely(status == 0 && op->decode_args != NULL))
@@ -425,7 +652,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
425 return rpc_system_err; 652 return rpc_system_err;
426 653
427 while (status == 0 && nops != hdr_arg.nops) { 654 while (status == 0 && nops != hdr_arg.nops) {
428 status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp); 655 status = process_op(hdr_arg.minorversion, nops,
656 rqstp, &xdr_in, argp, &xdr_out, resp);
429 nops++; 657 nops++;
430 } 658 }
431 659
@@ -452,7 +680,15 @@ static struct callback_op callback_ops[] = {
452 .process_op = (callback_process_op_t)nfs4_callback_recall, 680 .process_op = (callback_process_op_t)nfs4_callback_recall,
453 .decode_args = (callback_decode_arg_t)decode_recall_args, 681 .decode_args = (callback_decode_arg_t)decode_recall_args,
454 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 682 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
455 } 683 },
684#if defined(CONFIG_NFS_V4_1)
685 [OP_CB_SEQUENCE] = {
686 .process_op = (callback_process_op_t)nfs4_callback_sequence,
687 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
688 .encode_res = (callback_encode_res_t)encode_cb_sequence_res,
689 .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
690 },
691#endif /* CONFIG_NFS_V4_1 */
456}; 692};
457 693
458/* 694/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 75c9cd2aa119..8d25ccb2d51d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -37,6 +37,7 @@
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <net/ipv6.h> 38#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 39#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42 43
@@ -102,6 +103,7 @@ struct nfs_client_initdata {
102 size_t addrlen; 103 size_t addrlen;
103 const struct nfs_rpc_ops *rpc_ops; 104 const struct nfs_rpc_ops *rpc_ops;
104 int proto; 105 int proto;
106 u32 minorversion;
105}; 107};
106 108
107/* 109/*
@@ -114,18 +116,13 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
114{ 116{
115 struct nfs_client *clp; 117 struct nfs_client *clp;
116 struct rpc_cred *cred; 118 struct rpc_cred *cred;
119 int err = -ENOMEM;
117 120
118 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) 121 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
119 goto error_0; 122 goto error_0;
120 123
121 clp->rpc_ops = cl_init->rpc_ops; 124 clp->rpc_ops = cl_init->rpc_ops;
122 125
123 if (cl_init->rpc_ops->version == 4) {
124 if (nfs_callback_up() < 0)
125 goto error_2;
126 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
127 }
128
129 atomic_set(&clp->cl_count, 1); 126 atomic_set(&clp->cl_count, 1);
130 clp->cl_cons_state = NFS_CS_INITING; 127 clp->cl_cons_state = NFS_CS_INITING;
131 128
@@ -133,9 +130,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
133 clp->cl_addrlen = cl_init->addrlen; 130 clp->cl_addrlen = cl_init->addrlen;
134 131
135 if (cl_init->hostname) { 132 if (cl_init->hostname) {
133 err = -ENOMEM;
136 clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); 134 clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
137 if (!clp->cl_hostname) 135 if (!clp->cl_hostname)
138 goto error_3; 136 goto error_cleanup;
139 } 137 }
140 138
141 INIT_LIST_HEAD(&clp->cl_superblocks); 139 INIT_LIST_HEAD(&clp->cl_superblocks);
@@ -150,6 +148,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
150 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 148 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
151 clp->cl_boot_time = CURRENT_TIME; 149 clp->cl_boot_time = CURRENT_TIME;
152 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; 150 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
151 clp->cl_minorversion = cl_init->minorversion;
153#endif 152#endif
154 cred = rpc_lookup_machine_cred(); 153 cred = rpc_lookup_machine_cred();
155 if (!IS_ERR(cred)) 154 if (!IS_ERR(cred))
@@ -159,13 +158,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
159 158
160 return clp; 159 return clp;
161 160
162error_3: 161error_cleanup:
163 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
164 nfs_callback_down();
165error_2:
166 kfree(clp); 162 kfree(clp);
167error_0: 163error_0:
168 return NULL; 164 return ERR_PTR(err);
169} 165}
170 166
171static void nfs4_shutdown_client(struct nfs_client *clp) 167static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -182,12 +178,42 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
182} 178}
183 179
184/* 180/*
181 * Destroy the NFS4 callback service
182 */
183static void nfs4_destroy_callback(struct nfs_client *clp)
184{
185#ifdef CONFIG_NFS_V4
186 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
187 nfs_callback_down(clp->cl_minorversion);
188#endif /* CONFIG_NFS_V4 */
189}
190
191/*
192 * Clears/puts all minor version specific parts from an nfs_client struct
193 * reverting it to minorversion 0.
194 */
195static void nfs4_clear_client_minor_version(struct nfs_client *clp)
196{
197#ifdef CONFIG_NFS_V4_1
198 if (nfs4_has_session(clp)) {
199 nfs4_destroy_session(clp->cl_session);
200 clp->cl_session = NULL;
201 }
202
203 clp->cl_call_sync = _nfs4_call_sync;
204#endif /* CONFIG_NFS_V4_1 */
205
206 nfs4_destroy_callback(clp);
207}
208
209/*
185 * Destroy a shared client record 210 * Destroy a shared client record
186 */ 211 */
187static void nfs_free_client(struct nfs_client *clp) 212static void nfs_free_client(struct nfs_client *clp)
188{ 213{
189 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); 214 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
190 215
216 nfs4_clear_client_minor_version(clp);
191 nfs4_shutdown_client(clp); 217 nfs4_shutdown_client(clp);
192 218
193 nfs_fscache_release_client_cookie(clp); 219 nfs_fscache_release_client_cookie(clp);
@@ -196,9 +222,6 @@ static void nfs_free_client(struct nfs_client *clp)
196 if (!IS_ERR(clp->cl_rpcclient)) 222 if (!IS_ERR(clp->cl_rpcclient))
197 rpc_shutdown_client(clp->cl_rpcclient); 223 rpc_shutdown_client(clp->cl_rpcclient);
198 224
199 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
200 nfs_callback_down();
201
202 if (clp->cl_machine_cred != NULL) 225 if (clp->cl_machine_cred != NULL)
203 put_rpccred(clp->cl_machine_cred); 226 put_rpccred(clp->cl_machine_cred);
204 227
@@ -347,7 +370,8 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
347 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 370 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
348 371
349 /* Don't match clients that failed to initialise properly */ 372 /* Don't match clients that failed to initialise properly */
350 if (clp->cl_cons_state != NFS_CS_READY) 373 if (!(clp->cl_cons_state == NFS_CS_READY ||
374 clp->cl_cons_state == NFS_CS_SESSION_INITING))
351 continue; 375 continue;
352 376
353 /* Different NFS versions cannot share the same nfs_client */ 377 /* Different NFS versions cannot share the same nfs_client */
@@ -420,7 +444,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
420 444
421 if (clp->cl_proto != data->proto) 445 if (clp->cl_proto != data->proto)
422 continue; 446 continue;
423 447 /* Match nfsv4 minorversion */
448 if (clp->cl_minorversion != data->minorversion)
449 continue;
424 /* Match the full socket address */ 450 /* Match the full socket address */
425 if (!nfs_sockaddr_cmp(sap, clap)) 451 if (!nfs_sockaddr_cmp(sap, clap))
426 continue; 452 continue;
@@ -456,9 +482,10 @@ static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_in
456 spin_unlock(&nfs_client_lock); 482 spin_unlock(&nfs_client_lock);
457 483
458 new = nfs_alloc_client(cl_init); 484 new = nfs_alloc_client(cl_init);
459 } while (new); 485 } while (!IS_ERR(new));
460 486
461 return ERR_PTR(-ENOMEM); 487 dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new));
488 return new;
462 489
463 /* install a new client and return with it unready */ 490 /* install a new client and return with it unready */
464install_client: 491install_client:
@@ -478,7 +505,7 @@ found_client:
478 nfs_free_client(new); 505 nfs_free_client(new);
479 506
480 error = wait_event_killable(nfs_client_active_wq, 507 error = wait_event_killable(nfs_client_active_wq,
481 clp->cl_cons_state != NFS_CS_INITING); 508 clp->cl_cons_state < NFS_CS_INITING);
482 if (error < 0) { 509 if (error < 0) {
483 nfs_put_client(clp); 510 nfs_put_client(clp);
484 return ERR_PTR(-ERESTARTSYS); 511 return ERR_PTR(-ERESTARTSYS);
@@ -499,13 +526,29 @@ found_client:
499/* 526/*
500 * Mark a server as ready or failed 527 * Mark a server as ready or failed
501 */ 528 */
502static void nfs_mark_client_ready(struct nfs_client *clp, int state) 529void nfs_mark_client_ready(struct nfs_client *clp, int state)
503{ 530{
504 clp->cl_cons_state = state; 531 clp->cl_cons_state = state;
505 wake_up_all(&nfs_client_active_wq); 532 wake_up_all(&nfs_client_active_wq);
506} 533}
507 534
508/* 535/*
536 * With sessions, the client is not marked ready until after a
537 * successful EXCHANGE_ID and CREATE_SESSION.
538 *
539 * Map cl_cons_state errors to EPROTONOSUPPORT to indicate
540 * other versions of NFS can be tried.
541 */
542int nfs4_check_client_ready(struct nfs_client *clp)
543{
544 if (!nfs4_has_session(clp))
545 return 0;
546 if (clp->cl_cons_state < NFS_CS_READY)
547 return -EPROTONOSUPPORT;
548 return 0;
549}
550
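A stand-alone model of nfs4_check_client_ready(), assuming the cl_cons_state convention visible in this patch: NFS_CS_READY is zero, in-progress states are positive, and a failed initialisation leaves a negative errno in cl_cons_state:

    #include <errno.h>
    #include <stdio.h>

    #define CS_READY 0  /* mirrors NFS_CS_READY; failures are negative errnos */

    static int check_client_ready(int has_session, int cons_state)
    {
        if (!has_session)
            return 0;               /* v4.0: no session to wait for */
        if (cons_state < CS_READY)  /* EXCHANGE_ID/CREATE_SESSION failed */
            return -EPROTONOSUPPORT;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", check_client_ready(1, -EIO));    /* setup failed */
        printf("%d\n", check_client_ready(1, CS_READY));
        return 0;
    }
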
551/*
509 * Initialise the timeout values for a connection 552 * Initialise the timeout values for a connection
510 */ 553 */
511static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, 554static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
@@ -1050,6 +1093,61 @@ error:
1050 1093
1051#ifdef CONFIG_NFS_V4 1094#ifdef CONFIG_NFS_V4
1052/* 1095/*
1096 * Initialize the NFS4 callback service
1097 */
1098static int nfs4_init_callback(struct nfs_client *clp)
1099{
1100 int error;
1101
1102 if (clp->rpc_ops->version == 4) {
1103 if (nfs4_has_session(clp)) {
1104 error = xprt_setup_backchannel(
1105 clp->cl_rpcclient->cl_xprt,
1106 NFS41_BC_MIN_CALLBACKS);
1107 if (error < 0)
1108 return error;
1109 }
1110
1111 error = nfs_callback_up(clp->cl_minorversion,
1112 clp->cl_rpcclient->cl_xprt);
1113 if (error < 0) {
1114 dprintk("%s: failed to start callback. Error = %d\n",
1115 __func__, error);
1116 return error;
1117 }
1118 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
1119 }
1120 return 0;
1121}
1122
1123/*
1124 * Initialize the minor version specific parts of an NFS4 client record
1125 */
1126static int nfs4_init_client_minor_version(struct nfs_client *clp)
1127{
1128 clp->cl_call_sync = _nfs4_call_sync;
1129
1130#if defined(CONFIG_NFS_V4_1)
1131 if (clp->cl_minorversion) {
1132 struct nfs4_session *session = NULL;
1133 /*
1134 * Create the session and mark it expired.
1135 * When a SEQUENCE operation encounters the expired session
1136 * it will do session recovery to initialize it.
1137 */
1138 session = nfs4_alloc_session(clp);
1139 if (!session)
1140 return -ENOMEM;
1141
1142 clp->cl_session = session;
1143 clp->cl_call_sync = _nfs4_call_sync_session;
1144 }
1145#endif /* CONFIG_NFS_V4_1 */
1146
1147 return nfs4_init_callback(clp);
1148}
1149
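Allocating the session "expired" defers the real CREATE_SESSION exchange to the first SEQUENCE call, which notices the unestablished session and drives recovery. A toy model of that lazy handshake (not the kernel API):

    #include <stdio.h>

    struct session { int established; };

    /* First use finds the session unestablished and performs the
     * CREATE_SESSION step; later calls go straight through. */
    static int do_sequence(struct session *s)
    {
        if (!s->established) {
            printf("recovering: CREATE_SESSION\n");
            s->established = 1;
        }
        printf("SEQUENCE ok\n");
        return 0;
    }

    int main(void)
    {
        struct session s = { 0 };   /* allocated "expired", as in the patch */
        do_sequence(&s);            /* first SEQUENCE drives recovery */
        do_sequence(&s);            /* normal path from then on */
        return 0;
    }
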
1150/*
1053 * Initialise an NFS4 client record 1151 * Initialise an NFS4 client record
1054 */ 1152 */
1055static int nfs4_init_client(struct nfs_client *clp, 1153static int nfs4_init_client(struct nfs_client *clp,
@@ -1083,7 +1181,12 @@ static int nfs4_init_client(struct nfs_client *clp,
1083 } 1181 }
1084 __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); 1182 __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
1085 1183
1086 nfs_mark_client_ready(clp, NFS_CS_READY); 1184 error = nfs4_init_client_minor_version(clp);
1185 if (error < 0)
1186 goto error;
1187
1188 if (!nfs4_has_session(clp))
1189 nfs_mark_client_ready(clp, NFS_CS_READY);
1087 return 0; 1190 return 0;
1088 1191
1089error: 1192error:
@@ -1101,7 +1204,8 @@ static int nfs4_set_client(struct nfs_server *server,
1101 const size_t addrlen, 1204 const size_t addrlen,
1102 const char *ip_addr, 1205 const char *ip_addr,
1103 rpc_authflavor_t authflavour, 1206 rpc_authflavor_t authflavour,
1104 int proto, const struct rpc_timeout *timeparms) 1207 int proto, const struct rpc_timeout *timeparms,
1208 u32 minorversion)
1105{ 1209{
1106 struct nfs_client_initdata cl_init = { 1210 struct nfs_client_initdata cl_init = {
1107 .hostname = hostname, 1211 .hostname = hostname,
@@ -1109,6 +1213,7 @@ static int nfs4_set_client(struct nfs_server *server,
1109 .addrlen = addrlen, 1213 .addrlen = addrlen,
1110 .rpc_ops = &nfs_v4_clientops, 1214 .rpc_ops = &nfs_v4_clientops,
1111 .proto = proto, 1215 .proto = proto,
1216 .minorversion = minorversion,
1112 }; 1217 };
1113 struct nfs_client *clp; 1218 struct nfs_client *clp;
1114 int error; 1219 int error;
@@ -1137,6 +1242,22 @@ error:
1137 return error; 1242 return error;
1138} 1243}
1139 1244
1245
1246/*
1247 * Session has been established, and the client marked ready.
1248 * Set the mount rsize and wsize with negotiated fore channel
1249 * attributes, which will be bounds-checked in nfs_server_set_fsinfo.
1250 */
1251static void nfs4_session_set_rwsize(struct nfs_server *server)
1252{
1253#ifdef CONFIG_NFS_V4_1
1254 if (!nfs4_has_session(server->nfs_client))
1255 return;
1256 server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
1257 server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz;
1258#endif /* CONFIG_NFS_V4_1 */
1259}
1260
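With sessions, transfer sizes follow the fore channel: reads are bounded by the largest reply the channel carries (max_resp_sz), writes by the largest request (max_rqst_sz). A sketch of the later clamp against what the fsinfo probe reports (the figures below are made up for illustration):

    #include <stdio.h>

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned int max_resp_sz = 1049620;   /* sample fore channel attrs */
        unsigned int max_rqst_sz = 1049620;
        unsigned int fsinfo_max  = 1048576;   /* e.g. rtmax/wtmax from the probe */

        /* rsize rides in replies, wsize in requests; nfs_server_set_fsinfo
         * later clamps both against the filesystem's own limits. */
        unsigned int rsize = min_u(max_resp_sz, fsinfo_max);
        unsigned int wsize = min_u(max_rqst_sz, fsinfo_max);

        printf("rsize=%u wsize=%u\n", rsize, wsize);
        return 0;
    }
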
1140/* 1261/*
1141 * Create a version 4 volume record 1262 * Create a version 4 volume record
1142 */ 1263 */
@@ -1164,7 +1285,8 @@ static int nfs4_init_server(struct nfs_server *server,
1164 data->client_address, 1285 data->client_address,
1165 data->auth_flavors[0], 1286 data->auth_flavors[0],
1166 data->nfs_server.protocol, 1287 data->nfs_server.protocol,
1167 &timeparms); 1288 &timeparms,
1289 data->minorversion);
1168 if (error < 0) 1290 if (error < 0)
1169 goto error; 1291 goto error;
1170 1292
@@ -1214,6 +1336,10 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1214 BUG_ON(!server->nfs_client->rpc_ops); 1336 BUG_ON(!server->nfs_client->rpc_ops);
1215 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1337 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1216 1338
1339 error = nfs4_init_session(server);
1340 if (error < 0)
1341 goto error;
1342
1217 /* Probe the root fh to retrieve its FSID */ 1343 /* Probe the root fh to retrieve its FSID */
1218 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 1344 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
1219 if (error < 0) 1345 if (error < 0)
@@ -1224,6 +1350,8 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1224 (unsigned long long) server->fsid.minor); 1350 (unsigned long long) server->fsid.minor);
1225 dprintk("Mount FH: %d\n", mntfh->size); 1351 dprintk("Mount FH: %d\n", mntfh->size);
1226 1352
1353 nfs4_session_set_rwsize(server);
1354
1227 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1355 error = nfs_probe_fsinfo(server, mntfh, &fattr);
1228 if (error < 0) 1356 if (error < 0)
1229 goto error; 1357 goto error;
@@ -1282,7 +1410,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1282 parent_client->cl_ipaddr, 1410 parent_client->cl_ipaddr,
1283 data->authflavor, 1411 data->authflavor,
1284 parent_server->client->cl_xprt->prot, 1412 parent_server->client->cl_xprt->prot,
1285 parent_server->client->cl_timeout); 1413 parent_server->client->cl_timeout,
1414 parent_client->cl_minorversion);
1286 if (error < 0) 1415 if (error < 0)
1287 goto error; 1416 goto error;
1288 1417
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 968225a88015..6dd48a4405b4 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14 15
15#include <linux/nfs4.h> 16#include <linux/nfs4.h>
@@ -68,29 +69,26 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
68{ 69{
69 struct inode *inode = state->inode; 70 struct inode *inode = state->inode;
70 struct file_lock *fl; 71 struct file_lock *fl;
71 int status; 72 int status = 0;
73
74 if (inode->i_flock == NULL)
75 goto out;
72 76
77 /* Protect inode->i_flock using the BKL */
78 lock_kernel();
73 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 79 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
74 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 80 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
75 continue; 81 continue;
76 if (nfs_file_open_context(fl->fl_file) != ctx) 82 if (nfs_file_open_context(fl->fl_file) != ctx)
77 continue; 83 continue;
84 unlock_kernel();
78 status = nfs4_lock_delegation_recall(state, fl); 85 status = nfs4_lock_delegation_recall(state, fl);
79 if (status >= 0) 86 if (status < 0)
80 continue; 87 goto out;
81 switch (status) { 88 lock_kernel();
82 default:
83 printk(KERN_ERR "%s: unhandled error %d.\n",
84 __func__, status);
85 case -NFS4ERR_EXPIRED:
86 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
87 case -NFS4ERR_STALE_CLIENTID:
88 nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client);
89 goto out_err;
90 }
91 } 89 }
92 return 0; 90 unlock_kernel();
93out_err: 91out:
94 return status; 92 return status;
95} 93}
96 94
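The rewritten loop is a lock-juggling pattern: the BKL covers the i_flock list walk but must be dropped across nfs4_lock_delegation_recall(), which can sleep, then re-taken to continue the walk. The same shape in user-space terms, with a pthread mutex standing in for the BKL and a stub for the blocking call:

    #include <pthread.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    struct item { struct item *next; int wants_work; };

    /* Stands in for the RPC-issuing call; it may sleep, so the lock
     * must not be held across it. */
    static int blocking_work(struct item *it) { (void)it; return 0; }

    static int process_all(struct item *head)
    {
        struct item *it;
        int status = 0;

        pthread_mutex_lock(&list_lock);
        for (it = head; it != NULL; it = it->next) {
            if (!it->wants_work)
                continue;
            pthread_mutex_unlock(&list_lock);
            status = blocking_work(it);
            if (status < 0)
                return status;  /* error path exits with the lock dropped */
            /* like the kernel loop, this trusts 'it' to stay linked
             * while the lock was released */
            pthread_mutex_lock(&list_lock);
        }
        pthread_mutex_unlock(&list_lock);
        return status;
    }

    int main(void)
    {
        struct item b = { .next = 0, .wants_work = 1 };
        struct item a = { .next = &b, .wants_work = 0 };

        return process_all(&a);
    }
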
@@ -268,7 +266,10 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
268 struct nfs_inode *nfsi = NFS_I(inode); 266 struct nfs_inode *nfsi = NFS_I(inode);
269 267
270 nfs_msync_inode(inode); 268 nfs_msync_inode(inode);
271 /* Guard against new delegated open calls */ 269 /*
270 * Guard against new delegated open/lock/unlock calls and against
271 * state recovery
272 */
272 down_write(&nfsi->rwsem); 273 down_write(&nfsi->rwsem);
273 nfs_delegation_claim_opens(inode, &delegation->stateid); 274 nfs_delegation_claim_opens(inode, &delegation->stateid);
274 up_write(&nfsi->rwsem); 275 up_write(&nfsi->rwsem);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 89f98e9a024b..32062c33c859 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -29,7 +29,6 @@
29#include <linux/nfs_fs.h> 29#include <linux/nfs_fs.h>
30#include <linux/nfs_mount.h> 30#include <linux/nfs_mount.h>
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/smp_lock.h>
33#include <linux/pagevec.h> 32#include <linux/pagevec.h>
34#include <linux/namei.h> 33#include <linux/namei.h>
35#include <linux/mount.h> 34#include <linux/mount.h>
@@ -1026,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1026 res = NULL; 1025 res = NULL;
1027 goto out; 1026 goto out;
1028 /* This turned out not to be a regular file */ 1027 /* This turned out not to be a regular file */
1029 case -EISDIR:
1030 case -ENOTDIR: 1028 case -ENOTDIR:
1031 goto no_open; 1029 goto no_open;
1032 case -ELOOP: 1030 case -ELOOP:
1033 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1031 if (!(nd->intent.open.flags & O_NOFOLLOW))
1034 goto no_open; 1032 goto no_open;
1033 /* case -EISDIR: */
1035 /* case -EINVAL: */ 1034 /* case -EINVAL: */
1036 default: 1035 default:
1037 goto out; 1036 goto out;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 08f6b040d289..e4e089a8f294 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,10 +255,13 @@ static void nfs_direct_read_release(void *calldata)
255 255
256 if (put_dreq(dreq)) 256 if (put_dreq(dreq))
257 nfs_direct_complete(dreq); 257 nfs_direct_complete(dreq);
258 nfs_readdata_release(calldata); 258 nfs_readdata_free(data);
259} 259}
260 260
261static const struct rpc_call_ops nfs_read_direct_ops = { 261static const struct rpc_call_ops nfs_read_direct_ops = {
262#if defined(CONFIG_NFS_V4_1)
263 .rpc_call_prepare = nfs_read_prepare,
264#endif /* CONFIG_NFS_V4_1 */
262 .rpc_call_done = nfs_direct_read_result, 265 .rpc_call_done = nfs_direct_read_result,
263 .rpc_release = nfs_direct_read_release, 266 .rpc_release = nfs_direct_read_release,
264}; 267};
@@ -311,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
311 data->npages, 1, 0, data->pagevec, NULL); 314 data->npages, 1, 0, data->pagevec, NULL);
312 up_read(&current->mm->mmap_sem); 315 up_read(&current->mm->mmap_sem);
313 if (result < 0) { 316 if (result < 0) {
314 nfs_readdata_release(data); 317 nfs_readdata_free(data);
315 break; 318 break;
316 } 319 }
317 if ((unsigned)result < data->npages) { 320 if ((unsigned)result < data->npages) {
318 bytes = result * PAGE_SIZE; 321 bytes = result * PAGE_SIZE;
319 if (bytes <= pgbase) { 322 if (bytes <= pgbase) {
320 nfs_direct_release_pages(data->pagevec, result); 323 nfs_direct_release_pages(data->pagevec, result);
321 nfs_readdata_release(data); 324 nfs_readdata_free(data);
322 break; 325 break;
323 } 326 }
324 bytes -= pgbase; 327 bytes -= pgbase;
@@ -331,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
331 data->inode = inode; 334 data->inode = inode;
332 data->cred = msg.rpc_cred; 335 data->cred = msg.rpc_cred;
333 data->args.fh = NFS_FH(inode); 336 data->args.fh = NFS_FH(inode);
334 data->args.context = get_nfs_open_context(ctx); 337 data->args.context = ctx;
335 data->args.offset = pos; 338 data->args.offset = pos;
336 data->args.pgbase = pgbase; 339 data->args.pgbase = pgbase;
337 data->args.pages = data->pagevec; 340 data->args.pages = data->pagevec;
@@ -438,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
438 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); 441 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
439 list_del(&data->pages); 442 list_del(&data->pages);
440 nfs_direct_release_pages(data->pagevec, data->npages); 443 nfs_direct_release_pages(data->pagevec, data->npages);
441 nfs_writedata_release(data); 444 nfs_writedata_free(data);
442 } 445 }
443} 446}
444 447
@@ -531,10 +534,13 @@ static void nfs_direct_commit_release(void *calldata)
531 534
532 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); 535 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
533 nfs_direct_write_complete(dreq, data->inode); 536 nfs_direct_write_complete(dreq, data->inode);
534 nfs_commitdata_release(calldata); 537 nfs_commit_free(data);
535} 538}
536 539
537static const struct rpc_call_ops nfs_commit_direct_ops = { 540static const struct rpc_call_ops nfs_commit_direct_ops = {
541#if defined(CONFIG_NFS_V4_1)
542 .rpc_call_prepare = nfs_write_prepare,
543#endif /* CONFIG_NFS_V4_1 */
538 .rpc_call_done = nfs_direct_commit_result, 544 .rpc_call_done = nfs_direct_commit_result,
539 .rpc_release = nfs_direct_commit_release, 545 .rpc_release = nfs_direct_commit_release,
540}; 546};
@@ -564,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
564 data->args.fh = NFS_FH(data->inode); 570 data->args.fh = NFS_FH(data->inode);
565 data->args.offset = 0; 571 data->args.offset = 0;
566 data->args.count = 0; 572 data->args.count = 0;
567 data->args.context = get_nfs_open_context(dreq->ctx); 573 data->args.context = dreq->ctx;
568 data->res.count = 0; 574 data->res.count = 0;
569 data->res.fattr = &data->fattr; 575 data->res.fattr = &data->fattr;
570 data->res.verf = &data->verf; 576 data->res.verf = &data->verf;
@@ -673,6 +679,9 @@ out_unlock:
673} 679}
674 680
675static const struct rpc_call_ops nfs_write_direct_ops = { 681static const struct rpc_call_ops nfs_write_direct_ops = {
682#if defined(CONFIG_NFS_V4_1)
683 .rpc_call_prepare = nfs_write_prepare,
684#endif /* CONFIG_NFS_V4_1 */
676 .rpc_call_done = nfs_direct_write_result, 685 .rpc_call_done = nfs_direct_write_result,
677 .rpc_release = nfs_direct_write_release, 686 .rpc_release = nfs_direct_write_release,
678}; 687};
@@ -725,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
725 data->npages, 0, 0, data->pagevec, NULL); 734 data->npages, 0, 0, data->pagevec, NULL);
726 up_read(&current->mm->mmap_sem); 735 up_read(&current->mm->mmap_sem);
727 if (result < 0) { 736 if (result < 0) {
728 nfs_writedata_release(data); 737 nfs_writedata_free(data);
729 break; 738 break;
730 } 739 }
731 if ((unsigned)result < data->npages) { 740 if ((unsigned)result < data->npages) {
732 bytes = result * PAGE_SIZE; 741 bytes = result * PAGE_SIZE;
733 if (bytes <= pgbase) { 742 if (bytes <= pgbase) {
734 nfs_direct_release_pages(data->pagevec, result); 743 nfs_direct_release_pages(data->pagevec, result);
735 nfs_writedata_release(data); 744 nfs_writedata_free(data);
736 break; 745 break;
737 } 746 }
738 bytes -= pgbase; 747 bytes -= pgbase;
@@ -747,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
747 data->inode = inode; 756 data->inode = inode;
748 data->cred = msg.rpc_cred; 757 data->cred = msg.rpc_cred;
749 data->args.fh = NFS_FH(inode); 758 data->args.fh = NFS_FH(inode);
750 data->args.context = get_nfs_open_context(ctx); 759 data->args.context = ctx;
751 data->args.offset = pos; 760 data->args.offset = pos;
752 data->args.pgbase = pgbase; 761 data->args.pgbase = pgbase;
753 data->args.pages = data->pagevec; 762 data->args.pages = data->pagevec;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ec7e27d00bc6..05062329b678 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/smp_lock.h>
30#include <linux/aio.h> 29#include <linux/aio.h>
31 30
32#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -48,6 +47,9 @@ static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
48 size_t count, unsigned int flags); 47 size_t count, unsigned int flags);
49static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, 48static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
50 unsigned long nr_segs, loff_t pos); 49 unsigned long nr_segs, loff_t pos);
50static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
51 struct file *filp, loff_t *ppos,
52 size_t count, unsigned int flags);
51static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
53static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
@@ -73,6 +75,7 @@ const struct file_operations nfs_file_operations = {
73 .lock = nfs_lock, 75 .lock = nfs_lock,
74 .flock = nfs_flock, 76 .flock = nfs_flock,
75 .splice_read = nfs_file_splice_read, 77 .splice_read = nfs_file_splice_read,
78 .splice_write = nfs_file_splice_write,
76 .check_flags = nfs_check_flags, 79 .check_flags = nfs_check_flags,
77 .setlease = nfs_setlease, 80 .setlease = nfs_setlease,
78}; 81};
@@ -587,12 +590,38 @@ out_swapfile:
587 goto out; 590 goto out;
588} 591}
589 592
593static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
594 struct file *filp, loff_t *ppos,
595 size_t count, unsigned int flags)
596{
597 struct dentry *dentry = filp->f_path.dentry;
598 struct inode *inode = dentry->d_inode;
599 ssize_t ret;
600
601 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name,
603 (unsigned long) count, (unsigned long long) *ppos);
604
605 /*
606 * The combination of splice and an O_APPEND destination is disallowed.
607 */
608
609 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
610
611 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
612 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
613 int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
614 if (err < 0)
615 ret = err;
616 }
617 return ret;
618}
619
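The splice path mirrors nfs_file_write(): let the generic helper move the data, then push it to the server when the open demands synchronous semantics. The user-space analogue of that tail, keyed off O_SYNC only (the kernel check in nfs_need_sync_write() covers more cases):

    #include <fcntl.h>
    #include <unistd.h>

    /* Write, then flush when the descriptor was opened O_SYNC -- the same
     * shape as the splice_write tail above, minus the NFS bookkeeping. */
    static ssize_t write_maybe_sync(int fd, const void *buf, size_t len)
    {
        ssize_t ret = write(fd, buf, len);

        if (ret >= 0 && (fcntl(fd, F_GETFL) & O_SYNC)) {
            if (fsync(fd) < 0)
                return -1;
        }
        return ret;
    }

    int main(void)
    {
        return write_maybe_sync(1, "hi\n", 3) == 3 ? 0 : 1;
    }
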
590static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 620static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
591{ 621{
592 struct inode *inode = filp->f_mapping->host; 622 struct inode *inode = filp->f_mapping->host;
593 int status = 0; 623 int status = 0;
594 624
595 lock_kernel();
596 /* Try local locking first */ 625 /* Try local locking first */
597 posix_test_lock(filp, fl); 626 posix_test_lock(filp, fl);
598 if (fl->fl_type != F_UNLCK) { 627 if (fl->fl_type != F_UNLCK) {
@@ -608,7 +637,6 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
608 637
609 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 638 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
610out: 639out:
611 unlock_kernel();
612 return status; 640 return status;
613out_noconflict: 641out_noconflict:
614 fl->fl_type = F_UNLCK; 642 fl->fl_type = F_UNLCK;
@@ -650,13 +678,11 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
650 * If we're signalled while cleaning up locks on process exit, we 678 * If we're signalled while cleaning up locks on process exit, we
651 * still need to complete the unlock. 679 * still need to complete the unlock.
652 */ 680 */
653 lock_kernel();
654 /* Use local locking if mounted with "-onolock" */ 681 /* Use local locking if mounted with "-onolock" */
655 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 682 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
656 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 683 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
657 else 684 else
658 status = do_vfs_lock(filp, fl); 685 status = do_vfs_lock(filp, fl);
659 unlock_kernel();
660 return status; 686 return status;
661} 687}
662 688
@@ -673,13 +699,11 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
673 if (status != 0) 699 if (status != 0)
674 goto out; 700 goto out;
675 701
676 lock_kernel();
677 /* Use local locking if mounted with "-onolock" */ 702 /* Use local locking if mounted with "-onolock" */
678 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 703 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
679 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 704 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
680 else 705 else
681 status = do_vfs_lock(filp, fl); 706 status = do_vfs_lock(filp, fl);
682 unlock_kernel();
683 if (status < 0) 707 if (status < 0)
684 goto out; 708 goto out;
685 /* 709 /*
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 46177cb87064..b35d2a616066 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -30,7 +30,6 @@
30#include <linux/nfs_idmap.h> 30#include <linux/nfs_idmap.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/namei.h> 32#include <linux/namei.h>
33#include <linux/mnt_namespace.h>
34#include <linux/security.h> 33#include <linux/security.h>
35 34
36#include <asm/system.h> 35#include <asm/system.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 64f87194d390..bd7938eda6a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -30,7 +30,6 @@
30#include <linux/nfs_mount.h> 30#include <linux/nfs_mount.h>
31#include <linux/nfs4_mount.h> 31#include <linux/nfs4_mount.h>
32#include <linux/lockd/bind.h> 32#include <linux/lockd/bind.h>
33#include <linux/smp_lock.h>
34#include <linux/seq_file.h> 33#include <linux/seq_file.h>
35#include <linux/mount.h> 34#include <linux/mount.h>
36#include <linux/nfs_idmap.h> 35#include <linux/nfs_idmap.h>
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e4d6a8348adf..7dd90a6769d0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -2,6 +2,7 @@
2 * NFS internal definitions 2 * NFS internal definitions
3 */ 3 */
4 4
5#include "nfs4_fs.h"
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/security.h> 7#include <linux/security.h>
7 8
@@ -17,6 +18,18 @@ struct nfs_string;
17 */ 18 */
18#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) 19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
19 20
21/*
22 * Determine if sessions are in use.
23 */
24static inline int nfs4_has_session(const struct nfs_client *clp)
25{
26#ifdef CONFIG_NFS_V4_1
27 if (clp->cl_session)
28 return 1;
29#endif /* CONFIG_NFS_V4_1 */
30 return 0;
31}
32
20struct nfs_clone_mount { 33struct nfs_clone_mount {
21 const struct super_block *sb; 34 const struct super_block *sb;
22 const struct dentry *dentry; 35 const struct dentry *dentry;
@@ -30,6 +43,12 @@ struct nfs_clone_mount {
30}; 43};
31 44
32/* 45/*
46 * Note: RFC 1813 doesn't limit the number of auth flavors that
47 * a server can return, so make something up.
48 */
49#define NFS_MAX_SECFLAVORS (12)
50
51/*
33 * In-kernel mount arguments 52 * In-kernel mount arguments
34 */ 53 */
35struct nfs_parsed_mount_data { 54struct nfs_parsed_mount_data {
@@ -44,6 +63,7 @@ struct nfs_parsed_mount_data {
44 unsigned int auth_flavor_len; 63 unsigned int auth_flavor_len;
45 rpc_authflavor_t auth_flavors[1]; 64 rpc_authflavor_t auth_flavors[1];
46 char *client_address; 65 char *client_address;
66 unsigned int minorversion;
47 char *fscache_uniq; 67 char *fscache_uniq;
48 68
49 struct { 69 struct {
@@ -77,6 +97,8 @@ struct nfs_mount_request {
77 unsigned short protocol; 97 unsigned short protocol;
78 struct nfs_fh *fh; 98 struct nfs_fh *fh;
79 int noresvport; 99 int noresvport;
100 unsigned int *auth_flav_len;
101 rpc_authflavor_t *auth_flavs;
80}; 102};
81 103
82extern int nfs_mount(struct nfs_mount_request *info); 104extern int nfs_mount(struct nfs_mount_request *info);
@@ -99,6 +121,8 @@ extern void nfs_free_server(struct nfs_server *server);
99extern struct nfs_server *nfs_clone_server(struct nfs_server *, 121extern struct nfs_server *nfs_clone_server(struct nfs_server *,
100 struct nfs_fh *, 122 struct nfs_fh *,
101 struct nfs_fattr *); 123 struct nfs_fattr *);
124extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
125extern int nfs4_check_client_ready(struct nfs_client *clp);
102#ifdef CONFIG_PROC_FS 126#ifdef CONFIG_PROC_FS
103extern int __init nfs_fs_proc_init(void); 127extern int __init nfs_fs_proc_init(void);
104extern void nfs_fs_proc_exit(void); 128extern void nfs_fs_proc_exit(void);
@@ -146,6 +170,20 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
146extern struct rpc_procinfo nfs3_procedures[]; 170extern struct rpc_procinfo nfs3_procedures[];
147extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 171extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
148 172
173/* nfs4proc.c */
174static inline void nfs4_restart_rpc(struct rpc_task *task,
175 const struct nfs_client *clp)
176{
177#ifdef CONFIG_NFS_V4_1
178 if (nfs4_has_session(clp) &&
179 test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
180 rpc_restart_call_prepare(task);
181 return;
182 }
183#endif /* CONFIG_NFS_V4_1 */
184 rpc_restart_call(task);
185}
186
149/* nfs4xdr.c */ 187/* nfs4xdr.c */
150#ifdef CONFIG_NFS_V4 188#ifdef CONFIG_NFS_V4
151extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 189extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
@@ -205,6 +243,38 @@ extern int nfs4_path_walk(struct nfs_server *server,
205 const char *path); 243 const char *path);
206#endif 244#endif
207 245
246/* read.c */
247extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
248
249/* write.c */
250extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
251
252/* nfs4proc.c */
253extern int _nfs4_call_sync(struct nfs_server *server,
254 struct rpc_message *msg,
255 struct nfs4_sequence_args *args,
256 struct nfs4_sequence_res *res,
257 int cache_reply);
258extern int _nfs4_call_sync_session(struct nfs_server *server,
259 struct rpc_message *msg,
260 struct nfs4_sequence_args *args,
261 struct nfs4_sequence_res *res,
262 int cache_reply);
263
264#ifdef CONFIG_NFS_V4_1
265extern void nfs41_sequence_free_slot(const struct nfs_client *,
266 struct nfs4_sequence_res *res);
267#endif /* CONFIG_NFS_V4_1 */
268
269static inline void nfs4_sequence_free_slot(const struct nfs_client *clp,
270 struct nfs4_sequence_res *res)
271{
272#ifdef CONFIG_NFS_V4_1
273 if (nfs4_has_session(clp))
274 nfs41_sequence_free_slot(clp, res);
275#endif /* CONFIG_NFS_V4_1 */
276}
277
208/* 278/*
209 * Determine the device name as a string 279 * Determine the device name as a string
210 */ 280 */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a2ab2529b5ca..ceda50aad73c 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server,
31 cpu = get_cpu(); 31 cpu = get_cpu();
32 iostats = per_cpu_ptr(server->io_stats, cpu); 32 iostats = per_cpu_ptr(server->io_stats, cpu);
33 iostats->events[stat]++; 33 iostats->events[stat]++;
34 put_cpu_no_resched(); 34 put_cpu();
35} 35}
36 36
37static inline void nfs_inc_stats(const struct inode *inode, 37static inline void nfs_inc_stats(const struct inode *inode,
@@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
50 cpu = get_cpu(); 50 cpu = get_cpu();
51 iostats = per_cpu_ptr(server->io_stats, cpu); 51 iostats = per_cpu_ptr(server->io_stats, cpu);
52 iostats->bytes[stat] += addend; 52 iostats->bytes[stat] += addend;
53 put_cpu_no_resched(); 53 put_cpu();
54} 54}
55 55
56static inline void nfs_add_stats(const struct inode *inode, 56static inline void nfs_add_stats(const struct inode *inode,
@@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
71 cpu = get_cpu(); 71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); 72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend; 73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched(); 74 put_cpu();
75} 75}
76#endif 76#endif
77 77
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index ca905a5bb1ba..38ef9eaec407 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -20,8 +20,116 @@
20# define NFSDBG_FACILITY NFSDBG_MOUNT 20# define NFSDBG_FACILITY NFSDBG_MOUNT
21#endif 21#endif
22 22
23/*
24 * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
25 */
26#define MNTPATHLEN (1024)
27
28/*
29 * XDR data type sizes
30 */
31#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
32#define MNT_status_sz (1)
33#define MNT_fhs_status_sz (1)
34#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
35#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE))
36#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
37
38/*
39 * XDR argument and result sizes
40 */
41#define MNT_enc_dirpath_sz encode_dirpath_sz
42#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz)
43#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \
44 MNT_authflav3_sz)
45
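These macros count 4-byte XDR words: the dirpath is one length word plus up to 256 words of data, and an NFSv2 handle is a fixed 32 bytes, i.e. 8 words. The arithmetic spelled out (byte totals derived from the definitions above):

    #include <stdio.h>

    #define XDR_QUADLEN(n)  (((n) + 3) >> 2)   /* bytes -> 4-byte words */
    #define MNTPATHLEN      1024
    #define NFS2_FHSIZE     32

    int main(void)
    {
        int dirpath  = 1 + XDR_QUADLEN(MNTPATHLEN);  /* length word + 256 = 257 */
        int mountres = 1 + XDR_QUADLEN(NFS2_FHSIZE); /* status word + 8 = 9 */

        printf("MNT_enc_dirpath_sz  = %d words (%d bytes)\n", dirpath, dirpath * 4);
        printf("MNT_dec_mountres_sz = %d words (%d bytes)\n", mountres, mountres * 4);
        return 0;
    }
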
46/*
47 * Defined by RFC 1094, section A.5
48 */
49enum {
50 MOUNTPROC_NULL = 0,
51 MOUNTPROC_MNT = 1,
52 MOUNTPROC_DUMP = 2,
53 MOUNTPROC_UMNT = 3,
54 MOUNTPROC_UMNTALL = 4,
55 MOUNTPROC_EXPORT = 5,
56};
57
58/*
59 * Defined by RFC 1813, section 5.2
60 */
61enum {
62 MOUNTPROC3_NULL = 0,
63 MOUNTPROC3_MNT = 1,
64 MOUNTPROC3_DUMP = 2,
65 MOUNTPROC3_UMNT = 3,
66 MOUNTPROC3_UMNTALL = 4,
67 MOUNTPROC3_EXPORT = 5,
68};
69
23static struct rpc_program mnt_program; 70static struct rpc_program mnt_program;
24 71
72/*
73 * Defined by OpenGroup XNFS Version 3W, chapter 8
74 */
75enum mountstat {
76 MNT_OK = 0,
77 MNT_EPERM = 1,
78 MNT_ENOENT = 2,
79 MNT_EACCES = 13,
80 MNT_EINVAL = 22,
81};
82
83static struct {
84 u32 status;
85 int errno;
86} mnt_errtbl[] = {
87 { .status = MNT_OK, .errno = 0, },
88 { .status = MNT_EPERM, .errno = -EPERM, },
89 { .status = MNT_ENOENT, .errno = -ENOENT, },
90 { .status = MNT_EACCES, .errno = -EACCES, },
91 { .status = MNT_EINVAL, .errno = -EINVAL, },
92};
93
94/*
95 * Defined by RFC 1813, section 5.1.5
96 */
97enum mountstat3 {
98 MNT3_OK = 0, /* no error */
99 MNT3ERR_PERM = 1, /* Not owner */
100 MNT3ERR_NOENT = 2, /* No such file or directory */
101 MNT3ERR_IO = 5, /* I/O error */
102 MNT3ERR_ACCES = 13, /* Permission denied */
103 MNT3ERR_NOTDIR = 20, /* Not a directory */
104 MNT3ERR_INVAL = 22, /* Invalid argument */
105 MNT3ERR_NAMETOOLONG = 63, /* Filename too long */
106 MNT3ERR_NOTSUPP = 10004, /* Operation not supported */
107 MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */
108};
109
110static struct {
111 u32 status;
112 int errno;
113} mnt3_errtbl[] = {
114 { .status = MNT3_OK, .errno = 0, },
115 { .status = MNT3ERR_PERM, .errno = -EPERM, },
116 { .status = MNT3ERR_NOENT, .errno = -ENOENT, },
117 { .status = MNT3ERR_IO, .errno = -EIO, },
118 { .status = MNT3ERR_ACCES, .errno = -EACCES, },
119 { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, },
120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
123 { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, },
124};
125
126struct mountres {
127 int errno;
128 struct nfs_fh *fh;
129 unsigned int *auth_count;
130 rpc_authflavor_t *auth_flavors;
131};
132
25struct mnt_fhstatus { 133struct mnt_fhstatus {
26 u32 status; 134 u32 status;
27 struct nfs_fh *fh; 135 struct nfs_fh *fh;
@@ -35,8 +143,10 @@ struct mnt_fhstatus {
35 */ 143 */
36int nfs_mount(struct nfs_mount_request *info) 144int nfs_mount(struct nfs_mount_request *info)
37{ 145{
38 struct mnt_fhstatus result = { 146 struct mountres result = {
39 .fh = info->fh 147 .fh = info->fh,
148 .auth_count = info->auth_flav_len,
149 .auth_flavors = info->auth_flavs,
40 }; 150 };
41 struct rpc_message msg = { 151 struct rpc_message msg = {
42 .rpc_argp = info->dirpath, 152 .rpc_argp = info->dirpath,
@@ -68,14 +178,14 @@ int nfs_mount(struct nfs_mount_request *info)
68 if (info->version == NFS_MNT3_VERSION) 178 if (info->version == NFS_MNT3_VERSION)
69 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; 179 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
70 else 180 else
71 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; 181 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
72 182
73 status = rpc_call_sync(mnt_clnt, &msg, 0); 183 status = rpc_call_sync(mnt_clnt, &msg, 0);
74 rpc_shutdown_client(mnt_clnt); 184 rpc_shutdown_client(mnt_clnt);
75 185
76 if (status < 0) 186 if (status < 0)
77 goto out_call_err; 187 goto out_call_err;
78 if (result.status != 0) 188 if (result.errno != 0)
79 goto out_mnt_err; 189 goto out_mnt_err;
80 190
81 dprintk("NFS: MNT request succeeded\n"); 191 dprintk("NFS: MNT request succeeded\n");
@@ -86,72 +196,215 @@ out:
86 196
87out_clnt_err: 197out_clnt_err:
88 status = PTR_ERR(mnt_clnt); 198 status = PTR_ERR(mnt_clnt);
89 dprintk("NFS: failed to create RPC client, status=%d\n", status); 199 dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
90 goto out; 200 goto out;
91 201
92out_call_err: 202out_call_err:
93 dprintk("NFS: failed to start MNT request, status=%d\n", status); 203 dprintk("NFS: MNT request failed, status=%d\n", status);
94 goto out; 204 goto out;
95 205
96out_mnt_err: 206out_mnt_err:
97 dprintk("NFS: MNT server returned result %d\n", result.status); 207 dprintk("NFS: MNT server returned result %d\n", result.errno);
98 status = nfs_stat_to_errno(result.status); 208 status = result.errno;
99 goto out; 209 goto out;
100} 210}
101 211
102/* 212/*
103 * XDR encode/decode functions for MOUNT 213 * XDR encode/decode functions for MOUNT
104 */ 214 */
105static int xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, 215
106 const char *path) 216static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
217{
218 const u32 pathname_len = strlen(pathname);
219 __be32 *p;
220
221 if (unlikely(pathname_len > MNTPATHLEN))
222 return -EIO;
223
224 p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
225 if (unlikely(p == NULL))
226 return -EIO;
227 xdr_encode_opaque(p, pathname, pathname_len);
228
229 return 0;
230}
231
232static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
233 const char *dirpath)
234{
235 struct xdr_stream xdr;
236
237 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
238 return encode_mntdirpath(&xdr, dirpath);
239}
240
241/*
242 * RFC 1094: "A non-zero status indicates some sort of error. In this
243 * case, the status is a UNIX error number." This can be problematic
244 * if the server and client use different errno values for the same
245 * error.
246 *
247 * However, the OpenGroup XNFS spec provides a simple mapping that is
248 * independent of local errno values on the server and the client.
249 */
250static int decode_status(struct xdr_stream *xdr, struct mountres *res)
107{ 251{
108 p = xdr_encode_string(p, path); 252 unsigned int i;
253 u32 status;
254 __be32 *p;
255
256 p = xdr_inline_decode(xdr, sizeof(status));
257 if (unlikely(p == NULL))
258 return -EIO;
259 status = ntohl(*p);
109 260
110 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 261 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
262 if (mnt_errtbl[i].status == status) {
263 res->errno = mnt_errtbl[i].errno;
264 return 0;
265 }
266 }
267
268 dprintk("NFS: unrecognized MNT status code: %u\n", status);
269 res->errno = -EACCES;
111 return 0; 270 return 0;
112} 271}
113 272
114static int xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, 273static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
115 struct mnt_fhstatus *res)
116{ 274{
117 struct nfs_fh *fh = res->fh; 275 struct nfs_fh *fh = res->fh;
276 __be32 *p;
277
278 p = xdr_inline_decode(xdr, NFS2_FHSIZE);
279 if (unlikely(p == NULL))
280 return -EIO;
281
282 fh->size = NFS2_FHSIZE;
283 memcpy(fh->data, p, NFS2_FHSIZE);
284 return 0;
285}
286
287static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
288 struct mountres *res)
289{
290 struct xdr_stream xdr;
291 int status;
292
293 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
294
295 status = decode_status(&xdr, res);
296 if (unlikely(status != 0 || res->errno != 0))
297 return status;
298 return decode_fhandle(&xdr, res);
299}
300
301static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
302{
303 unsigned int i;
304 u32 status;
305 __be32 *p;
118 306
119 if ((res->status = ntohl(*p++)) == 0) { 307 p = xdr_inline_decode(xdr, sizeof(status));
120 fh->size = NFS2_FHSIZE; 308 if (unlikely(p == NULL))
121 memcpy(fh->data, p, NFS2_FHSIZE); 309 return -EIO;
310 status = ntohl(*p);
311
 312 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
313 if (mnt3_errtbl[i].status == status) {
314 res->errno = mnt3_errtbl[i].errno;
315 return 0;
316 }
122 } 317 }
318
319 dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
320 res->errno = -EACCES;
123 return 0; 321 return 0;
124} 322}
125 323
126static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, 324static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
127 struct mnt_fhstatus *res)
128{ 325{
129 struct nfs_fh *fh = res->fh; 326 struct nfs_fh *fh = res->fh;
130 unsigned size; 327 u32 size;
131 328 __be32 *p;
132 if ((res->status = ntohl(*p++)) == 0) { 329
133 size = ntohl(*p++); 330 p = xdr_inline_decode(xdr, sizeof(size));
134 if (size <= NFS3_FHSIZE && size != 0) { 331 if (unlikely(p == NULL))
135 fh->size = size; 332 return -EIO;
136 memcpy(fh->data, p, size); 333
137 } else 334 size = ntohl(*p++);
138 res->status = -EBADHANDLE; 335 if (size > NFS3_FHSIZE || size == 0)
336 return -EIO;
337
338 p = xdr_inline_decode(xdr, size);
339 if (unlikely(p == NULL))
340 return -EIO;
341
342 fh->size = size;
343 memcpy(fh->data, p, size);
344 return 0;
345}
346
347static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
348{
349 rpc_authflavor_t *flavors = res->auth_flavors;
350 unsigned int *count = res->auth_count;
351 u32 entries, i;
352 __be32 *p;
353
354 if (*count == 0)
355 return 0;
356
357 p = xdr_inline_decode(xdr, sizeof(entries));
358 if (unlikely(p == NULL))
359 return -EIO;
360 entries = ntohl(*p);
361 dprintk("NFS: received %u auth flavors\n", entries);
362 if (entries > NFS_MAX_SECFLAVORS)
363 entries = NFS_MAX_SECFLAVORS;
364
365 p = xdr_inline_decode(xdr, sizeof(u32) * entries);
366 if (unlikely(p == NULL))
367 return -EIO;
368
369 if (entries > *count)
370 entries = *count;
371
372 for (i = 0; i < entries; i++) {
373 flavors[i] = ntohl(*p++);
374 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
139 } 375 }
376 *count = i;
377
140 return 0; 378 return 0;
141} 379}
142 380
143#define MNT_dirpath_sz (1 + 256) 381static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
144#define MNT_fhstatus_sz (1 + 8) 382 struct mountres *res)
145#define MNT_fhstatus3_sz (1 + 16) 383{
384 struct xdr_stream xdr;
385 int status;
386
387 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
388
389 status = decode_fhs_status(&xdr, res);
390 if (unlikely(status != 0 || res->errno != 0))
391 return status;
392 status = decode_fhandle3(&xdr, res);
393 if (unlikely(status != 0)) {
394 res->errno = -EBADHANDLE;
395 return 0;
396 }
397 return decode_auth_flavors(&xdr, res);
398}
146 399
147static struct rpc_procinfo mnt_procedures[] = { 400static struct rpc_procinfo mnt_procedures[] = {
148 [MNTPROC_MNT] = { 401 [MOUNTPROC_MNT] = {
149 .p_proc = MNTPROC_MNT, 402 .p_proc = MOUNTPROC_MNT,
150 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 403 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
151 .p_decode = (kxdrproc_t) xdr_decode_fhstatus, 404 .p_decode = (kxdrproc_t)mnt_dec_mountres,
152 .p_arglen = MNT_dirpath_sz, 405 .p_arglen = MNT_enc_dirpath_sz,
153 .p_replen = MNT_fhstatus_sz, 406 .p_replen = MNT_dec_mountres_sz,
154 .p_statidx = MNTPROC_MNT, 407 .p_statidx = MOUNTPROC_MNT,
155 .p_name = "MOUNT", 408 .p_name = "MOUNT",
156 }, 409 },
157}; 410};
@@ -159,10 +412,10 @@ static struct rpc_procinfo mnt_procedures[] = {
159static struct rpc_procinfo mnt3_procedures[] = { 412static struct rpc_procinfo mnt3_procedures[] = {
160 [MOUNTPROC3_MNT] = { 413 [MOUNTPROC3_MNT] = {
161 .p_proc = MOUNTPROC3_MNT, 414 .p_proc = MOUNTPROC3_MNT,
162 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 415 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
163 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, 416 .p_decode = (kxdrproc_t)mnt_dec_mountres3,
164 .p_arglen = MNT_dirpath_sz, 417 .p_arglen = MNT_enc_dirpath_sz,
165 .p_replen = MNT_fhstatus3_sz, 418 .p_replen = MNT_dec_mountres3_sz,
166 .p_statidx = MOUNTPROC3_MNT, 419 .p_statidx = MOUNTPROC3_MNT,
167 .p_name = "MOUNT", 420 .p_name = "MOUNT",
168 }, 421 },
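
For context, the mnt_errtbl/mnt3_errtbl tables consulted by decode_status() and decode_fhs_status() above are defined earlier in fs/nfs/mount_clnt.c and are not part of this hunk. Their shape can be inferred from the lookups (mnt_errtbl[i].status, mnt_errtbl[i].errno); a minimal sketch, with hypothetical entries:

	/*
	 * Sketch only: field names inferred from the lookups above;
	 * the entry values shown here are illustrative, not the
	 * actual table contents.
	 */
	static struct {
		u32 status;	/* status code on the wire */
		int errno;	/* local errno to map it to */
	} mnt_errtbl[] = {
		{ .status = 0,	.errno = 0,		},	/* MNT_OK */
		{ .status = 13,	.errno = -EACCES,	},	/* access denied */
		/* ... one entry per XNFS-defined status value ... */
	};

This is what makes the decode independent of server-side errno values, as the RFC 1094 comment above explains.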
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 64a288ee046d..40c766782891 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -65,6 +65,11 @@ char *nfs_path(const char *base,
65 dentry = dentry->d_parent; 65 dentry = dentry->d_parent;
66 } 66 }
67 spin_unlock(&dcache_lock); 67 spin_unlock(&dcache_lock);
68 if (*end != '/') {
69 if (--buflen < 0)
70 goto Elong;
71 *--end = '/';
72 }
68 namelen = strlen(base); 73 namelen = strlen(base);
69 /* Strip off excess slashes in base string */ 74 /* Strip off excess slashes in base string */
70 while (namelen > 0 && base[namelen - 1] == '/') 75 while (namelen > 0 && base[namelen - 1] == '/')
@@ -154,7 +159,7 @@ out_err:
154 goto out; 159 goto out;
155out_follow: 160out_follow:
156 while (d_mountpoint(nd->path.dentry) && 161 while (d_mountpoint(nd->path.dentry) &&
157 follow_down(&nd->path.mnt, &nd->path.dentry)) 162 follow_down(&nd->path))
158 ; 163 ;
159 err = 0; 164 err = 0;
160 goto out; 165 goto out;
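
The nfs_path() change above guarantees a '/' separator between the export base and the dentry-derived tail. nfs_path() builds its result backwards from the end of a caller-supplied buffer; a standalone sketch of that technique follows (a hypothetical helper, not the kernel function, and it assumes buflen is already known to be large enough where the kernel version jumps to Elong):

	/* Sketch: join base and leaf backwards into the tail of buf. */
	static char *join_path_tail(char *buf, size_t buflen,
				    const char *base, const char *leaf)
	{
		char *end = buf + buflen;
		size_t n = strlen(leaf);

		*--end = '\0';
		end -= n;
		memcpy(end, leaf, n);
		if (*end != '/')	/* the guard this patch adds */
			*--end = '/';
		n = strlen(base);
		while (n > 0 && base[n - 1] == '/')	/* strip excess slashes */
			n--;
		end -= n;
		memcpy(end, base, n);
		return end;	/* points into buf, not at buf[0] */
	}

Building backwards means the returned pointer is somewhere inside the buffer, which is why the real function returns end rather than buffer.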
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 6bbf0e6daad2..bac60515a4b3 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -207,8 +207,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
207 status = nfs_revalidate_inode(server, inode); 207 status = nfs_revalidate_inode(server, inode);
208 if (status < 0) 208 if (status < 0)
209 return ERR_PTR(status); 209 return ERR_PTR(status);
210 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
211 nfs_zap_acl_cache(inode);
212 acl = nfs3_get_cached_acl(inode, type); 210 acl = nfs3_get_cached_acl(inode, type);
213 if (acl != ERR_PTR(-EAGAIN)) 211 if (acl != ERR_PTR(-EAGAIN))
214 return acl; 212 return acl;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 84345deab26f..6ea07a3c75d4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_SETUP,
47}; 48};
48 49
49/* 50/*
@@ -177,6 +178,14 @@ struct nfs4_state_recovery_ops {
177 int state_flag_bit; 178 int state_flag_bit;
178 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); 179 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
179 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 180 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
181 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
182 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
183};
184
185struct nfs4_state_maintenance_ops {
186 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *);
187 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
188 int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
180}; 189};
181 190
182extern const struct dentry_operations nfs4_dentry_operations; 191extern const struct dentry_operations nfs4_dentry_operations;
@@ -193,6 +202,7 @@ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struc
193extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 202extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
194extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 203extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
195extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 204extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
205extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
196extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 206extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
197extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 207extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
198extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 208extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -200,8 +210,32 @@ extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fh
200extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 210extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
201 struct nfs4_fs_locations *fs_locations, struct page *page); 211 struct nfs4_fs_locations *fs_locations, struct page *page);
202 212
203extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 213extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
204extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops; 214extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
215#if defined(CONFIG_NFS_V4_1)
216extern int nfs4_setup_sequence(struct nfs_client *clp,
217 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
218 int cache_reply, struct rpc_task *task);
219extern void nfs4_destroy_session(struct nfs4_session *session);
220extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
221extern int nfs4_proc_create_session(struct nfs_client *, int reset);
222extern int nfs4_proc_destroy_session(struct nfs4_session *);
223extern int nfs4_init_session(struct nfs_server *server);
 224#else /* CONFIG_NFS_V4_1 */
225static inline int nfs4_setup_sequence(struct nfs_client *clp,
226 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
227 int cache_reply, struct rpc_task *task)
228{
229 return 0;
230}
231
232static inline int nfs4_init_session(struct nfs_server *server)
233{
234 return 0;
235}
236#endif /* CONFIG_NFS_V4_1 */
237
238extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
205 239
206extern const u32 nfs4_fattr_bitmap[2]; 240extern const u32 nfs4_fattr_bitmap[2];
207extern const u32 nfs4_statfs_bitmap[2]; 241extern const u32 nfs4_statfs_bitmap[2];
@@ -216,7 +250,12 @@ extern void nfs4_kill_renewd(struct nfs_client *);
216extern void nfs4_renew_state(struct work_struct *); 250extern void nfs4_renew_state(struct work_struct *);
217 251
218/* nfs4state.c */ 252/* nfs4state.c */
253struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
219struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 254struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
255#if defined(CONFIG_NFS_V4_1)
256struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
257struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
258#endif /* CONFIG_NFS_V4_1 */
220 259
221extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 260extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
222extern void nfs4_put_state_owner(struct nfs4_state_owner *); 261extern void nfs4_put_state_owner(struct nfs4_state_owner *);
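
With this change nfs4_reboot_recovery_ops and nfs4_nograce_recovery_ops become arrays of pointers rather than single structs, one entry per minor version. The selection logic is outside this hunk, but consumers presumably index by the client's minor version, along these lines:

	/* Sketch: using cl_minorversion as the index is an assumption
	 * here; the actual lookup is not part of this hunk. */
	struct nfs4_state_recovery_ops *ops =
		nfs4_reboot_recovery_ops[clp->cl_minorversion];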
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4674f8092da8..6917311f201c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -45,14 +45,16 @@
45#include <linux/nfs4.h> 45#include <linux/nfs4.h>
46#include <linux/nfs_fs.h> 46#include <linux/nfs_fs.h>
47#include <linux/nfs_page.h> 47#include <linux/nfs_page.h>
48#include <linux/smp_lock.h>
49#include <linux/namei.h> 48#include <linux/namei.h>
50#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h>
51 52
52#include "nfs4_fs.h" 53#include "nfs4_fs.h"
53#include "delegation.h" 54#include "delegation.h"
54#include "internal.h" 55#include "internal.h"
55#include "iostat.h" 56#include "iostat.h"
57#include "callback.h"
56 58
57#define NFSDBG_FACILITY NFSDBG_PROC 59#define NFSDBG_FACILITY NFSDBG_PROC
58 60
@@ -247,7 +249,25 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
247 ret = nfs4_wait_clnt_recover(clp); 249 ret = nfs4_wait_clnt_recover(clp);
248 if (ret == 0) 250 if (ret == 0)
249 exception->retry = 1; 251 exception->retry = 1;
252#if !defined(CONFIG_NFS_V4_1)
250 break; 253 break;
254#else /* !defined(CONFIG_NFS_V4_1) */
255 if (!nfs4_has_session(server->nfs_client))
256 break;
257 /* FALLTHROUGH */
258 case -NFS4ERR_BADSESSION:
259 case -NFS4ERR_BADSLOT:
260 case -NFS4ERR_BAD_HIGH_SLOT:
261 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
262 case -NFS4ERR_DEADSESSION:
263 case -NFS4ERR_SEQ_FALSE_RETRY:
264 case -NFS4ERR_SEQ_MISORDERED:
265 dprintk("%s ERROR: %d Reset session\n", __func__,
266 errorcode);
267 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
268 exception->retry = 1;
269 /* FALLTHROUGH */
270#endif /* !defined(CONFIG_NFS_V4_1) */
251 case -NFS4ERR_FILE_OPEN: 271 case -NFS4ERR_FILE_OPEN:
252 case -NFS4ERR_GRACE: 272 case -NFS4ERR_GRACE:
253 case -NFS4ERR_DELAY: 273 case -NFS4ERR_DELAY:
@@ -271,6 +291,353 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
271 spin_unlock(&clp->cl_lock); 291 spin_unlock(&clp->cl_lock);
272} 292}
273 293
294#if defined(CONFIG_NFS_V4_1)
295
296/*
297 * nfs4_free_slot - free a slot and efficiently update slot table.
298 *
299 * freeing a slot is trivially done by clearing its respective bit
300 * in the bitmap.
301 * If the freed slotid equals highest_used_slotid we want to update it
302 * so that the server would be able to size down the slot table if needed,
303 * otherwise we know that the highest_used_slotid is still in use.
304 * When updating highest_used_slotid there may be "holes" in the bitmap
305 * so we need to scan down from highest_used_slotid to 0 looking for the now
306 * highest slotid in use.
307 * If none found, highest_used_slotid is set to -1.
308 */
309static void
310nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
311{
312 int slotid = free_slotid;
313
314 spin_lock(&tbl->slot_tbl_lock);
315 /* clear used bit in bitmap */
316 __clear_bit(slotid, tbl->used_slots);
317
318 /* update highest_used_slotid when it is freed */
319 if (slotid == tbl->highest_used_slotid) {
320 slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
321 if (slotid >= 0 && slotid < tbl->max_slots)
322 tbl->highest_used_slotid = slotid;
323 else
324 tbl->highest_used_slotid = -1;
325 }
326 rpc_wake_up_next(&tbl->slot_tbl_waitq);
327 spin_unlock(&tbl->slot_tbl_lock);
328 dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
329 free_slotid, tbl->highest_used_slotid);
330}
331
332void nfs41_sequence_free_slot(const struct nfs_client *clp,
333 struct nfs4_sequence_res *res)
334{
335 struct nfs4_slot_table *tbl;
336
337 if (!nfs4_has_session(clp)) {
338 dprintk("%s: No session\n", __func__);
339 return;
340 }
341 tbl = &clp->cl_session->fc_slot_table;
342 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
343 dprintk("%s: No slot\n", __func__);
344 /* just wake up the next guy waiting since
 345 * we may not have consumed a slot after all */
346 rpc_wake_up_next(&tbl->slot_tbl_waitq);
347 return;
348 }
349 nfs4_free_slot(tbl, res->sr_slotid);
350 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
351}
352
353static void nfs41_sequence_done(struct nfs_client *clp,
354 struct nfs4_sequence_res *res,
355 int rpc_status)
356{
357 unsigned long timestamp;
358 struct nfs4_slot_table *tbl;
359 struct nfs4_slot *slot;
360
361 /*
362 * sr_status remains 1 if an RPC level error occurred. The server
 363 * may or may not have processed the sequence operation.
364 * Proceed as if the server received and processed the sequence
365 * operation.
366 */
367 if (res->sr_status == 1)
368 res->sr_status = NFS_OK;
369
370 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
371 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
372 goto out;
373
374 tbl = &clp->cl_session->fc_slot_table;
375 slot = tbl->slots + res->sr_slotid;
376
377 if (res->sr_status == 0) {
378 /* Update the slot's sequence and clientid lease timer */
379 ++slot->seq_nr;
380 timestamp = res->sr_renewal_time;
381 spin_lock(&clp->cl_lock);
382 if (time_before(clp->cl_last_renewal, timestamp))
383 clp->cl_last_renewal = timestamp;
384 spin_unlock(&clp->cl_lock);
385 return;
386 }
387out:
388 /* The session may be reset by one of the error handlers. */
 389 dprintk("%s: Error %d, freeing the slot\n", __func__, res->sr_status);
390 nfs41_sequence_free_slot(clp, res);
391}
392
393/*
394 * nfs4_find_slot - efficiently look for a free slot
395 *
396 * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
397 * If found, we mark the slot as used, update the highest_used_slotid,
398 * and respectively set up the sequence operation args.
399 * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
400 *
 401 * Note: must be called while holding the slot_tbl_lock.
402 */
403static u8
404nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
405{
406 int slotid;
407 u8 ret_id = NFS4_MAX_SLOT_TABLE;
408 BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
409
410 dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
411 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
412 tbl->max_slots);
413 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
414 if (slotid >= tbl->max_slots)
415 goto out;
416 __set_bit(slotid, tbl->used_slots);
417 if (slotid > tbl->highest_used_slotid)
418 tbl->highest_used_slotid = slotid;
419 ret_id = slotid;
420out:
 421 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d\n",
422 __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
423 return ret_id;
424}
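
Taken together, nfs4_free_slot() and nfs4_find_slot() treat the slot table as a bitmap plus a high-water mark. A minimal sketch of the same allocate/free pair with the locking, wait queue, and per-slot sequence state stripped away (it assumes the kernel bitmap helpers used above):

	struct slot_tbl_sketch {
		unsigned long used[BITS_TO_LONGS(NFS4_MAX_SLOT_TABLE)];
		int max_slots;
		int highest_used;	/* -1 when no slot is in use */
	};

	static int sketch_alloc(struct slot_tbl_sketch *t)
	{
		int id = find_first_zero_bit(t->used, t->max_slots);

		if (id >= t->max_slots)
			return -1;	/* caller sleeps on the wait queue */
		__set_bit(id, t->used);
		if (id > t->highest_used)
			t->highest_used = id;
		return id;
	}

	static void sketch_free(struct slot_tbl_sketch *t, int id)
	{
		__clear_bit(id, t->used);
		if (id == t->highest_used) {
			/* find_last_bit() returns max_slots when empty */
			id = find_last_bit(t->used, t->max_slots);
			t->highest_used = id < t->max_slots ? id : -1;
		}
	}

Keeping highest_used accurate is what lets the server shrink the slot table, as the nfs4_free_slot() comment notes.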
425
426static int nfs4_recover_session(struct nfs4_session *session)
427{
428 struct nfs_client *clp = session->clp;
429 int ret;
430
431 for (;;) {
432 ret = nfs4_wait_clnt_recover(clp);
433 if (ret != 0)
434 return ret;
435 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
436 break;
437 nfs4_schedule_state_manager(clp);
438 }
439 return 0;
440}
441
442static int nfs41_setup_sequence(struct nfs4_session *session,
443 struct nfs4_sequence_args *args,
444 struct nfs4_sequence_res *res,
445 int cache_reply,
446 struct rpc_task *task)
447{
448 struct nfs4_slot *slot;
449 struct nfs4_slot_table *tbl;
450 int status = 0;
451 u8 slotid;
452
453 dprintk("--> %s\n", __func__);
454 /* slot already allocated? */
455 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
456 return 0;
457
458 memset(res, 0, sizeof(*res));
459 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
460 tbl = &session->fc_slot_table;
461
462 spin_lock(&tbl->slot_tbl_lock);
463 if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) {
464 if (tbl->highest_used_slotid != -1) {
465 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
466 spin_unlock(&tbl->slot_tbl_lock);
467 dprintk("<-- %s: Session reset: draining\n", __func__);
468 return -EAGAIN;
469 }
470
471 /* The slot table is empty; start the reset thread */
472 dprintk("%s Session Reset\n", __func__);
473 spin_unlock(&tbl->slot_tbl_lock);
474 status = nfs4_recover_session(session);
475 if (status)
476 return status;
477 spin_lock(&tbl->slot_tbl_lock);
478 }
479
480 slotid = nfs4_find_slot(tbl, task);
481 if (slotid == NFS4_MAX_SLOT_TABLE) {
482 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
483 spin_unlock(&tbl->slot_tbl_lock);
484 dprintk("<-- %s: no free slots\n", __func__);
485 return -EAGAIN;
486 }
487 spin_unlock(&tbl->slot_tbl_lock);
488
489 slot = tbl->slots + slotid;
490 args->sa_session = session;
491 args->sa_slotid = slotid;
492 args->sa_cache_this = cache_reply;
493
494 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
495
496 res->sr_session = session;
497 res->sr_slotid = slotid;
498 res->sr_renewal_time = jiffies;
499 /*
500 * sr_status is only set in decode_sequence, and so will remain
501 * set to 1 if an rpc level failure occurs.
502 */
503 res->sr_status = 1;
504 return 0;
505}
506
507int nfs4_setup_sequence(struct nfs_client *clp,
508 struct nfs4_sequence_args *args,
509 struct nfs4_sequence_res *res,
510 int cache_reply,
511 struct rpc_task *task)
512{
513 int ret = 0;
514
515 dprintk("--> %s clp %p session %p sr_slotid %d\n",
516 __func__, clp, clp->cl_session, res->sr_slotid);
517
518 if (!nfs4_has_session(clp))
519 goto out;
520 ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
521 task);
522 if (ret != -EAGAIN) {
523 /* terminate rpc task */
524 task->tk_status = ret;
525 task->tk_action = NULL;
526 }
527out:
528 dprintk("<-- %s status=%d\n", __func__, ret);
529 return ret;
530}
531
532struct nfs41_call_sync_data {
533 struct nfs_client *clp;
534 struct nfs4_sequence_args *seq_args;
535 struct nfs4_sequence_res *seq_res;
536 int cache_reply;
537};
538
539static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
540{
541 struct nfs41_call_sync_data *data = calldata;
542
543 dprintk("--> %s data->clp->cl_session %p\n", __func__,
544 data->clp->cl_session);
545 if (nfs4_setup_sequence(data->clp, data->seq_args,
546 data->seq_res, data->cache_reply, task))
547 return;
548 rpc_call_start(task);
549}
550
551static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
552{
553 struct nfs41_call_sync_data *data = calldata;
554
555 nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
556 nfs41_sequence_free_slot(data->clp, data->seq_res);
557}
558
559struct rpc_call_ops nfs41_call_sync_ops = {
560 .rpc_call_prepare = nfs41_call_sync_prepare,
561 .rpc_call_done = nfs41_call_sync_done,
562};
563
564static int nfs4_call_sync_sequence(struct nfs_client *clp,
565 struct rpc_clnt *clnt,
566 struct rpc_message *msg,
567 struct nfs4_sequence_args *args,
568 struct nfs4_sequence_res *res,
569 int cache_reply)
570{
571 int ret;
572 struct rpc_task *task;
573 struct nfs41_call_sync_data data = {
574 .clp = clp,
575 .seq_args = args,
576 .seq_res = res,
577 .cache_reply = cache_reply,
578 };
579 struct rpc_task_setup task_setup = {
580 .rpc_client = clnt,
581 .rpc_message = msg,
582 .callback_ops = &nfs41_call_sync_ops,
583 .callback_data = &data
584 };
585
586 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
587 task = rpc_run_task(&task_setup);
588 if (IS_ERR(task))
589 ret = PTR_ERR(task);
590 else {
591 ret = task->tk_status;
592 rpc_put_task(task);
593 }
594 return ret;
595}
596
597int _nfs4_call_sync_session(struct nfs_server *server,
598 struct rpc_message *msg,
599 struct nfs4_sequence_args *args,
600 struct nfs4_sequence_res *res,
601 int cache_reply)
602{
603 return nfs4_call_sync_sequence(server->nfs_client, server->client,
604 msg, args, res, cache_reply);
605}
606
607#endif /* CONFIG_NFS_V4_1 */
608
609int _nfs4_call_sync(struct nfs_server *server,
610 struct rpc_message *msg,
611 struct nfs4_sequence_args *args,
612 struct nfs4_sequence_res *res,
613 int cache_reply)
614{
615 args->sa_session = res->sr_session = NULL;
616 return rpc_call_sync(server->client, msg, 0);
617}
618
619#define nfs4_call_sync(server, msg, args, res, cache_reply) \
620 (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
621 &(res)->seq_res, (cache_reply))
622
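The nfs4_call_sync() macro above dispatches through a per-client function pointer, so every converted call site below picks up the right behaviour automatically: sessionless v4.0 clients get _nfs4_call_sync(), v4.1 clients get _nfs4_call_sync_session(). The wiring happens at client-initialisation time, outside this hunk; presumably something like:

	/* Sketch only: the actual assignment lives in the client
	 * setup code, not in this patch. */
	clp->cl_call_sync = nfs4_has_session(clp) ?
				_nfs4_call_sync_session : _nfs4_call_sync;
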
623static void nfs4_sequence_done(const struct nfs_server *server,
624 struct nfs4_sequence_res *res, int rpc_status)
625{
626#ifdef CONFIG_NFS_V4_1
627 if (nfs4_has_session(server->nfs_client))
628 nfs41_sequence_done(server->nfs_client, res, rpc_status);
629#endif /* CONFIG_NFS_V4_1 */
630}
631
632/* no restart, therefore free slot here */
633static void nfs4_sequence_done_free_slot(const struct nfs_server *server,
634 struct nfs4_sequence_res *res,
635 int rpc_status)
636{
637 nfs4_sequence_done(server, res, rpc_status);
638 nfs4_sequence_free_slot(server->nfs_client, res);
639}
640
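The split between nfs4_sequence_done() and nfs4_sequence_free_slot() lets an operation that may be restarted keep its slot across the retry; only call sites that cannot restart use the combined helper above. The pairing at a typical async completion, in sketch form (mirroring nfs4_close_done() and nfs4_locku_done() below):

	/* Sketch: typical rpc_call_done pairing */
	nfs4_sequence_done(server, &res->seq_res, task->tk_status);
	if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
		nfs4_restart_rpc(task, server->nfs_client);
		return;	/* keep the slot for the retry */
	}
	nfs4_sequence_free_slot(server->nfs_client, &res->seq_res);
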
274static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 641static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
275{ 642{
276 struct nfs_inode *nfsi = NFS_I(dir); 643 struct nfs_inode *nfsi = NFS_I(dir);
@@ -312,6 +679,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
312 p->o_res.server = p->o_arg.server; 679 p->o_res.server = p->o_arg.server;
313 nfs_fattr_init(&p->f_attr); 680 nfs_fattr_init(&p->f_attr);
314 nfs_fattr_init(&p->dir_attr); 681 nfs_fattr_init(&p->dir_attr);
682 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
315} 683}
316 684
317static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 685static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -804,16 +1172,30 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
804 err = _nfs4_open_delegation_recall(ctx, state, stateid); 1172 err = _nfs4_open_delegation_recall(ctx, state, stateid);
805 switch (err) { 1173 switch (err) {
806 case 0: 1174 case 0:
807 return err; 1175 case -ENOENT:
1176 case -ESTALE:
1177 goto out;
808 case -NFS4ERR_STALE_CLIENTID: 1178 case -NFS4ERR_STALE_CLIENTID:
809 case -NFS4ERR_STALE_STATEID: 1179 case -NFS4ERR_STALE_STATEID:
810 case -NFS4ERR_EXPIRED: 1180 case -NFS4ERR_EXPIRED:
811 /* Don't recall a delegation if it was lost */ 1181 /* Don't recall a delegation if it was lost */
812 nfs4_schedule_state_recovery(server->nfs_client); 1182 nfs4_schedule_state_recovery(server->nfs_client);
813 return err; 1183 goto out;
1184 case -ERESTARTSYS:
1185 /*
1186 * The show must go on: exit, but mark the
1187 * stateid as needing recovery.
1188 */
1189 case -NFS4ERR_ADMIN_REVOKED:
1190 case -NFS4ERR_BAD_STATEID:
1191 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1192 case -ENOMEM:
1193 err = 0;
1194 goto out;
814 } 1195 }
815 err = nfs4_handle_exception(server, err, &exception); 1196 err = nfs4_handle_exception(server, err, &exception);
816 } while (exception.retry); 1197 } while (exception.retry);
1198out:
817 return err; 1199 return err;
818} 1200}
819 1201
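
nfs4_open_delegation_recall() above follows the standard retry idiom used throughout nfs4proc.c: terminal errors jump to out, everything else funnels through nfs4_handle_exception(), which may sleep, trigger recovery, and set the retry flag. In sketch form (the helper name is hypothetical):

	/* Sketch of the retry idiom wrapped around every _nfs4_* helper */
	struct nfs4_exception exception = { };
	int err;

	do {
		err = _nfs4_do_something(server, ...);
		/* terminal cases break out above; the rest come here */
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);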
@@ -929,6 +1311,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
929 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1311 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
930 } 1312 }
931 data->timestamp = jiffies; 1313 data->timestamp = jiffies;
1314 if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
1315 &data->o_arg.seq_args,
1316 &data->o_res.seq_res, 1, task))
1317 return;
932 rpc_call_start(task); 1318 rpc_call_start(task);
933 return; 1319 return;
934out_no_action: 1320out_no_action:
@@ -941,6 +1327,10 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
941 struct nfs4_opendata *data = calldata; 1327 struct nfs4_opendata *data = calldata;
942 1328
943 data->rpc_status = task->tk_status; 1329 data->rpc_status = task->tk_status;
1330
1331 nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res,
1332 task->tk_status);
1333
944 if (RPC_ASSASSINATED(task)) 1334 if (RPC_ASSASSINATED(task))
945 return; 1335 return;
946 if (task->tk_status == 0) { 1336 if (task->tk_status == 0) {
@@ -1269,7 +1659,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1269 } else 1659 } else
1270 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1660 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1271 1661
1272 status = rpc_call_sync(server->client, &msg, 0); 1662 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
1273 if (status == 0 && state != NULL) 1663 if (status == 0 && state != NULL)
1274 renew_lease(server, timestamp); 1664 renew_lease(server, timestamp);
1275 return status; 1665 return status;
@@ -1318,6 +1708,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1318 struct nfs4_state *state = calldata->state; 1708 struct nfs4_state *state = calldata->state;
1319 struct nfs_server *server = NFS_SERVER(calldata->inode); 1709 struct nfs_server *server = NFS_SERVER(calldata->inode);
1320 1710
1711 nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status);
1321 if (RPC_ASSASSINATED(task)) 1712 if (RPC_ASSASSINATED(task))
1322 return; 1713 return;
1323 /* hmm. we are done with the inode, and in the process of freeing 1714 /* hmm. we are done with the inode, and in the process of freeing
@@ -1336,10 +1727,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1336 break; 1727 break;
1337 default: 1728 default:
1338 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { 1729 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
1339 rpc_restart_call(task); 1730 nfs4_restart_rpc(task, server->nfs_client);
1340 return; 1731 return;
1341 } 1732 }
1342 } 1733 }
1734 nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res);
1343 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 1735 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
1344} 1736}
1345 1737
@@ -1380,6 +1772,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1380 calldata->arg.fmode = FMODE_WRITE; 1772 calldata->arg.fmode = FMODE_WRITE;
1381 } 1773 }
1382 calldata->timestamp = jiffies; 1774 calldata->timestamp = jiffies;
1775 if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
1776 &calldata->arg.seq_args, &calldata->res.seq_res,
1777 1, task))
1778 return;
1383 rpc_call_start(task); 1779 rpc_call_start(task);
1384} 1780}
1385 1781
@@ -1419,13 +1815,15 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1419 }; 1815 };
1420 int status = -ENOMEM; 1816 int status = -ENOMEM;
1421 1817
1422 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); 1818 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
1423 if (calldata == NULL) 1819 if (calldata == NULL)
1424 goto out; 1820 goto out;
1425 calldata->inode = state->inode; 1821 calldata->inode = state->inode;
1426 calldata->state = state; 1822 calldata->state = state;
1427 calldata->arg.fh = NFS_FH(state->inode); 1823 calldata->arg.fh = NFS_FH(state->inode);
1428 calldata->arg.stateid = &state->open_stateid; 1824 calldata->arg.stateid = &state->open_stateid;
1825 if (nfs4_has_session(server->nfs_client))
1826 memset(calldata->arg.stateid->data, 0, 4); /* clear seqid */
1429 /* Serialization for the sequence id */ 1827 /* Serialization for the sequence id */
1430 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1828 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
1431 if (calldata->arg.seqid == NULL) 1829 if (calldata->arg.seqid == NULL)
@@ -1435,6 +1833,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1435 calldata->res.fattr = &calldata->fattr; 1833 calldata->res.fattr = &calldata->fattr;
1436 calldata->res.seqid = calldata->arg.seqid; 1834 calldata->res.seqid = calldata->arg.seqid;
1437 calldata->res.server = server; 1835 calldata->res.server = server;
1836 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1438 calldata->path.mnt = mntget(path->mnt); 1837 calldata->path.mnt = mntget(path->mnt);
1439 calldata->path.dentry = dget(path->dentry); 1838 calldata->path.dentry = dget(path->dentry);
1440 1839
@@ -1584,15 +1983,18 @@ void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
1584 1983
1585static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1984static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1586{ 1985{
1986 struct nfs4_server_caps_arg args = {
1987 .fhandle = fhandle,
1988 };
1587 struct nfs4_server_caps_res res = {}; 1989 struct nfs4_server_caps_res res = {};
1588 struct rpc_message msg = { 1990 struct rpc_message msg = {
1589 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], 1991 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
1590 .rpc_argp = fhandle, 1992 .rpc_argp = &args,
1591 .rpc_resp = &res, 1993 .rpc_resp = &res,
1592 }; 1994 };
1593 int status; 1995 int status;
1594 1996
1595 status = rpc_call_sync(server->client, &msg, 0); 1997 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1596 if (status == 0) { 1998 if (status == 0) {
1597 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 1999 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
1598 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2000 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
@@ -1606,6 +2008,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1606 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2008 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1607 server->acl_bitmask = res.acl_bitmask; 2009 server->acl_bitmask = res.acl_bitmask;
1608 } 2010 }
2011
1609 return status; 2012 return status;
1610} 2013}
1611 2014
@@ -1637,8 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
1637 .rpc_argp = &args, 2040 .rpc_argp = &args,
1638 .rpc_resp = &res, 2041 .rpc_resp = &res,
1639 }; 2042 };
2043
1640 nfs_fattr_init(info->fattr); 2044 nfs_fattr_init(info->fattr);
1641 return rpc_call_sync(server->client, &msg, 0); 2045 return nfs4_call_sync(server, &msg, &args, &res, 0);
1642} 2046}
1643 2047
1644static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2048static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -1728,7 +2132,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
1728 }; 2132 };
1729 2133
1730 nfs_fattr_init(fattr); 2134 nfs_fattr_init(fattr);
1731 return rpc_call_sync(server->client, &msg, 0); 2135 return nfs4_call_sync(server, &msg, &args, &res, 0);
1732} 2136}
1733 2137
1734static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2138static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -1812,7 +2216,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
1812 nfs_fattr_init(fattr); 2216 nfs_fattr_init(fattr);
1813 2217
1814 dprintk("NFS call lookupfh %s\n", name->name); 2218 dprintk("NFS call lookupfh %s\n", name->name);
1815 status = rpc_call_sync(server->client, &msg, 0); 2219 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1816 dprintk("NFS reply lookupfh: %d\n", status); 2220 dprintk("NFS reply lookupfh: %d\n", status);
1817 return status; 2221 return status;
1818} 2222}
@@ -1898,7 +2302,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
1898 args.access |= NFS4_ACCESS_EXECUTE; 2302 args.access |= NFS4_ACCESS_EXECUTE;
1899 } 2303 }
1900 nfs_fattr_init(&fattr); 2304 nfs_fattr_init(&fattr);
1901 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2305 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1902 if (!status) { 2306 if (!status) {
1903 entry->mask = 0; 2307 entry->mask = 0;
1904 if (res.access & NFS4_ACCESS_READ) 2308 if (res.access & NFS4_ACCESS_READ)
@@ -1957,13 +2361,14 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
1957 .pglen = pglen, 2361 .pglen = pglen,
1958 .pages = &page, 2362 .pages = &page,
1959 }; 2363 };
2364 struct nfs4_readlink_res res;
1960 struct rpc_message msg = { 2365 struct rpc_message msg = {
1961 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], 2366 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
1962 .rpc_argp = &args, 2367 .rpc_argp = &args,
1963 .rpc_resp = NULL, 2368 .rpc_resp = &res,
1964 }; 2369 };
1965 2370
1966 return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2371 return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
1967} 2372}
1968 2373
1969static int nfs4_proc_readlink(struct inode *inode, struct page *page, 2374static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2057,7 +2462,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2057 int status; 2462 int status;
2058 2463
2059 nfs_fattr_init(&res.dir_attr); 2464 nfs_fattr_init(&res.dir_attr);
2060 status = rpc_call_sync(server->client, &msg, 0); 2465 status = nfs4_call_sync(server, &msg, &args, &res, 1);
2061 if (status == 0) { 2466 if (status == 0) {
2062 update_changeattr(dir, &res.cinfo); 2467 update_changeattr(dir, &res.cinfo);
2063 nfs_post_op_update_inode(dir, &res.dir_attr); 2468 nfs_post_op_update_inode(dir, &res.dir_attr);
@@ -2092,8 +2497,10 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2092{ 2497{
2093 struct nfs_removeres *res = task->tk_msg.rpc_resp; 2498 struct nfs_removeres *res = task->tk_msg.rpc_resp;
2094 2499
2500 nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
2095 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2501 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2096 return 0; 2502 return 0;
2503 nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res);
2097 update_changeattr(dir, &res->cinfo); 2504 update_changeattr(dir, &res->cinfo);
2098 nfs_post_op_update_inode(dir, &res->dir_attr); 2505 nfs_post_op_update_inode(dir, &res->dir_attr);
2099 return 1; 2506 return 1;
@@ -2125,7 +2532,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2125 2532
2126 nfs_fattr_init(res.old_fattr); 2533 nfs_fattr_init(res.old_fattr);
2127 nfs_fattr_init(res.new_fattr); 2534 nfs_fattr_init(res.new_fattr);
2128 status = rpc_call_sync(server->client, &msg, 0); 2535 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2129 2536
2130 if (!status) { 2537 if (!status) {
2131 update_changeattr(old_dir, &res.old_cinfo); 2538 update_changeattr(old_dir, &res.old_cinfo);
@@ -2174,7 +2581,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2174 2581
2175 nfs_fattr_init(res.fattr); 2582 nfs_fattr_init(res.fattr);
2176 nfs_fattr_init(res.dir_attr); 2583 nfs_fattr_init(res.dir_attr);
2177 status = rpc_call_sync(server->client, &msg, 0); 2584 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2178 if (!status) { 2585 if (!status) {
2179 update_changeattr(dir, &res.cinfo); 2586 update_changeattr(dir, &res.cinfo);
2180 nfs_post_op_update_inode(dir, res.dir_attr); 2587 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2235,7 +2642,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2235 2642
2236static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 2643static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2237{ 2644{
2238 int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 2645 int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg,
2646 &data->arg, &data->res, 1);
2239 if (status == 0) { 2647 if (status == 0) {
2240 update_changeattr(dir, &data->res.dir_cinfo); 2648 update_changeattr(dir, &data->res.dir_cinfo);
2241 nfs_post_op_update_inode(dir, data->res.dir_fattr); 2649 nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2344,7 +2752,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2344 (unsigned long long)cookie); 2752 (unsigned long long)cookie);
2345 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2753 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2346 res.pgbase = args.pgbase; 2754 res.pgbase = args.pgbase;
2347 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2755 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2348 if (status == 0) 2756 if (status == 0)
2349 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2757 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2350 2758
@@ -2422,14 +2830,17 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
2422 .fh = fhandle, 2830 .fh = fhandle,
2423 .bitmask = server->attr_bitmask, 2831 .bitmask = server->attr_bitmask,
2424 }; 2832 };
2833 struct nfs4_statfs_res res = {
2834 .fsstat = fsstat,
2835 };
2425 struct rpc_message msg = { 2836 struct rpc_message msg = {
2426 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], 2837 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
2427 .rpc_argp = &args, 2838 .rpc_argp = &args,
2428 .rpc_resp = fsstat, 2839 .rpc_resp = &res,
2429 }; 2840 };
2430 2841
2431 nfs_fattr_init(fsstat->fattr); 2842 nfs_fattr_init(fsstat->fattr);
2432 return rpc_call_sync(server->client, &msg, 0); 2843 return nfs4_call_sync(server, &msg, &args, &res, 0);
2433} 2844}
2434 2845
2435static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) 2846static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -2451,13 +2862,16 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
2451 .fh = fhandle, 2862 .fh = fhandle,
2452 .bitmask = server->attr_bitmask, 2863 .bitmask = server->attr_bitmask,
2453 }; 2864 };
2865 struct nfs4_fsinfo_res res = {
2866 .fsinfo = fsinfo,
2867 };
2454 struct rpc_message msg = { 2868 struct rpc_message msg = {
2455 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], 2869 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
2456 .rpc_argp = &args, 2870 .rpc_argp = &args,
2457 .rpc_resp = fsinfo, 2871 .rpc_resp = &res,
2458 }; 2872 };
2459 2873
2460 return rpc_call_sync(server->client, &msg, 0); 2874 return nfs4_call_sync(server, &msg, &args, &res, 0);
2461} 2875}
2462 2876
2463static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) 2877static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -2486,10 +2900,13 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
2486 .fh = fhandle, 2900 .fh = fhandle,
2487 .bitmask = server->attr_bitmask, 2901 .bitmask = server->attr_bitmask,
2488 }; 2902 };
2903 struct nfs4_pathconf_res res = {
2904 .pathconf = pathconf,
2905 };
2489 struct rpc_message msg = { 2906 struct rpc_message msg = {
2490 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], 2907 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
2491 .rpc_argp = &args, 2908 .rpc_argp = &args,
2492 .rpc_resp = pathconf, 2909 .rpc_resp = &res,
2493 }; 2910 };
2494 2911
2495 /* None of the pathconf attributes are mandatory to implement */ 2912 /* None of the pathconf attributes are mandatory to implement */
@@ -2499,7 +2916,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
2499 } 2916 }
2500 2917
2501 nfs_fattr_init(pathconf->fattr); 2918 nfs_fattr_init(pathconf->fattr);
2502 return rpc_call_sync(server->client, &msg, 0); 2919 return nfs4_call_sync(server, &msg, &args, &res, 0);
2503} 2920}
2504 2921
2505static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 2922static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2520,8 +2937,13 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
2520{ 2937{
2521 struct nfs_server *server = NFS_SERVER(data->inode); 2938 struct nfs_server *server = NFS_SERVER(data->inode);
2522 2939
2940 dprintk("--> %s\n", __func__);
2941
2942 /* nfs4_sequence_free_slot called in the read rpc_call_done */
2943 nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
2944
2523 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 2945 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
2524 rpc_restart_call(task); 2946 nfs4_restart_rpc(task, server->nfs_client);
2525 return -EAGAIN; 2947 return -EAGAIN;
2526 } 2948 }
2527 2949
@@ -2541,8 +2963,12 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
2541{ 2963{
2542 struct inode *inode = data->inode; 2964 struct inode *inode = data->inode;
2543 2965
2966 /* slot is freed in nfs_writeback_done */
2967 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
2968 task->tk_status);
2969
2544 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 2970 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
2545 rpc_restart_call(task); 2971 nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
2546 return -EAGAIN; 2972 return -EAGAIN;
2547 } 2973 }
2548 if (task->tk_status >= 0) { 2974 if (task->tk_status >= 0) {
@@ -2567,10 +2993,14 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
2567{ 2993{
2568 struct inode *inode = data->inode; 2994 struct inode *inode = data->inode;
2569 2995
2996 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
2997 task->tk_status);
2570 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 2998 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
2571 rpc_restart_call(task); 2999 nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
2572 return -EAGAIN; 3000 return -EAGAIN;
2573 } 3001 }
3002 nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client,
3003 &data->res.seq_res);
2574 nfs_refresh_inode(inode, data->res.fattr); 3004 nfs_refresh_inode(inode, data->res.fattr);
2575 return 0; 3005 return 0;
2576} 3006}
@@ -2603,6 +3033,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
2603 if (time_before(clp->cl_last_renewal,timestamp)) 3033 if (time_before(clp->cl_last_renewal,timestamp))
2604 clp->cl_last_renewal = timestamp; 3034 clp->cl_last_renewal = timestamp;
2605 spin_unlock(&clp->cl_lock); 3035 spin_unlock(&clp->cl_lock);
3036 dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
3037 task->tk_msg.rpc_cred);
3038 put_rpccred(task->tk_msg.rpc_cred);
2606} 3039}
2607 3040
2608static const struct rpc_call_ops nfs4_renew_ops = { 3041static const struct rpc_call_ops nfs4_renew_ops = {
@@ -2742,12 +3175,14 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
2742 .acl_pages = pages, 3175 .acl_pages = pages,
2743 .acl_len = buflen, 3176 .acl_len = buflen,
2744 }; 3177 };
2745 size_t resp_len = buflen; 3178 struct nfs_getaclres res = {
3179 .acl_len = buflen,
3180 };
2746 void *resp_buf; 3181 void *resp_buf;
2747 struct rpc_message msg = { 3182 struct rpc_message msg = {
2748 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], 3183 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
2749 .rpc_argp = &args, 3184 .rpc_argp = &args,
2750 .rpc_resp = &resp_len, 3185 .rpc_resp = &res,
2751 }; 3186 };
2752 struct page *localpage = NULL; 3187 struct page *localpage = NULL;
2753 int ret; 3188 int ret;
@@ -2761,26 +3196,26 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
2761 return -ENOMEM; 3196 return -ENOMEM;
2762 args.acl_pages[0] = localpage; 3197 args.acl_pages[0] = localpage;
2763 args.acl_pgbase = 0; 3198 args.acl_pgbase = 0;
2764 resp_len = args.acl_len = PAGE_SIZE; 3199 args.acl_len = PAGE_SIZE;
2765 } else { 3200 } else {
2766 resp_buf = buf; 3201 resp_buf = buf;
2767 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); 3202 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
2768 } 3203 }
2769 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 3204 ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
2770 if (ret) 3205 if (ret)
2771 goto out_free; 3206 goto out_free;
2772 if (resp_len > args.acl_len) 3207 if (res.acl_len > args.acl_len)
2773 nfs4_write_cached_acl(inode, NULL, resp_len); 3208 nfs4_write_cached_acl(inode, NULL, res.acl_len);
2774 else 3209 else
2775 nfs4_write_cached_acl(inode, resp_buf, resp_len); 3210 nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
2776 if (buf) { 3211 if (buf) {
2777 ret = -ERANGE; 3212 ret = -ERANGE;
2778 if (resp_len > buflen) 3213 if (res.acl_len > buflen)
2779 goto out_free; 3214 goto out_free;
2780 if (localpage) 3215 if (localpage)
2781 memcpy(buf, resp_buf, resp_len); 3216 memcpy(buf, resp_buf, res.acl_len);
2782 } 3217 }
2783 ret = resp_len; 3218 ret = res.acl_len;
2784out_free: 3219out_free:
2785 if (localpage) 3220 if (localpage)
2786 __free_page(localpage); 3221 __free_page(localpage);
@@ -2810,8 +3245,6 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
2810 ret = nfs_revalidate_inode(server, inode); 3245 ret = nfs_revalidate_inode(server, inode);
2811 if (ret < 0) 3246 if (ret < 0)
2812 return ret; 3247 return ret;
2813 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
2814 nfs_zap_acl_cache(inode);
2815 ret = nfs4_read_cached_acl(inode, buf, buflen); 3248 ret = nfs4_read_cached_acl(inode, buf, buflen);
2816 if (ret != -ENOENT) 3249 if (ret != -ENOENT)
2817 return ret; 3250 return ret;
@@ -2827,10 +3260,11 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2827 .acl_pages = pages, 3260 .acl_pages = pages,
2828 .acl_len = buflen, 3261 .acl_len = buflen,
2829 }; 3262 };
3263 struct nfs_setaclres res;
2830 struct rpc_message msg = { 3264 struct rpc_message msg = {
2831 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], 3265 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
2832 .rpc_argp = &arg, 3266 .rpc_argp = &arg,
2833 .rpc_resp = NULL, 3267 .rpc_resp = &res,
2834 }; 3268 };
2835 int ret; 3269 int ret;
2836 3270
@@ -2838,7 +3272,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2838 return -EOPNOTSUPP; 3272 return -EOPNOTSUPP;
2839 nfs_inode_return_delegation(inode); 3273 nfs_inode_return_delegation(inode);
2840 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3274 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2841 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 3275 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
2842 nfs_access_zap_cache(inode); 3276 nfs_access_zap_cache(inode);
2843 nfs_zap_acl_cache(inode); 3277 nfs_zap_acl_cache(inode);
2844 return ret; 3278 return ret;
@@ -2857,10 +3291,8 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2857} 3291}
2858 3292
2859static int 3293static int
2860nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 3294_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
2861{ 3295{
2862 struct nfs_client *clp = server->nfs_client;
2863
2864 if (!clp || task->tk_status >= 0) 3296 if (!clp || task->tk_status >= 0)
2865 return 0; 3297 return 0;
2866 switch(task->tk_status) { 3298 switch(task->tk_status) {
@@ -2879,8 +3311,23 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
2879 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3311 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
2880 task->tk_status = 0; 3312 task->tk_status = 0;
2881 return -EAGAIN; 3313 return -EAGAIN;
3314#if defined(CONFIG_NFS_V4_1)
3315 case -NFS4ERR_BADSESSION:
3316 case -NFS4ERR_BADSLOT:
3317 case -NFS4ERR_BAD_HIGH_SLOT:
3318 case -NFS4ERR_DEADSESSION:
3319 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
3320 case -NFS4ERR_SEQ_FALSE_RETRY:
3321 case -NFS4ERR_SEQ_MISORDERED:
3322 dprintk("%s ERROR %d, Reset session\n", __func__,
3323 task->tk_status);
3324 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
3325 task->tk_status = 0;
3326 return -EAGAIN;
3327#endif /* CONFIG_NFS_V4_1 */
2882 case -NFS4ERR_DELAY: 3328 case -NFS4ERR_DELAY:
2883 nfs_inc_server_stats(server, NFSIOS_DELAY); 3329 if (server)
3330 nfs_inc_server_stats(server, NFSIOS_DELAY);
2884 case -NFS4ERR_GRACE: 3331 case -NFS4ERR_GRACE:
2885 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3332 rpc_delay(task, NFS4_POLL_RETRY_MAX);
2886 task->tk_status = 0; 3333 task->tk_status = 0;
@@ -2893,6 +3340,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
2893 return 0; 3340 return 0;
2894} 3341}
2895 3342
3343static int
3344nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
3345{
3346 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3347}
3348
2896int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 3349int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
2897{ 3350{
2898 nfs4_verifier sc_verifier; 3351 nfs4_verifier sc_verifier;
@@ -3000,6 +3453,10 @@ struct nfs4_delegreturndata {
3000static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) 3453static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
3001{ 3454{
3002 struct nfs4_delegreturndata *data = calldata; 3455 struct nfs4_delegreturndata *data = calldata;
3456
3457 nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res,
3458 task->tk_status);
3459
3003 data->rpc_status = task->tk_status; 3460 data->rpc_status = task->tk_status;
3004 if (data->rpc_status == 0) 3461 if (data->rpc_status == 0)
3005 renew_lease(data->res.server, data->timestamp); 3462 renew_lease(data->res.server, data->timestamp);
@@ -3010,7 +3467,25 @@ static void nfs4_delegreturn_release(void *calldata)
3010 kfree(calldata); 3467 kfree(calldata);
3011} 3468}
3012 3469
3470#if defined(CONFIG_NFS_V4_1)
3471static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
3472{
3473 struct nfs4_delegreturndata *d_data;
3474
3475 d_data = (struct nfs4_delegreturndata *)data;
3476
3477 if (nfs4_setup_sequence(d_data->res.server->nfs_client,
3478 &d_data->args.seq_args,
3479 &d_data->res.seq_res, 1, task))
3480 return;
3481 rpc_call_start(task);
3482}
3483#endif /* CONFIG_NFS_V4_1 */
3484
3013static const struct rpc_call_ops nfs4_delegreturn_ops = { 3485static const struct rpc_call_ops nfs4_delegreturn_ops = {
3486#if defined(CONFIG_NFS_V4_1)
3487 .rpc_call_prepare = nfs4_delegreturn_prepare,
3488#endif /* CONFIG_NFS_V4_1 */
3014 .rpc_call_done = nfs4_delegreturn_done, 3489 .rpc_call_done = nfs4_delegreturn_done,
3015 .rpc_release = nfs4_delegreturn_release, 3490 .rpc_release = nfs4_delegreturn_release,
3016}; 3491};
@@ -3032,7 +3507,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3032 }; 3507 };
3033 int status = 0; 3508 int status = 0;
3034 3509
3035 data = kmalloc(sizeof(*data), GFP_KERNEL); 3510 data = kzalloc(sizeof(*data), GFP_KERNEL);
3036 if (data == NULL) 3511 if (data == NULL)
3037 return -ENOMEM; 3512 return -ENOMEM;
3038 data->args.fhandle = &data->fh; 3513 data->args.fhandle = &data->fh;
@@ -3042,6 +3517,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3042 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3517 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3043 data->res.fattr = &data->fattr; 3518 data->res.fattr = &data->fattr;
3044 data->res.server = server; 3519 data->res.server = server;
3520 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3045 nfs_fattr_init(data->res.fattr); 3521 nfs_fattr_init(data->res.fattr);
3046 data->timestamp = jiffies; 3522 data->timestamp = jiffies;
3047 data->rpc_status = 0; 3523 data->rpc_status = 0;
@@ -3127,7 +3603,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3127 goto out; 3603 goto out;
3128 lsp = request->fl_u.nfs4_fl.owner; 3604 lsp = request->fl_u.nfs4_fl.owner;
3129 arg.lock_owner.id = lsp->ls_id.id; 3605 arg.lock_owner.id = lsp->ls_id.id;
3130 status = rpc_call_sync(server->client, &msg, 0); 3606 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3131 switch (status) { 3607 switch (status) {
3132 case 0: 3608 case 0:
3133 request->fl_type = F_UNLCK; 3609 request->fl_type = F_UNLCK;
@@ -3187,13 +3663,14 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3187 struct nfs4_unlockdata *p; 3663 struct nfs4_unlockdata *p;
3188 struct inode *inode = lsp->ls_state->inode; 3664 struct inode *inode = lsp->ls_state->inode;
3189 3665
3190 p = kmalloc(sizeof(*p), GFP_KERNEL); 3666 p = kzalloc(sizeof(*p), GFP_KERNEL);
3191 if (p == NULL) 3667 if (p == NULL)
3192 return NULL; 3668 return NULL;
3193 p->arg.fh = NFS_FH(inode); 3669 p->arg.fh = NFS_FH(inode);
3194 p->arg.fl = &p->fl; 3670 p->arg.fl = &p->fl;
3195 p->arg.seqid = seqid; 3671 p->arg.seqid = seqid;
3196 p->res.seqid = seqid; 3672 p->res.seqid = seqid;
3673 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3197 p->arg.stateid = &lsp->ls_stateid; 3674 p->arg.stateid = &lsp->ls_stateid;
3198 p->lsp = lsp; 3675 p->lsp = lsp;
3199 atomic_inc(&lsp->ls_count); 3676 atomic_inc(&lsp->ls_count);
@@ -3217,6 +3694,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3217{ 3694{
3218 struct nfs4_unlockdata *calldata = data; 3695 struct nfs4_unlockdata *calldata = data;
3219 3696
3697 nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
3698 task->tk_status);
3220 if (RPC_ASSASSINATED(task)) 3699 if (RPC_ASSASSINATED(task))
3221 return; 3700 return;
3222 switch (task->tk_status) { 3701 switch (task->tk_status) {
@@ -3233,8 +3712,11 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3233 break; 3712 break;
3234 default: 3713 default:
3235 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 3714 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
3236 rpc_restart_call(task); 3715 nfs4_restart_rpc(task,
3716 calldata->server->nfs_client);
3237 } 3717 }
3718 nfs4_sequence_free_slot(calldata->server->nfs_client,
3719 &calldata->res.seq_res);
3238} 3720}
3239 3721
3240static void nfs4_locku_prepare(struct rpc_task *task, void *data) 3722static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -3249,6 +3731,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
3249 return; 3731 return;
3250 } 3732 }
3251 calldata->timestamp = jiffies; 3733 calldata->timestamp = jiffies;
3734 if (nfs4_setup_sequence(calldata->server->nfs_client,
3735 &calldata->arg.seq_args,
3736 &calldata->res.seq_res, 1, task))
3737 return;
3252 rpc_call_start(task); 3738 rpc_call_start(task);
3253} 3739}
3254 3740
@@ -3341,6 +3827,7 @@ struct nfs4_lockdata {
3341 unsigned long timestamp; 3827 unsigned long timestamp;
3342 int rpc_status; 3828 int rpc_status;
3343 int cancelled; 3829 int cancelled;
3830 struct nfs_server *server;
3344}; 3831};
3345 3832
3346static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, 3833static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
@@ -3366,7 +3853,9 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3366 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3853 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3367 p->arg.lock_owner.id = lsp->ls_id.id; 3854 p->arg.lock_owner.id = lsp->ls_id.id;
3368 p->res.lock_seqid = p->arg.lock_seqid; 3855 p->res.lock_seqid = p->arg.lock_seqid;
3856 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3369 p->lsp = lsp; 3857 p->lsp = lsp;
3858 p->server = server;
3370 atomic_inc(&lsp->ls_count); 3859 atomic_inc(&lsp->ls_count);
3371 p->ctx = get_nfs_open_context(ctx); 3860 p->ctx = get_nfs_open_context(ctx);
3372 memcpy(&p->fl, fl, sizeof(p->fl)); 3861 memcpy(&p->fl, fl, sizeof(p->fl));
@@ -3396,6 +3885,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
3396 } else 3885 } else
3397 data->arg.new_lock_owner = 0; 3886 data->arg.new_lock_owner = 0;
3398 data->timestamp = jiffies; 3887 data->timestamp = jiffies;
3888 if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
3889 &data->res.seq_res, 1, task))
3890 return;
3399 rpc_call_start(task); 3891 rpc_call_start(task);
3400 dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); 3892 dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
3401} 3893}
@@ -3406,6 +3898,9 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
3406 3898
3407 dprintk("%s: begin!\n", __func__); 3899 dprintk("%s: begin!\n", __func__);
3408 3900
3901 nfs4_sequence_done_free_slot(data->server, &data->res.seq_res,
3902 task->tk_status);
3903
3409 data->rpc_status = task->tk_status; 3904 data->rpc_status = task->tk_status;
3410 if (RPC_ASSASSINATED(task)) 3905 if (RPC_ASSASSINATED(task))
3411 goto out; 3906 goto out;
@@ -3487,8 +3982,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
3487 ret = nfs4_wait_for_completion_rpc_task(task); 3982 ret = nfs4_wait_for_completion_rpc_task(task);
3488 if (ret == 0) { 3983 if (ret == 0) {
3489 ret = data->rpc_status; 3984 ret = data->rpc_status;
3490 if (ret == -NFS4ERR_DENIED)
3491 ret = -EAGAIN;
3492 } else 3985 } else
3493 data->cancelled = 1; 3986 data->cancelled = 1;
3494 rpc_put_task(task); 3987 rpc_put_task(task);
@@ -3576,9 +4069,11 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
3576 int err; 4069 int err;
3577 4070
3578 do { 4071 do {
4072 err = _nfs4_proc_setlk(state, cmd, request);
4073 if (err == -NFS4ERR_DENIED)
4074 err = -EAGAIN;
3579 err = nfs4_handle_exception(NFS_SERVER(state->inode), 4075 err = nfs4_handle_exception(NFS_SERVER(state->inode),
3580 _nfs4_proc_setlk(state, cmd, request), 4076 err, &exception);
3581 &exception);
3582 } while (exception.retry); 4077 } while (exception.retry);
3583 return err; 4078 return err;
3584} 4079}
@@ -3598,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
3598 if (request->fl_start < 0 || request->fl_end < 0) 4093 if (request->fl_start < 0 || request->fl_end < 0)
3599 return -EINVAL; 4094 return -EINVAL;
3600 4095
3601 if (IS_GETLK(cmd)) 4096 if (IS_GETLK(cmd)) {
3602 return nfs4_proc_getlk(state, F_GETLK, request); 4097 if (state != NULL)
4098 return nfs4_proc_getlk(state, F_GETLK, request);
4099 return 0;
4100 }
3603 4101
3604 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) 4102 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
3605 return -EINVAL; 4103 return -EINVAL;
3606 4104
3607 if (request->fl_type == F_UNLCK) 4105 if (request->fl_type == F_UNLCK) {
3608 return nfs4_proc_unlck(state, cmd, request); 4106 if (state != NULL)
4107 return nfs4_proc_unlck(state, cmd, request);
4108 return 0;
4109 }
3609 4110
4111 if (state == NULL)
4112 return -ENOLCK;
3610 do { 4113 do {
3611 status = nfs4_proc_setlk(state, cmd, request); 4114 status = nfs4_proc_setlk(state, cmd, request);
3612 if ((status != -EAGAIN) || IS_SETLK(cmd)) 4115 if ((status != -EAGAIN) || IS_SETLK(cmd))
@@ -3630,8 +4133,37 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
3630 goto out; 4133 goto out;
3631 do { 4134 do {
3632 err = _nfs4_do_setlk(state, F_SETLK, fl, 0); 4135 err = _nfs4_do_setlk(state, F_SETLK, fl, 0);
3633 if (err != -NFS4ERR_DELAY) 4136 switch (err) {
3634 break; 4137 default:
4138 printk(KERN_ERR "%s: unhandled error %d.\n",
4139 __func__, err);
4140 case 0:
4141 case -ESTALE:
4142 goto out;
4143 case -NFS4ERR_EXPIRED:
4144 case -NFS4ERR_STALE_CLIENTID:
4145 case -NFS4ERR_STALE_STATEID:
4146 nfs4_schedule_state_recovery(server->nfs_client);
4147 goto out;
4148 case -ERESTARTSYS:
4149 /*
4150 * The show must go on: exit, but mark the
4151 * stateid as needing recovery.
4152 */
4153 case -NFS4ERR_ADMIN_REVOKED:
4154 case -NFS4ERR_BAD_STATEID:
4155 case -NFS4ERR_OPENMODE:
4156 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4157 err = 0;
4158 goto out;
4159 case -ENOMEM:
4160 case -NFS4ERR_DENIED:
4161 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4162 err = 0;
4163 goto out;
4164 case -NFS4ERR_DELAY:
4165 break;
4166 }
3635 err = nfs4_handle_exception(server, err, &exception); 4167 err = nfs4_handle_exception(server, err, &exception);
3636 } while (exception.retry); 4168 } while (exception.retry);
3637out: 4169out:
@@ -3706,10 +4238,13 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3706 .page = page, 4238 .page = page,
3707 .bitmask = bitmask, 4239 .bitmask = bitmask,
3708 }; 4240 };
4241 struct nfs4_fs_locations_res res = {
4242 .fs_locations = fs_locations,
4243 };
3709 struct rpc_message msg = { 4244 struct rpc_message msg = {
3710 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], 4245 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
3711 .rpc_argp = &args, 4246 .rpc_argp = &args,
3712 .rpc_resp = fs_locations, 4247 .rpc_resp = &res,
3713 }; 4248 };
3714 int status; 4249 int status;
3715 4250
@@ -3717,24 +4252,736 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3717 nfs_fattr_init(&fs_locations->fattr); 4252 nfs_fattr_init(&fs_locations->fattr);
3718 fs_locations->server = server; 4253 fs_locations->server = server;
3719 fs_locations->nlocations = 0; 4254 fs_locations->nlocations = 0;
3720 status = rpc_call_sync(server->client, &msg, 0); 4255 status = nfs4_call_sync(server, &msg, &args, &res, 0);
3721 nfs_fixup_referral_attributes(&fs_locations->fattr); 4256 nfs_fixup_referral_attributes(&fs_locations->fattr);
3722 dprintk("%s: returned status = %d\n", __func__, status); 4257 dprintk("%s: returned status = %d\n", __func__, status);
3723 return status; 4258 return status;
3724} 4259}
3725 4260
3726struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { 4261#ifdef CONFIG_NFS_V4_1
4262/*
4263 * nfs4_proc_exchange_id()
4264 *
4265 * Since the clientid has expired, all compounds using sessions
4266 * associated with the stale clientid will be returning
4267 * NFS4ERR_BADSESSION in the sequence operation, and will therefore
4268 * be in some phase of session reset.
4269 */
4270static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4271{
4272 nfs4_verifier verifier;
4273 struct nfs41_exchange_id_args args = {
4274 .client = clp,
4275 .flags = clp->cl_exchange_flags,
4276 };
4277 struct nfs41_exchange_id_res res = {
4278 .client = clp,
4279 };
4280 int status;
4281 struct rpc_message msg = {
4282 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
4283 .rpc_argp = &args,
4284 .rpc_resp = &res,
4285 .rpc_cred = cred,
4286 };
4287 __be32 *p;
4288
4289 dprintk("--> %s\n", __func__);
4290 BUG_ON(clp == NULL);
4291
4292 p = (u32 *)verifier.data;
4293 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4294 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4295 args.verifier = &verifier;
4296
4297 while (1) {
4298 args.id_len = scnprintf(args.id, sizeof(args.id),
4299 "%s/%s %u",
4300 clp->cl_ipaddr,
4301 rpc_peeraddr2str(clp->cl_rpcclient,
4302 RPC_DISPLAY_ADDR),
4303 clp->cl_id_uniquifier);
4304
4305 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4306
4307 if (status != NFS4ERR_CLID_INUSE)
4308 break;
4309
4310 if (signalled())
4311 break;
4312
4313 if (++clp->cl_id_uniquifier == 0)
4314 break;
4315 }
4316
4317 dprintk("<-- %s status= %d\n", __func__, status);
4318 return status;
4319}
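The verifier built above packs the client's boot time into two network-order 32-bit words, so a rebooted client presents a different verifier and the server can distinguish a restart from a retransmission. A self-contained sketch of that packing (the boot-time constants are placeholders, not values from the patch):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char verifier[8];	/* 8-byte NFSv4 verifier */
	uint32_t boot_sec = 1252900000;	/* stand-in for clp->cl_boot_time.tv_sec */
	uint32_t boot_nsec = 123456789;	/* stand-in for tv_nsec */
	uint32_t w;

	w = htonl(boot_sec);
	memcpy(verifier, &w, sizeof(w));
	w = htonl(boot_nsec);
	memcpy(verifier + 4, &w, sizeof(w));

	for (int i = 0; i < 8; i++)
		printf("%02x", verifier[i]);
	printf("\n");
	return 0;
}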
4320
4321struct nfs4_get_lease_time_data {
4322 struct nfs4_get_lease_time_args *args;
4323 struct nfs4_get_lease_time_res *res;
4324 struct nfs_client *clp;
4325};
4326
4327static void nfs4_get_lease_time_prepare(struct rpc_task *task,
4328 void *calldata)
4329{
4330 int ret;
4331 struct nfs4_get_lease_time_data *data =
4332 (struct nfs4_get_lease_time_data *)calldata;
4333
4334 dprintk("--> %s\n", __func__);
 4335 /* just set up the sequence; do not trigger session recovery
 4336 since we're invoked within one */
4337 ret = nfs41_setup_sequence(data->clp->cl_session,
4338 &data->args->la_seq_args,
4339 &data->res->lr_seq_res, 0, task);
4340
4341 BUG_ON(ret == -EAGAIN);
4342 rpc_call_start(task);
4343 dprintk("<-- %s\n", __func__);
4344}
4345
4346/*
4347 * Called from nfs4_state_manager thread for session setup, so don't recover
4348 * from sequence operation or clientid errors.
4349 */
4350static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4351{
4352 struct nfs4_get_lease_time_data *data =
4353 (struct nfs4_get_lease_time_data *)calldata;
4354
4355 dprintk("--> %s\n", __func__);
4356 nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
4357 switch (task->tk_status) {
4358 case -NFS4ERR_DELAY:
4359 case -NFS4ERR_GRACE:
4360 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4361 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4362 task->tk_status = 0;
4363 nfs4_restart_rpc(task, data->clp);
4364 return;
4365 }
4366 nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
4367 dprintk("<-- %s\n", __func__);
4368}
4369
4370struct rpc_call_ops nfs4_get_lease_time_ops = {
4371 .rpc_call_prepare = nfs4_get_lease_time_prepare,
4372 .rpc_call_done = nfs4_get_lease_time_done,
4373};
4374
4375int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4376{
4377 struct rpc_task *task;
4378 struct nfs4_get_lease_time_args args;
4379 struct nfs4_get_lease_time_res res = {
4380 .lr_fsinfo = fsinfo,
4381 };
4382 struct nfs4_get_lease_time_data data = {
4383 .args = &args,
4384 .res = &res,
4385 .clp = clp,
4386 };
4387 struct rpc_message msg = {
4388 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
4389 .rpc_argp = &args,
4390 .rpc_resp = &res,
4391 };
4392 struct rpc_task_setup task_setup = {
4393 .rpc_client = clp->cl_rpcclient,
4394 .rpc_message = &msg,
4395 .callback_ops = &nfs4_get_lease_time_ops,
4396 .callback_data = &data
4397 };
4398 int status;
4399
4400 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4401 dprintk("--> %s\n", __func__);
4402 task = rpc_run_task(&task_setup);
4403
4404 if (IS_ERR(task))
4405 status = PTR_ERR(task);
4406 else {
4407 status = task->tk_status;
4408 rpc_put_task(task);
4409 }
4410 dprintk("<-- %s return %d\n", __func__, status);
4411
4412 return status;
4413}
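When rpc_run_task() fails, the error comes back encoded in the pointer itself, which is why the function above branches on IS_ERR() and extracts the status with PTR_ERR(). A user-space model of that idiom (simplified; the 4095 errno ceiling mirrors the kernel's MAX_ERRNO):

#include <stdio.h>

#define MAX_ERRNO 4095L

static int is_err(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static long ptr_err(const void *p)
{
	return (long)p;
}

int main(void)
{
	void *task = (void *)-12L;	/* as if rpc_run_task() returned -ENOMEM */
	long status;

	if (is_err(task))
		status = ptr_err(task);
	else
		status = 0;	/* real code reads task->tk_status, then rpc_put_task() */
	printf("status=%ld\n", status);
	return 0;
}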
4414
4415/*
4416 * Reset a slot table
4417 */
4418static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots,
4419 int old_max_slots, int ivalue)
4420{
4421 int i;
4422 int ret = 0;
4423
4424 dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl);
4425
4426 /*
4427 * Until we have dynamic slot table adjustment, insist
4428 * upon the same slot table size
4429 */
4430 if (max_slots != old_max_slots) {
 4431 dprintk("%s reset slot table doesn't match old\n",
4432 __func__);
4433 ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */
4434 goto out;
4435 }
4436 spin_lock(&tbl->slot_tbl_lock);
4437 for (i = 0; i < max_slots; ++i)
4438 tbl->slots[i].seq_nr = ivalue;
4439 tbl->highest_used_slotid = -1;
4440 spin_unlock(&tbl->slot_tbl_lock);
4441 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
4442 tbl, tbl->slots, tbl->max_slots);
4443out:
4444 dprintk("<-- %s: return %d\n", __func__, ret);
4445 return ret;
4446}
4447
4448/*
4449 * Reset the forechannel and backchannel slot tables
4450 */
4451static int nfs4_reset_slot_tables(struct nfs4_session *session)
4452{
4453 int status;
4454
4455 status = nfs4_reset_slot_table(&session->fc_slot_table,
4456 session->fc_attrs.max_reqs,
4457 session->fc_slot_table.max_slots,
4458 1);
4459 if (status)
4460 return status;
4461
4462 status = nfs4_reset_slot_table(&session->bc_slot_table,
4463 session->bc_attrs.max_reqs,
4464 session->bc_slot_table.max_slots,
4465 0);
4466 return status;
4467}
4468
4469/* Destroy the slot table */
4470static void nfs4_destroy_slot_tables(struct nfs4_session *session)
4471{
4472 if (session->fc_slot_table.slots != NULL) {
4473 kfree(session->fc_slot_table.slots);
4474 session->fc_slot_table.slots = NULL;
4475 }
4476 if (session->bc_slot_table.slots != NULL) {
4477 kfree(session->bc_slot_table.slots);
4478 session->bc_slot_table.slots = NULL;
4479 }
4480 return;
4481}
4482
4483/*
4484 * Initialize slot table
4485 */
4486static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4487 int max_slots, int ivalue)
4488{
4489 int i;
4490 struct nfs4_slot *slot;
4491 int ret = -ENOMEM;
4492
4493 BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
4494
4495 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
4496
4497 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
4498 if (!slot)
4499 goto out;
4500 for (i = 0; i < max_slots; ++i)
4501 slot[i].seq_nr = ivalue;
4502 ret = 0;
4503
4504 spin_lock(&tbl->slot_tbl_lock);
4505 if (tbl->slots != NULL) {
4506 spin_unlock(&tbl->slot_tbl_lock);
4507 dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
4508 __func__, tbl, tbl->slots);
4509 WARN_ON(1);
4510 goto out_free;
4511 }
4512 tbl->max_slots = max_slots;
4513 tbl->slots = slot;
4514 tbl->highest_used_slotid = -1; /* no slot is currently used */
4515 spin_unlock(&tbl->slot_tbl_lock);
4516 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
4517 tbl, tbl->slots, tbl->max_slots);
4518out:
4519 dprintk("<-- %s: return %d\n", __func__, ret);
4520 return ret;
4521
4522out_free:
4523 kfree(slot);
4524 goto out;
4525}
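A slot table, then, is a fixed array plus a highest_used_slotid watermark, with -1 meaning the table is idle. A toy user-space model of allocating the lowest free slot and retiring the watermark on free (illustrative only; the kernel's allocator differs in detail and runs under slot_tbl_lock):

#include <stdio.h>

#define MAX_SLOTS 4

static int used[MAX_SLOTS];
static int highest_used = -1;

static int alloc_slot(void)
{
	for (int i = 0; i < MAX_SLOTS; i++) {
		if (!used[i]) {
			used[i] = 1;
			if (i > highest_used)
				highest_used = i;
			return i;
		}
	}
	return -1;	/* caller waits on the slot table waitqueue */
}

static void free_slot(int i)
{
	used[i] = 0;
	while (highest_used >= 0 && !used[highest_used])
		highest_used--;	/* lower the watermark past freed slots */
}

int main(void)
{
	int a = alloc_slot(), b = alloc_slot();
	printf("a=%d b=%d highest=%d\n", a, b, highest_used);
	free_slot(b);
	free_slot(a);
	printf("highest=%d (table idle)\n", highest_used);
	return 0;
}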
4526
4527/*
4528 * Initialize the forechannel and backchannel tables
4529 */
4530static int nfs4_init_slot_tables(struct nfs4_session *session)
4531{
4532 int status;
4533
4534 status = nfs4_init_slot_table(&session->fc_slot_table,
4535 session->fc_attrs.max_reqs, 1);
4536 if (status)
4537 return status;
4538
4539 status = nfs4_init_slot_table(&session->bc_slot_table,
4540 session->bc_attrs.max_reqs, 0);
4541 if (status)
4542 nfs4_destroy_slot_tables(session);
4543
4544 return status;
4545}
4546
4547struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4548{
4549 struct nfs4_session *session;
4550 struct nfs4_slot_table *tbl;
4551
4552 session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
4553 if (!session)
4554 return NULL;
4555
4556 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
4557 /*
4558 * The create session reply races with the server back
4559 * channel probe. Mark the client NFS_CS_SESSION_INITING
4560 * so that the client back channel can find the
4561 * nfs_client struct
4562 */
4563 clp->cl_cons_state = NFS_CS_SESSION_INITING;
4564
4565 tbl = &session->fc_slot_table;
4566 spin_lock_init(&tbl->slot_tbl_lock);
4567 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4568
4569 tbl = &session->bc_slot_table;
4570 spin_lock_init(&tbl->slot_tbl_lock);
4571 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4572
4573 session->clp = clp;
4574 return session;
4575}
4576
4577void nfs4_destroy_session(struct nfs4_session *session)
4578{
4579 nfs4_proc_destroy_session(session);
4580 dprintk("%s Destroy backchannel for xprt %p\n",
4581 __func__, session->clp->cl_rpcclient->cl_xprt);
4582 xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
4583 NFS41_BC_MIN_CALLBACKS);
4584 nfs4_destroy_slot_tables(session);
4585 kfree(session);
4586}
4587
4588/*
 4589 * Initialize the values to be used by the client in CREATE_SESSION.
4590 * If nfs4_init_session set the fore channel request and response sizes,
4591 * use them.
4592 *
4593 * Set the back channel max_resp_sz_cached to zero to force the client to
4594 * always set csa_cachethis to FALSE because the current implementation
4595 * of the back channel DRC only supports caching the CB_SEQUENCE operation.
4596 */
4597static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4598{
4599 struct nfs4_session *session = args->client->cl_session;
4600 unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz,
4601 mxresp_sz = session->fc_attrs.max_resp_sz;
4602
4603 if (mxrqst_sz == 0)
4604 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
4605 if (mxresp_sz == 0)
4606 mxresp_sz = NFS_MAX_FILE_IO_SIZE;
4607 /* Fore channel attributes */
4608 args->fc_attrs.headerpadsz = 0;
4609 args->fc_attrs.max_rqst_sz = mxrqst_sz;
4610 args->fc_attrs.max_resp_sz = mxresp_sz;
4611 args->fc_attrs.max_resp_sz_cached = mxresp_sz;
4612 args->fc_attrs.max_ops = NFS4_MAX_OPS;
4613 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
4614
4615 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
4616 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
4617 __func__,
4618 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
4619 args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops,
4620 args->fc_attrs.max_reqs);
4621
4622 /* Back channel attributes */
4623 args->bc_attrs.headerpadsz = 0;
4624 args->bc_attrs.max_rqst_sz = PAGE_SIZE;
4625 args->bc_attrs.max_resp_sz = PAGE_SIZE;
4626 args->bc_attrs.max_resp_sz_cached = 0;
4627 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
4628 args->bc_attrs.max_reqs = 1;
4629
4630 dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
4631 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
4632 __func__,
4633 args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz,
4634 args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops,
4635 args->bc_attrs.max_reqs);
4636}
4637
4638static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
4639{
4640 if (rcvd <= sent)
4641 return 0;
4642 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
4643 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
4644 return -EINVAL;
4645}
4646
4647#define _verify_fore_channel_attr(_name_) \
4648 _verify_channel_attr("fore", #_name_, \
4649 args->fc_attrs._name_, \
4650 session->fc_attrs._name_)
4651
4652#define _verify_back_channel_attr(_name_) \
4653 _verify_channel_attr("back", #_name_, \
4654 args->bc_attrs._name_, \
4655 session->bc_attrs._name_)
4656
4657/*
4658 * The server is not allowed to increase the fore channel header pad size,
4659 * maximum response size, or maximum number of operations.
4660 *
 4661 * The back channel attributes are only negotiated down: we send what the
4662 * (back channel) server insists upon.
4663 */
4664static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4665 struct nfs4_session *session)
4666{
4667 int ret = 0;
4668
4669 ret |= _verify_fore_channel_attr(headerpadsz);
4670 ret |= _verify_fore_channel_attr(max_resp_sz);
4671 ret |= _verify_fore_channel_attr(max_ops);
4672
4673 ret |= _verify_back_channel_attr(headerpadsz);
4674 ret |= _verify_back_channel_attr(max_rqst_sz);
4675 ret |= _verify_back_channel_attr(max_resp_sz);
4676 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4677 ret |= _verify_back_channel_attr(max_ops);
4678 ret |= _verify_back_channel_attr(max_reqs);
4679
4680 return ret;
4681}
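Put differently, the reply is accepted only if every checked attribute came back at or below the value the client sent. A compact user-space model of that negotiate-down rule (names and numbers are illustrative):

#include <stdio.h>

static int verify_attr(const char *name, unsigned int sent, unsigned int rcvd)
{
	if (rcvd <= sent)
		return 0;
	fprintf(stderr, "session INVALID: %s increased (sent=%u rcvd=%u)\n",
		name, sent, rcvd);
	return -1;
}

int main(void)
{
	int ret = 0;

	ret |= verify_attr("max_resp_sz", 4096, 4096);	/* unchanged: ok */
	ret |= verify_attr("max_ops", 8, 16);		/* raised: rejected */
	return ret ? 1 : 0;
}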
4682
4683static int _nfs4_proc_create_session(struct nfs_client *clp)
4684{
4685 struct nfs4_session *session = clp->cl_session;
4686 struct nfs41_create_session_args args = {
4687 .client = clp,
4688 .cb_program = NFS4_CALLBACK,
4689 };
4690 struct nfs41_create_session_res res = {
4691 .client = clp,
4692 };
4693 struct rpc_message msg = {
4694 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
4695 .rpc_argp = &args,
4696 .rpc_resp = &res,
4697 };
4698 int status;
4699
4700 nfs4_init_channel_attrs(&args);
4701 args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
4702
4703 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
4704
4705 if (!status)
4706 /* Verify the session's negotiated channel_attrs values */
4707 status = nfs4_verify_channel_attrs(&args, session);
4708 if (!status) {
4709 /* Increment the clientid slot sequence id */
4710 clp->cl_seqid++;
4711 }
4712
4713 return status;
4714}
4715
4716/*
4717 * Issues a CREATE_SESSION operation to the server.
4718 * It is the responsibility of the caller to verify the session is
4719 * expired before calling this routine.
4720 */
4721int nfs4_proc_create_session(struct nfs_client *clp, int reset)
4722{
4723 int status;
4724 unsigned *ptr;
4725 struct nfs_fsinfo fsinfo;
4726 struct nfs4_session *session = clp->cl_session;
4727
4728 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
4729
4730 status = _nfs4_proc_create_session(clp);
4731 if (status)
4732 goto out;
4733
4734 /* Init or reset the fore channel */
4735 if (reset)
4736 status = nfs4_reset_slot_tables(session);
4737 else
4738 status = nfs4_init_slot_tables(session);
4739 dprintk("fore channel slot table initialization returned %d\n", status);
4740 if (status)
4741 goto out;
4742
4743 ptr = (unsigned *)&session->sess_id.data[0];
 4744 dprintk("%s clientid seqid %d sessionid %u:%u:%u:%u\n", __func__,
4745 clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
4746
4747 if (reset)
 4748 /* Lease time is already set */
4749 goto out;
4750
4751 /* Get the lease time */
4752 status = nfs4_proc_get_lease_time(clp, &fsinfo);
4753 if (status == 0) {
4754 /* Update lease time and schedule renewal */
4755 spin_lock(&clp->cl_lock);
4756 clp->cl_lease_time = fsinfo.lease_time * HZ;
4757 clp->cl_last_renewal = jiffies;
4758 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
4759 spin_unlock(&clp->cl_lock);
4760
4761 nfs4_schedule_state_renewal(clp);
4762 }
4763out:
4764 dprintk("<-- %s\n", __func__);
4765 return status;
4766}
4767
4768/*
4769 * Issue the over-the-wire RPC DESTROY_SESSION.
4770 * The caller must serialize access to this routine.
4771 */
4772int nfs4_proc_destroy_session(struct nfs4_session *session)
4773{
4774 int status = 0;
4775 struct rpc_message msg;
4776
4777 dprintk("--> nfs4_proc_destroy_session\n");
4778
4779 /* session is still being setup */
4780 if (session->clp->cl_cons_state != NFS_CS_READY)
4781 return status;
4782
4783 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION];
4784 msg.rpc_argp = session;
4785 msg.rpc_resp = NULL;
4786 msg.rpc_cred = NULL;
4787 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
4788
4789 if (status)
4790 printk(KERN_WARNING
4791 "Got error %d from the server on DESTROY_SESSION. "
4792 "Session has been destroyed regardless...\n", status);
4793
4794 dprintk("<-- nfs4_proc_destroy_session\n");
4795 return status;
4796}
4797
4798int nfs4_init_session(struct nfs_server *server)
4799{
4800 struct nfs_client *clp = server->nfs_client;
4801 int ret;
4802
4803 if (!nfs4_has_session(clp))
4804 return 0;
4805
4806 clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;
4807 clp->cl_session->fc_attrs.max_resp_sz = server->rsize;
4808 ret = nfs4_recover_expired_lease(server);
4809 if (!ret)
4810 ret = nfs4_check_client_ready(clp);
4811 return ret;
4812}
4813
4814/*
4815 * Renew the cl_session lease.
4816 */
4817static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
4818{
4819 struct nfs4_sequence_args args;
4820 struct nfs4_sequence_res res;
4821
4822 struct rpc_message msg = {
4823 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
4824 .rpc_argp = &args,
4825 .rpc_resp = &res,
4826 .rpc_cred = cred,
4827 };
4828
4829 args.sa_cache_this = 0;
4830
4831 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
4832 &res, 0);
4833}
4834
4835void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4836{
4837 struct nfs_client *clp = (struct nfs_client *)data;
4838
4839 nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
4840
4841 if (task->tk_status < 0) {
4842 dprintk("%s ERROR %d\n", __func__, task->tk_status);
4843
4844 if (_nfs4_async_handle_error(task, NULL, clp, NULL)
4845 == -EAGAIN) {
4846 nfs4_restart_rpc(task, clp);
4847 return;
4848 }
4849 }
4850 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
4851 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4852
4853 put_rpccred(task->tk_msg.rpc_cred);
4854 kfree(task->tk_msg.rpc_argp);
4855 kfree(task->tk_msg.rpc_resp);
4856
4857 dprintk("<-- %s\n", __func__);
4858}
4859
4860static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
4861{
4862 struct nfs_client *clp;
4863 struct nfs4_sequence_args *args;
4864 struct nfs4_sequence_res *res;
4865
4866 clp = (struct nfs_client *)data;
4867 args = task->tk_msg.rpc_argp;
4868 res = task->tk_msg.rpc_resp;
4869
4870 if (nfs4_setup_sequence(clp, args, res, 0, task))
4871 return;
4872 rpc_call_start(task);
4873}
4874
4875static const struct rpc_call_ops nfs41_sequence_ops = {
4876 .rpc_call_done = nfs41_sequence_call_done,
4877 .rpc_call_prepare = nfs41_sequence_prepare,
4878};
4879
4880static int nfs41_proc_async_sequence(struct nfs_client *clp,
4881 struct rpc_cred *cred)
4882{
4883 struct nfs4_sequence_args *args;
4884 struct nfs4_sequence_res *res;
4885 struct rpc_message msg = {
4886 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
4887 .rpc_cred = cred,
4888 };
4889
4890 args = kzalloc(sizeof(*args), GFP_KERNEL);
4891 if (!args)
4892 return -ENOMEM;
4893 res = kzalloc(sizeof(*res), GFP_KERNEL);
4894 if (!res) {
4895 kfree(args);
4896 return -ENOMEM;
4897 }
4898 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
4899 msg.rpc_argp = args;
4900 msg.rpc_resp = res;
4901
4902 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
4903 &nfs41_sequence_ops, (void *)clp);
4904}
4905
4906#endif /* CONFIG_NFS_V4_1 */
4907
4908struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
3727 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, 4909 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
3728 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, 4910 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
3729 .recover_open = nfs4_open_reclaim, 4911 .recover_open = nfs4_open_reclaim,
3730 .recover_lock = nfs4_lock_reclaim, 4912 .recover_lock = nfs4_lock_reclaim,
4913 .establish_clid = nfs4_init_clientid,
4914 .get_clid_cred = nfs4_get_setclientid_cred,
4915};
4916
4917#if defined(CONFIG_NFS_V4_1)
4918struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
4919 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
4920 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
4921 .recover_open = nfs4_open_reclaim,
4922 .recover_lock = nfs4_lock_reclaim,
4923 .establish_clid = nfs4_proc_exchange_id,
4924 .get_clid_cred = nfs4_get_exchange_id_cred,
4925};
4926#endif /* CONFIG_NFS_V4_1 */
4927
4928struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
4929 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
4930 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
4931 .recover_open = nfs4_open_expired,
4932 .recover_lock = nfs4_lock_expired,
4933 .establish_clid = nfs4_init_clientid,
4934 .get_clid_cred = nfs4_get_setclientid_cred,
3731}; 4935};
3732 4936
3733struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = { 4937#if defined(CONFIG_NFS_V4_1)
4938struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
3734 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, 4939 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
3735 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 4940 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
3736 .recover_open = nfs4_open_expired, 4941 .recover_open = nfs4_open_expired,
3737 .recover_lock = nfs4_lock_expired, 4942 .recover_lock = nfs4_lock_expired,
4943 .establish_clid = nfs4_proc_exchange_id,
4944 .get_clid_cred = nfs4_get_exchange_id_cred,
4945};
4946#endif /* CONFIG_NFS_V4_1 */
4947
4948struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
4949 .sched_state_renewal = nfs4_proc_async_renew,
4950 .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
4951 .renew_lease = nfs4_proc_renew,
4952};
4953
4954#if defined(CONFIG_NFS_V4_1)
4955struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
4956 .sched_state_renewal = nfs41_proc_async_sequence,
4957 .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
4958 .renew_lease = nfs4_proc_sequence,
4959};
4960#endif
4961
4962/*
4963 * Per minor version reboot and network partition recovery ops
4964 */
4965
4966struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
4967 &nfs40_reboot_recovery_ops,
4968#if defined(CONFIG_NFS_V4_1)
4969 &nfs41_reboot_recovery_ops,
4970#endif
4971};
4972
4973struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
4974 &nfs40_nograce_recovery_ops,
4975#if defined(CONFIG_NFS_V4_1)
4976 &nfs41_nograce_recovery_ops,
4977#endif
4978};
4979
4980struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
4981 &nfs40_state_renewal_ops,
4982#if defined(CONFIG_NFS_V4_1)
4983 &nfs41_state_renewal_ops,
4984#endif
3738}; 4985};
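All three arrays are indexed directly by clp->cl_minorversion, so a v4.0 mount (index 0) and a v4.1 mount (index 1) pick their recovery and renewal behaviour at the same call sites. A minimal sketch of the dispatch (toy types, not the kernel structures):

#include <stdio.h>

struct renewal_ops {
	const char *name;
};

static struct renewal_ops nfs40_ops = { .name = "nfs40" };
static struct renewal_ops nfs41_ops = { .name = "nfs41" };

static struct renewal_ops *renewal_ops[] = {
	&nfs40_ops,
	&nfs41_ops,
};

int main(void)
{
	unsigned int minorversion = 1;	/* stand-in for clp->cl_minorversion */
	printf("using %s renewal ops\n", renewal_ops[minorversion]->name);
	return 0;
}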
3739 4986
3740static const struct inode_operations nfs4_file_inode_operations = { 4987static const struct inode_operations nfs4_file_inode_operations = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index f524e932ff7b..e27c6cef18f2 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -59,12 +59,14 @@
59void 59void
60nfs4_renew_state(struct work_struct *work) 60nfs4_renew_state(struct work_struct *work)
61{ 61{
62 struct nfs4_state_maintenance_ops *ops;
62 struct nfs_client *clp = 63 struct nfs_client *clp =
63 container_of(work, struct nfs_client, cl_renewd.work); 64 container_of(work, struct nfs_client, cl_renewd.work);
64 struct rpc_cred *cred; 65 struct rpc_cred *cred;
65 long lease, timeout; 66 long lease, timeout;
66 unsigned long last, now; 67 unsigned long last, now;
67 68
69 ops = nfs4_state_renewal_ops[clp->cl_minorversion];
68 dprintk("%s: start\n", __func__); 70 dprintk("%s: start\n", __func__);
69 /* Are there any active superblocks? */ 71 /* Are there any active superblocks? */
70 if (list_empty(&clp->cl_superblocks)) 72 if (list_empty(&clp->cl_superblocks))
@@ -76,7 +78,7 @@ nfs4_renew_state(struct work_struct *work)
76 timeout = (2 * lease) / 3 + (long)last - (long)now; 78 timeout = (2 * lease) / 3 + (long)last - (long)now;
77 /* Are we close to a lease timeout? */ 79 /* Are we close to a lease timeout? */
78 if (time_after(now, last + lease/3)) { 80 if (time_after(now, last + lease/3)) {
79 cred = nfs4_get_renew_cred_locked(clp); 81 cred = ops->get_state_renewal_cred_locked(clp);
80 spin_unlock(&clp->cl_lock); 82 spin_unlock(&clp->cl_lock);
81 if (cred == NULL) { 83 if (cred == NULL) {
82 if (list_empty(&clp->cl_delegations)) { 84 if (list_empty(&clp->cl_delegations)) {
@@ -86,7 +88,7 @@ nfs4_renew_state(struct work_struct *work)
86 nfs_expire_all_delegations(clp); 88 nfs_expire_all_delegations(clp);
87 } else { 89 } else {
88 /* Queue an asynchronous RENEW. */ 90 /* Queue an asynchronous RENEW. */
89 nfs4_proc_async_renew(clp, cred); 91 ops->sched_state_renewal(clp, cred);
90 put_rpccred(cred); 92 put_rpccred(cred);
91 } 93 }
92 timeout = (2 * lease) / 3; 94 timeout = (2 * lease) / 3;
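The arithmetic in this hunk means the daemon renews once a third of the lease has elapsed and re-arms itself two thirds of a lease ahead, so the lease is refreshed well before it can expire. A worked example assuming a 90 s lease and HZ=100 (ignoring jiffies wraparound, which the kernel's time_after() handles):

#include <stdio.h>

int main(void)
{
	long hz = 100;			/* assumed tick rate */
	long lease = 90 * hz;		/* clp->cl_lease_time for a 90 s lease */
	long last = 0;			/* jiffies at the last renewal */
	long now = 35 * hz;		/* 35 s later */
	long timeout = (2 * lease) / 3 + last - now;	/* 25 s */

	if (now > last + lease / 3)		/* 35 s > 30 s: renew */
		timeout = (2 * lease) / 3;	/* next check in 60 s */
	printf("next check in %ld s\n", timeout / hz);
	return 0;
}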
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0298e909559f..1434080aefeb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -60,7 +60,7 @@ const nfs4_stateid zero_stateid;
60 60
61static LIST_HEAD(nfs4_clientid_list); 61static LIST_HEAD(nfs4_clientid_list);
62 62
63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) 63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 unsigned short port; 65 unsigned short port;
66 int status; 66 int status;
@@ -77,7 +77,7 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
77 return status; 77 return status;
78} 78}
79 79
80static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) 80struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
81{ 81{
82 struct rpc_cred *cred = NULL; 82 struct rpc_cred *cred = NULL;
83 83
@@ -114,17 +114,21 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
114 return cred; 114 return cred;
115} 115}
116 116
117static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) 117#if defined(CONFIG_NFS_V4_1)
118
119struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
118{ 120{
119 struct rpc_cred *cred; 121 struct rpc_cred *cred;
120 122
121 spin_lock(&clp->cl_lock); 123 spin_lock(&clp->cl_lock);
122 cred = nfs4_get_renew_cred_locked(clp); 124 cred = nfs4_get_machine_cred_locked(clp);
123 spin_unlock(&clp->cl_lock); 125 spin_unlock(&clp->cl_lock);
124 return cred; 126 return cred;
125} 127}
126 128
127static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 129#endif /* CONFIG_NFS_V4_1 */
130
131struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
128{ 132{
129 struct nfs4_state_owner *sp; 133 struct nfs4_state_owner *sp;
130 struct rb_node *pos; 134 struct rb_node *pos;
@@ -549,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
549 INIT_LIST_HEAD(&lsp->ls_sequence.list); 553 INIT_LIST_HEAD(&lsp->ls_sequence.list);
550 lsp->ls_seqid.sequence = &lsp->ls_sequence; 554 lsp->ls_seqid.sequence = &lsp->ls_sequence;
551 atomic_set(&lsp->ls_count, 1); 555 atomic_set(&lsp->ls_count, 1);
556 lsp->ls_state = state;
552 lsp->ls_owner = fl_owner; 557 lsp->ls_owner = fl_owner;
553 spin_lock(&clp->cl_lock); 558 spin_lock(&clp->cl_lock);
554 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 559 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
@@ -583,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
583 if (lsp != NULL) 588 if (lsp != NULL)
584 break; 589 break;
585 if (new != NULL) { 590 if (new != NULL) {
586 new->ls_state = state;
587 list_add(&new->ls_locks, &state->lock_states); 591 list_add(&new->ls_locks, &state->lock_states);
588 set_bit(LK_STATE_IN_USE, &state->flags); 592 set_bit(LK_STATE_IN_USE, &state->flags);
589 lsp = new; 593 lsp = new;
@@ -738,12 +742,14 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
738 742
739void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) 743void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
740{ 744{
741 if (status == -NFS4ERR_BAD_SEQID) { 745 struct nfs4_state_owner *sp = container_of(seqid->sequence,
742 struct nfs4_state_owner *sp = container_of(seqid->sequence, 746 struct nfs4_state_owner, so_seqid);
743 struct nfs4_state_owner, so_seqid); 747 struct nfs_server *server = sp->so_server;
748
749 if (status == -NFS4ERR_BAD_SEQID)
744 nfs4_drop_state_owner(sp); 750 nfs4_drop_state_owner(sp);
745 } 751 if (!nfs4_has_session(server->nfs_client))
746 nfs_increment_seqid(status, seqid); 752 nfs_increment_seqid(status, seqid);
747} 753}
748 754
749/* 755/*
@@ -847,32 +853,45 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
847 struct file_lock *fl; 853 struct file_lock *fl;
848 int status = 0; 854 int status = 0;
849 855
856 if (inode->i_flock == NULL)
857 return 0;
858
859 /* Guard against delegation returns and new lock/unlock calls */
850 down_write(&nfsi->rwsem); 860 down_write(&nfsi->rwsem);
861 /* Protect inode->i_flock using the BKL */
862 lock_kernel();
851 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 863 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
852 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 864 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
853 continue; 865 continue;
854 if (nfs_file_open_context(fl->fl_file)->state != state) 866 if (nfs_file_open_context(fl->fl_file)->state != state)
855 continue; 867 continue;
868 unlock_kernel();
856 status = ops->recover_lock(state, fl); 869 status = ops->recover_lock(state, fl);
857 if (status >= 0)
858 continue;
859 switch (status) { 870 switch (status) {
871 case 0:
872 break;
873 case -ESTALE:
874 case -NFS4ERR_ADMIN_REVOKED:
875 case -NFS4ERR_STALE_STATEID:
876 case -NFS4ERR_BAD_STATEID:
877 case -NFS4ERR_EXPIRED:
878 case -NFS4ERR_NO_GRACE:
879 case -NFS4ERR_STALE_CLIENTID:
880 goto out;
860 default: 881 default:
861 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 882 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
862 __func__, status); 883 __func__, status);
863 case -NFS4ERR_EXPIRED: 884 case -ENOMEM:
864 case -NFS4ERR_NO_GRACE: 885 case -NFS4ERR_DENIED:
865 case -NFS4ERR_RECLAIM_BAD: 886 case -NFS4ERR_RECLAIM_BAD:
866 case -NFS4ERR_RECLAIM_CONFLICT: 887 case -NFS4ERR_RECLAIM_CONFLICT:
867 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 888 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
868 break; 889 status = 0;
869 case -NFS4ERR_STALE_CLIENTID:
870 goto out_err;
871 } 890 }
891 lock_kernel();
872 } 892 }
873 up_write(&nfsi->rwsem); 893 unlock_kernel();
874 return 0; 894out:
875out_err:
876 up_write(&nfsi->rwsem); 895 up_write(&nfsi->rwsem);
877 return status; 896 return status;
878} 897}
@@ -918,6 +937,7 @@ restart:
918 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 937 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
919 __func__, status); 938 __func__, status);
920 case -ENOENT: 939 case -ENOENT:
940 case -ENOMEM:
921 case -ESTALE: 941 case -ESTALE:
922 /* 942 /*
923 * Open state on this file cannot be recovered 943 * Open state on this file cannot be recovered
@@ -928,6 +948,9 @@ restart:
928 /* Mark the file as being 'closed' */ 948 /* Mark the file as being 'closed' */
929 state->state = 0; 949 state->state = 0;
930 break; 950 break;
951 case -NFS4ERR_ADMIN_REVOKED:
952 case -NFS4ERR_STALE_STATEID:
953 case -NFS4ERR_BAD_STATEID:
931 case -NFS4ERR_RECLAIM_BAD: 954 case -NFS4ERR_RECLAIM_BAD:
932 case -NFS4ERR_RECLAIM_CONFLICT: 955 case -NFS4ERR_RECLAIM_CONFLICT:
933 nfs4_state_mark_reclaim_nograce(sp->so_client, state); 956 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
@@ -1042,6 +1065,14 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1042 case -NFS4ERR_EXPIRED: 1065 case -NFS4ERR_EXPIRED:
1043 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1066 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1044 nfs4_state_start_reclaim_nograce(clp); 1067 nfs4_state_start_reclaim_nograce(clp);
1068 case -NFS4ERR_BADSESSION:
1069 case -NFS4ERR_BADSLOT:
1070 case -NFS4ERR_BAD_HIGH_SLOT:
1071 case -NFS4ERR_DEADSESSION:
1072 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1073 case -NFS4ERR_SEQ_FALSE_RETRY:
1074 case -NFS4ERR_SEQ_MISORDERED:
1075 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
1045 } 1076 }
1046} 1077}
1047 1078
@@ -1075,18 +1106,22 @@ restart:
1075static int nfs4_check_lease(struct nfs_client *clp) 1106static int nfs4_check_lease(struct nfs_client *clp)
1076{ 1107{
1077 struct rpc_cred *cred; 1108 struct rpc_cred *cred;
1109 struct nfs4_state_maintenance_ops *ops =
1110 nfs4_state_renewal_ops[clp->cl_minorversion];
1078 int status = -NFS4ERR_EXPIRED; 1111 int status = -NFS4ERR_EXPIRED;
1079 1112
1080 /* Is the client already known to have an expired lease? */ 1113 /* Is the client already known to have an expired lease? */
1081 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 1114 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1082 return 0; 1115 return 0;
1083 cred = nfs4_get_renew_cred(clp); 1116 spin_lock(&clp->cl_lock);
1117 cred = ops->get_state_renewal_cred_locked(clp);
1118 spin_unlock(&clp->cl_lock);
1084 if (cred == NULL) { 1119 if (cred == NULL) {
1085 cred = nfs4_get_setclientid_cred(clp); 1120 cred = nfs4_get_setclientid_cred(clp);
1086 if (cred == NULL) 1121 if (cred == NULL)
1087 goto out; 1122 goto out;
1088 } 1123 }
1089 status = nfs4_proc_renew(clp, cred); 1124 status = ops->renew_lease(clp, cred);
1090 put_rpccred(cred); 1125 put_rpccred(cred);
1091out: 1126out:
1092 nfs4_recovery_handle_error(clp, status); 1127 nfs4_recovery_handle_error(clp, status);
@@ -1096,21 +1131,98 @@ out:
1096static int nfs4_reclaim_lease(struct nfs_client *clp) 1131static int nfs4_reclaim_lease(struct nfs_client *clp)
1097{ 1132{
1098 struct rpc_cred *cred; 1133 struct rpc_cred *cred;
1134 struct nfs4_state_recovery_ops *ops =
1135 nfs4_reboot_recovery_ops[clp->cl_minorversion];
1099 int status = -ENOENT; 1136 int status = -ENOENT;
1100 1137
1101 cred = nfs4_get_setclientid_cred(clp); 1138 cred = ops->get_clid_cred(clp);
1102 if (cred != NULL) { 1139 if (cred != NULL) {
1103 status = nfs4_init_client(clp, cred); 1140 status = ops->establish_clid(clp, cred);
1104 put_rpccred(cred); 1141 put_rpccred(cred);
1105 /* Handle case where the user hasn't set up machine creds */ 1142 /* Handle case where the user hasn't set up machine creds */
1106 if (status == -EACCES && cred == clp->cl_machine_cred) { 1143 if (status == -EACCES && cred == clp->cl_machine_cred) {
1107 nfs4_clear_machine_cred(clp); 1144 nfs4_clear_machine_cred(clp);
1108 status = -EAGAIN; 1145 status = -EAGAIN;
1109 } 1146 }
1147 if (status == -NFS4ERR_MINOR_VERS_MISMATCH)
1148 status = -EPROTONOSUPPORT;
1149 }
1150 return status;
1151}
1152
1153#ifdef CONFIG_NFS_V4_1
1154static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err)
1155{
1156 switch (err) {
1157 case -NFS4ERR_STALE_CLIENTID:
1158 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1159 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
1160 }
1161}
1162
1163static int nfs4_reset_session(struct nfs_client *clp)
1164{
1165 int status;
1166
1167 status = nfs4_proc_destroy_session(clp->cl_session);
1168 if (status && status != -NFS4ERR_BADSESSION &&
1169 status != -NFS4ERR_DEADSESSION) {
1170 nfs4_session_recovery_handle_error(clp, status);
1171 goto out;
1110 } 1172 }
1173
1174 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
1175 status = nfs4_proc_create_session(clp, 1);
1176 if (status)
1177 nfs4_session_recovery_handle_error(clp, status);
 1178 /* fall through */
1179out:
1180 /* Wake up the next rpc task even on error */
1181 rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq);
1111 return status; 1182 return status;
1112} 1183}
1113 1184
1185static int nfs4_initialize_session(struct nfs_client *clp)
1186{
1187 int status;
1188
1189 status = nfs4_proc_create_session(clp, 0);
1190 if (!status) {
1191 nfs_mark_client_ready(clp, NFS_CS_READY);
1192 } else if (status == -NFS4ERR_STALE_CLIENTID) {
1193 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1194 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
1195 } else {
1196 nfs_mark_client_ready(clp, status);
1197 }
1198 return status;
1199}
1200#else /* CONFIG_NFS_V4_1 */
1201static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
1202static int nfs4_initialize_session(struct nfs_client *clp) { return 0; }
1203#endif /* CONFIG_NFS_V4_1 */
1204
1205/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
1206 * on EXCHANGE_ID for v4.1
1207 */
1208static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1209{
1210 if (nfs4_has_session(clp)) {
1211 switch (status) {
1212 case -NFS4ERR_DELAY:
1213 case -NFS4ERR_CLID_INUSE:
1214 case -EAGAIN:
1215 break;
1216
 1217 case -NFS4ERR_NOT_SAME: /* FIXME: implement recovery
 1218 * in nfs4_exchange_id */
1219 default:
1220 return;
1221 }
1222 }
1223 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1224}
1225
1114static void nfs4_state_manager(struct nfs_client *clp) 1226static void nfs4_state_manager(struct nfs_client *clp)
1115{ 1227{
1116 int status = 0; 1228 int status = 0;
@@ -1121,9 +1233,12 @@ static void nfs4_state_manager(struct nfs_client *clp)
1121 /* We're going to have to re-establish a clientid */ 1233 /* We're going to have to re-establish a clientid */
1122 status = nfs4_reclaim_lease(clp); 1234 status = nfs4_reclaim_lease(clp);
1123 if (status) { 1235 if (status) {
1124 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1236 nfs4_set_lease_expired(clp, status);
1125 if (status == -EAGAIN) 1237 if (status == -EAGAIN)
1126 continue; 1238 continue;
1239 if (clp->cl_cons_state ==
1240 NFS_CS_SESSION_INITING)
1241 nfs_mark_client_ready(clp, status);
1127 goto out_error; 1242 goto out_error;
1128 } 1243 }
1129 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1244 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
@@ -1134,25 +1249,44 @@ static void nfs4_state_manager(struct nfs_client *clp)
1134 if (status != 0) 1249 if (status != 0)
1135 continue; 1250 continue;
1136 } 1251 }
1137 1252 /* Initialize or reset the session */
1253 if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
1254 && nfs4_has_session(clp)) {
1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
1256 status = nfs4_initialize_session(clp);
1257 else
1258 status = nfs4_reset_session(clp);
1259 if (status) {
1260 if (status == -NFS4ERR_STALE_CLIENTID)
1261 continue;
1262 goto out_error;
1263 }
1264 }
1138 /* First recover reboot state... */ 1265 /* First recover reboot state... */
1139 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { 1266 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1140 status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops); 1267 status = nfs4_do_reclaim(clp,
1268 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1141 if (status == -NFS4ERR_STALE_CLIENTID) 1269 if (status == -NFS4ERR_STALE_CLIENTID)
1142 continue; 1270 continue;
1271 if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
1272 continue;
1143 nfs4_state_end_reclaim_reboot(clp); 1273 nfs4_state_end_reclaim_reboot(clp);
1144 continue; 1274 continue;
1145 } 1275 }
1146 1276
1147 /* Now recover expired state... */ 1277 /* Now recover expired state... */
1148 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1278 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1149 status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops); 1279 status = nfs4_do_reclaim(clp,
1280 nfs4_nograce_recovery_ops[clp->cl_minorversion]);
1150 if (status < 0) { 1281 if (status < 0) {
1151 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); 1282 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1152 if (status == -NFS4ERR_STALE_CLIENTID) 1283 if (status == -NFS4ERR_STALE_CLIENTID)
1153 continue; 1284 continue;
1154 if (status == -NFS4ERR_EXPIRED) 1285 if (status == -NFS4ERR_EXPIRED)
1155 continue; 1286 continue;
1287 if (test_bit(NFS4CLNT_SESSION_SETUP,
1288 &clp->cl_state))
1289 continue;
1156 goto out_error; 1290 goto out_error;
1157 } else 1291 } else
1158 nfs4_state_end_reclaim_nograce(clp); 1292 nfs4_state_end_reclaim_nograce(clp);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1690f0e44b91..617273e7d47f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -192,12 +192,16 @@ static int nfs4_stat_to_errno(int);
192 decode_verifier_maxsz) 192 decode_verifier_maxsz)
193#define encode_remove_maxsz (op_encode_hdr_maxsz + \ 193#define encode_remove_maxsz (op_encode_hdr_maxsz + \
194 nfs4_name_maxsz) 194 nfs4_name_maxsz)
195#define decode_remove_maxsz (op_decode_hdr_maxsz + \
196 decode_change_info_maxsz)
195#define encode_rename_maxsz (op_encode_hdr_maxsz + \ 197#define encode_rename_maxsz (op_encode_hdr_maxsz + \
196 2 * nfs4_name_maxsz) 198 2 * nfs4_name_maxsz)
197#define decode_rename_maxsz (op_decode_hdr_maxsz + 5 + 5) 199#define decode_rename_maxsz (op_decode_hdr_maxsz + \
200 decode_change_info_maxsz + \
201 decode_change_info_maxsz)
198#define encode_link_maxsz (op_encode_hdr_maxsz + \ 202#define encode_link_maxsz (op_encode_hdr_maxsz + \
199 nfs4_name_maxsz) 203 nfs4_name_maxsz)
200#define decode_link_maxsz (op_decode_hdr_maxsz + 5) 204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
201#define encode_lock_maxsz (op_encode_hdr_maxsz + \ 205#define encode_lock_maxsz (op_encode_hdr_maxsz + \
202 7 + \ 206 7 + \
203 1 + encode_stateid_maxsz + 8) 207 1 + encode_stateid_maxsz + 8)
@@ -240,43 +244,115 @@ static int nfs4_stat_to_errno(int);
240 (encode_getattr_maxsz) 244 (encode_getattr_maxsz)
241#define decode_fs_locations_maxsz \ 245#define decode_fs_locations_maxsz \
242 (0) 246 (0)
247
248#if defined(CONFIG_NFS_V4_1)
249#define NFS4_MAX_MACHINE_NAME_LEN (64)
250
251#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
252 encode_verifier_maxsz + \
253 1 /* co_ownerid.len */ + \
254 XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
255 1 /* flags */ + \
256 1 /* spa_how */ + \
257 0 /* SP4_NONE (for now) */ + \
 258 1 /* zero implementation id array */)
259#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
260 2 /* eir_clientid */ + \
261 1 /* eir_sequenceid */ + \
262 1 /* eir_flags */ + \
263 1 /* spr_how */ + \
264 0 /* SP4_NONE (for now) */ + \
265 2 /* eir_server_owner.so_minor_id */ + \
266 /* eir_server_owner.so_major_id<> */ \
267 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
268 /* eir_server_scope<> */ \
269 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
270 1 /* eir_server_impl_id array length */ + \
271 0 /* ignored eir_server_impl_id contents */)
272#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
273#define decode_channel_attrs_maxsz (6 + \
274 1 /* ca_rdma_ird.len */ + \
275 1 /* ca_rdma_ird */)
276#define encode_create_session_maxsz (op_encode_hdr_maxsz + \
277 2 /* csa_clientid */ + \
278 1 /* csa_sequence */ + \
279 1 /* csa_flags */ + \
280 encode_channel_attrs_maxsz + \
281 encode_channel_attrs_maxsz + \
282 1 /* csa_cb_program */ + \
283 1 /* csa_sec_parms.len (1) */ + \
284 1 /* cb_secflavor (AUTH_SYS) */ + \
285 1 /* stamp */ + \
286 1 /* machinename.len */ + \
287 XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
288 1 /* uid */ + \
289 1 /* gid */ + \
290 1 /* gids.len (0) */)
291#define decode_create_session_maxsz (op_decode_hdr_maxsz + \
292 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
293 1 /* csr_sequence */ + \
294 1 /* csr_flags */ + \
295 decode_channel_attrs_maxsz + \
296 decode_channel_attrs_maxsz)
297#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4)
298#define decode_destroy_session_maxsz (op_decode_hdr_maxsz)
299#define encode_sequence_maxsz (op_encode_hdr_maxsz + \
300 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
301#define decode_sequence_maxsz (op_decode_hdr_maxsz + \
302 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
303#else /* CONFIG_NFS_V4_1 */
304#define encode_sequence_maxsz 0
305#define decode_sequence_maxsz 0
306#endif /* CONFIG_NFS_V4_1 */
307
243#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ 308#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
244#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ 309#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */
245#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ 310#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \
311 encode_sequence_maxsz + \
246 encode_putfh_maxsz + \ 312 encode_putfh_maxsz + \
247 encode_read_maxsz) 313 encode_read_maxsz)
248#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ 314#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \
315 decode_sequence_maxsz + \
249 decode_putfh_maxsz + \ 316 decode_putfh_maxsz + \
250 decode_read_maxsz) 317 decode_read_maxsz)
251#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \ 318#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \
319 encode_sequence_maxsz + \
252 encode_putfh_maxsz + \ 320 encode_putfh_maxsz + \
253 encode_readlink_maxsz) 321 encode_readlink_maxsz)
254#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \ 322#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \
323 decode_sequence_maxsz + \
255 decode_putfh_maxsz + \ 324 decode_putfh_maxsz + \
256 decode_readlink_maxsz) 325 decode_readlink_maxsz)
257#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \ 326#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \
327 encode_sequence_maxsz + \
258 encode_putfh_maxsz + \ 328 encode_putfh_maxsz + \
259 encode_readdir_maxsz) 329 encode_readdir_maxsz)
260#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \ 330#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \
331 decode_sequence_maxsz + \
261 decode_putfh_maxsz + \ 332 decode_putfh_maxsz + \
262 decode_readdir_maxsz) 333 decode_readdir_maxsz)
263#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ 334#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \
335 encode_sequence_maxsz + \
264 encode_putfh_maxsz + \ 336 encode_putfh_maxsz + \
265 encode_write_maxsz + \ 337 encode_write_maxsz + \
266 encode_getattr_maxsz) 338 encode_getattr_maxsz)
267#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ 339#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \
340 decode_sequence_maxsz + \
268 decode_putfh_maxsz + \ 341 decode_putfh_maxsz + \
269 decode_write_maxsz + \ 342 decode_write_maxsz + \
270 decode_getattr_maxsz) 343 decode_getattr_maxsz)
271#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ 344#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \
345 encode_sequence_maxsz + \
272 encode_putfh_maxsz + \ 346 encode_putfh_maxsz + \
273 encode_commit_maxsz + \ 347 encode_commit_maxsz + \
274 encode_getattr_maxsz) 348 encode_getattr_maxsz)
275#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ 349#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \
350 decode_sequence_maxsz + \
276 decode_putfh_maxsz + \ 351 decode_putfh_maxsz + \
277 decode_commit_maxsz + \ 352 decode_commit_maxsz + \
278 decode_getattr_maxsz) 353 decode_getattr_maxsz)
279#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ 354#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
355 encode_sequence_maxsz + \
280 encode_putfh_maxsz + \ 356 encode_putfh_maxsz + \
281 encode_savefh_maxsz + \ 357 encode_savefh_maxsz + \
282 encode_open_maxsz + \ 358 encode_open_maxsz + \
@@ -285,6 +361,7 @@ static int nfs4_stat_to_errno(int);
285 encode_restorefh_maxsz + \ 361 encode_restorefh_maxsz + \
286 encode_getattr_maxsz) 362 encode_getattr_maxsz)
287#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ 363#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \
364 decode_sequence_maxsz + \
288 decode_putfh_maxsz + \ 365 decode_putfh_maxsz + \
289 decode_savefh_maxsz + \ 366 decode_savefh_maxsz + \
290 decode_open_maxsz + \ 367 decode_open_maxsz + \
@@ -301,43 +378,53 @@ static int nfs4_stat_to_errno(int);
301 decode_putfh_maxsz + \ 378 decode_putfh_maxsz + \
302 decode_open_confirm_maxsz) 379 decode_open_confirm_maxsz)
303#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ 380#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
381 encode_sequence_maxsz + \
304 encode_putfh_maxsz + \ 382 encode_putfh_maxsz + \
305 encode_open_maxsz + \ 383 encode_open_maxsz + \
306 encode_getattr_maxsz) 384 encode_getattr_maxsz)
307#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ 385#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
386 decode_sequence_maxsz + \
308 decode_putfh_maxsz + \ 387 decode_putfh_maxsz + \
309 decode_open_maxsz + \ 388 decode_open_maxsz + \
310 decode_getattr_maxsz) 389 decode_getattr_maxsz)
311#define NFS4_enc_open_downgrade_sz \ 390#define NFS4_enc_open_downgrade_sz \
312 (compound_encode_hdr_maxsz + \ 391 (compound_encode_hdr_maxsz + \
392 encode_sequence_maxsz + \
313 encode_putfh_maxsz + \ 393 encode_putfh_maxsz + \
314 encode_open_downgrade_maxsz + \ 394 encode_open_downgrade_maxsz + \
315 encode_getattr_maxsz) 395 encode_getattr_maxsz)
316#define NFS4_dec_open_downgrade_sz \ 396#define NFS4_dec_open_downgrade_sz \
317 (compound_decode_hdr_maxsz + \ 397 (compound_decode_hdr_maxsz + \
398 decode_sequence_maxsz + \
318 decode_putfh_maxsz + \ 399 decode_putfh_maxsz + \
319 decode_open_downgrade_maxsz + \ 400 decode_open_downgrade_maxsz + \
320 decode_getattr_maxsz) 401 decode_getattr_maxsz)
321#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ 402#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
403 encode_sequence_maxsz + \
322 encode_putfh_maxsz + \ 404 encode_putfh_maxsz + \
323 encode_close_maxsz + \ 405 encode_close_maxsz + \
324 encode_getattr_maxsz) 406 encode_getattr_maxsz)
325#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ 407#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
408 decode_sequence_maxsz + \
326 decode_putfh_maxsz + \ 409 decode_putfh_maxsz + \
327 decode_close_maxsz + \ 410 decode_close_maxsz + \
328 decode_getattr_maxsz) 411 decode_getattr_maxsz)
329#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ 412#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
413 encode_sequence_maxsz + \
330 encode_putfh_maxsz + \ 414 encode_putfh_maxsz + \
331 encode_setattr_maxsz + \ 415 encode_setattr_maxsz + \
332 encode_getattr_maxsz) 416 encode_getattr_maxsz)
333#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ 417#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \
418 decode_sequence_maxsz + \
334 decode_putfh_maxsz + \ 419 decode_putfh_maxsz + \
335 decode_setattr_maxsz + \ 420 decode_setattr_maxsz + \
336 decode_getattr_maxsz) 421 decode_getattr_maxsz)
337#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ 422#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \
423 encode_sequence_maxsz + \
338 encode_putfh_maxsz + \ 424 encode_putfh_maxsz + \
339 encode_fsinfo_maxsz) 425 encode_fsinfo_maxsz)
340#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ 426#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \
427 decode_sequence_maxsz + \
341 decode_putfh_maxsz + \ 428 decode_putfh_maxsz + \
342 decode_fsinfo_maxsz) 429 decode_fsinfo_maxsz)
343#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ 430#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \
@@ -359,64 +446,81 @@ static int nfs4_stat_to_errno(int);
359 decode_putrootfh_maxsz + \ 446 decode_putrootfh_maxsz + \
360 decode_fsinfo_maxsz) 447 decode_fsinfo_maxsz)
361#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ 448#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \
449 encode_sequence_maxsz + \
362 encode_putfh_maxsz + \ 450 encode_putfh_maxsz + \
363 encode_lock_maxsz) 451 encode_lock_maxsz)
364#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ 452#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \
453 decode_sequence_maxsz + \
365 decode_putfh_maxsz + \ 454 decode_putfh_maxsz + \
366 decode_lock_maxsz) 455 decode_lock_maxsz)
367#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ 456#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \
457 encode_sequence_maxsz + \
368 encode_putfh_maxsz + \ 458 encode_putfh_maxsz + \
369 encode_lockt_maxsz) 459 encode_lockt_maxsz)
370#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \ 460#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \
461 decode_sequence_maxsz + \
371 decode_putfh_maxsz + \ 462 decode_putfh_maxsz + \
372 decode_lockt_maxsz) 463 decode_lockt_maxsz)
373#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ 464#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \
465 encode_sequence_maxsz + \
374 encode_putfh_maxsz + \ 466 encode_putfh_maxsz + \
375 encode_locku_maxsz) 467 encode_locku_maxsz)
376#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ 468#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \
469 decode_sequence_maxsz + \
377 decode_putfh_maxsz + \ 470 decode_putfh_maxsz + \
378 decode_locku_maxsz) 471 decode_locku_maxsz)
379#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 472#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
473 encode_sequence_maxsz + \
380 encode_putfh_maxsz + \ 474 encode_putfh_maxsz + \
381 encode_access_maxsz + \ 475 encode_access_maxsz + \
382 encode_getattr_maxsz) 476 encode_getattr_maxsz)
383#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ 477#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \
478 decode_sequence_maxsz + \
384 decode_putfh_maxsz + \ 479 decode_putfh_maxsz + \
385 decode_access_maxsz + \ 480 decode_access_maxsz + \
386 decode_getattr_maxsz) 481 decode_getattr_maxsz)
387#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ 482#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \
483 encode_sequence_maxsz + \
388 encode_putfh_maxsz + \ 484 encode_putfh_maxsz + \
389 encode_getattr_maxsz) 485 encode_getattr_maxsz)
390#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ 486#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \
487 decode_sequence_maxsz + \
391 decode_putfh_maxsz + \ 488 decode_putfh_maxsz + \
392 decode_getattr_maxsz) 489 decode_getattr_maxsz)
393#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ 490#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \
491 encode_sequence_maxsz + \
394 encode_putfh_maxsz + \ 492 encode_putfh_maxsz + \
395 encode_lookup_maxsz + \ 493 encode_lookup_maxsz + \
396 encode_getattr_maxsz + \ 494 encode_getattr_maxsz + \
397 encode_getfh_maxsz) 495 encode_getfh_maxsz)
398#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ 496#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \
497 decode_sequence_maxsz + \
399 decode_putfh_maxsz + \ 498 decode_putfh_maxsz + \
400 decode_lookup_maxsz + \ 499 decode_lookup_maxsz + \
401 decode_getattr_maxsz + \ 500 decode_getattr_maxsz + \
402 decode_getfh_maxsz) 501 decode_getfh_maxsz)
403#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ 502#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
503 encode_sequence_maxsz + \
404 encode_putrootfh_maxsz + \ 504 encode_putrootfh_maxsz + \
405 encode_getattr_maxsz + \ 505 encode_getattr_maxsz + \
406 encode_getfh_maxsz) 506 encode_getfh_maxsz)
407#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \ 507#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \
508 decode_sequence_maxsz + \
408 decode_putrootfh_maxsz + \ 509 decode_putrootfh_maxsz + \
409 decode_getattr_maxsz + \ 510 decode_getattr_maxsz + \
410 decode_getfh_maxsz) 511 decode_getfh_maxsz)
411#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ 512#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \
513 encode_sequence_maxsz + \
412 encode_putfh_maxsz + \ 514 encode_putfh_maxsz + \
413 encode_remove_maxsz + \ 515 encode_remove_maxsz + \
414 encode_getattr_maxsz) 516 encode_getattr_maxsz)
415#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ 517#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \
518 decode_sequence_maxsz + \
416 decode_putfh_maxsz + \ 519 decode_putfh_maxsz + \
417 op_decode_hdr_maxsz + 5 + \ 520 decode_remove_maxsz + \
418 decode_getattr_maxsz) 521 decode_getattr_maxsz)
419#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ 522#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \
523 encode_sequence_maxsz + \
420 encode_putfh_maxsz + \ 524 encode_putfh_maxsz + \
421 encode_savefh_maxsz + \ 525 encode_savefh_maxsz + \
422 encode_putfh_maxsz + \ 526 encode_putfh_maxsz + \
@@ -425,6 +529,7 @@ static int nfs4_stat_to_errno(int);
425 encode_restorefh_maxsz + \ 529 encode_restorefh_maxsz + \
426 encode_getattr_maxsz) 530 encode_getattr_maxsz)
427#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ 531#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \
532 decode_sequence_maxsz + \
428 decode_putfh_maxsz + \ 533 decode_putfh_maxsz + \
429 decode_savefh_maxsz + \ 534 decode_savefh_maxsz + \
430 decode_putfh_maxsz + \ 535 decode_putfh_maxsz + \
@@ -433,6 +538,7 @@ static int nfs4_stat_to_errno(int);
433 decode_restorefh_maxsz + \ 538 decode_restorefh_maxsz + \
434 decode_getattr_maxsz) 539 decode_getattr_maxsz)
435#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ 540#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \
541 encode_sequence_maxsz + \
436 encode_putfh_maxsz + \ 542 encode_putfh_maxsz + \
437 encode_savefh_maxsz + \ 543 encode_savefh_maxsz + \
438 encode_putfh_maxsz + \ 544 encode_putfh_maxsz + \
@@ -441,6 +547,7 @@ static int nfs4_stat_to_errno(int);
441 encode_restorefh_maxsz + \ 547 encode_restorefh_maxsz + \
442 decode_getattr_maxsz) 548 decode_getattr_maxsz)
443#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ 549#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \
550 decode_sequence_maxsz + \
444 decode_putfh_maxsz + \ 551 decode_putfh_maxsz + \
445 decode_savefh_maxsz + \ 552 decode_savefh_maxsz + \
446 decode_putfh_maxsz + \ 553 decode_putfh_maxsz + \
@@ -449,16 +556,19 @@ static int nfs4_stat_to_errno(int);
449 decode_restorefh_maxsz + \ 556 decode_restorefh_maxsz + \
450 decode_getattr_maxsz) 557 decode_getattr_maxsz)
451#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ 558#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
559 encode_sequence_maxsz + \
452 encode_putfh_maxsz + \ 560 encode_putfh_maxsz + \
453 encode_symlink_maxsz + \ 561 encode_symlink_maxsz + \
454 encode_getattr_maxsz + \ 562 encode_getattr_maxsz + \
455 encode_getfh_maxsz) 563 encode_getfh_maxsz)
456#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ 564#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \
565 decode_sequence_maxsz + \
457 decode_putfh_maxsz + \ 566 decode_putfh_maxsz + \
458 decode_symlink_maxsz + \ 567 decode_symlink_maxsz + \
459 decode_getattr_maxsz + \ 568 decode_getattr_maxsz + \
460 decode_getfh_maxsz) 569 decode_getfh_maxsz)
461#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ 570#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
571 encode_sequence_maxsz + \
462 encode_putfh_maxsz + \ 572 encode_putfh_maxsz + \
463 encode_savefh_maxsz + \ 573 encode_savefh_maxsz + \
464 encode_create_maxsz + \ 574 encode_create_maxsz + \
@@ -467,6 +577,7 @@ static int nfs4_stat_to_errno(int);
467 encode_restorefh_maxsz + \ 577 encode_restorefh_maxsz + \
468 encode_getattr_maxsz) 578 encode_getattr_maxsz)
469#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ 579#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \
580 decode_sequence_maxsz + \
470 decode_putfh_maxsz + \ 581 decode_putfh_maxsz + \
471 decode_savefh_maxsz + \ 582 decode_savefh_maxsz + \
472 decode_create_maxsz + \ 583 decode_create_maxsz + \
@@ -475,52 +586,98 @@ static int nfs4_stat_to_errno(int);
475 decode_restorefh_maxsz + \ 586 decode_restorefh_maxsz + \
476 decode_getattr_maxsz) 587 decode_getattr_maxsz)
477#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ 588#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \
589 encode_sequence_maxsz + \
478 encode_putfh_maxsz + \ 590 encode_putfh_maxsz + \
479 encode_getattr_maxsz) 591 encode_getattr_maxsz)
480#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \ 592#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \
593 decode_sequence_maxsz + \
481 decode_putfh_maxsz + \ 594 decode_putfh_maxsz + \
482 decode_getattr_maxsz) 595 decode_getattr_maxsz)
483#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \ 596#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \
597 encode_sequence_maxsz + \
484 encode_putfh_maxsz + \ 598 encode_putfh_maxsz + \
485 encode_statfs_maxsz) 599 encode_statfs_maxsz)
486#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \ 600#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \
601 decode_sequence_maxsz + \
487 decode_putfh_maxsz + \ 602 decode_putfh_maxsz + \
488 decode_statfs_maxsz) 603 decode_statfs_maxsz)
489#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ 604#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \
605 encode_sequence_maxsz + \
490 encode_putfh_maxsz + \ 606 encode_putfh_maxsz + \
491 encode_getattr_maxsz) 607 encode_getattr_maxsz)
492#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ 608#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
609 decode_sequence_maxsz + \
493 decode_putfh_maxsz + \ 610 decode_putfh_maxsz + \
494 decode_getattr_maxsz) 611 decode_getattr_maxsz)
495#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ 612#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
613 encode_sequence_maxsz + \
496 encode_putfh_maxsz + \ 614 encode_putfh_maxsz + \
497 encode_delegreturn_maxsz + \ 615 encode_delegreturn_maxsz + \
498 encode_getattr_maxsz) 616 encode_getattr_maxsz)
499#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ 617#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
618 decode_sequence_maxsz + \
500 decode_delegreturn_maxsz + \ 619 decode_delegreturn_maxsz + \
501 decode_getattr_maxsz) 620 decode_getattr_maxsz)
502#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ 621#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
622 encode_sequence_maxsz + \
503 encode_putfh_maxsz + \ 623 encode_putfh_maxsz + \
504 encode_getacl_maxsz) 624 encode_getacl_maxsz)
505#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ 625#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \
626 decode_sequence_maxsz + \
506 decode_putfh_maxsz + \ 627 decode_putfh_maxsz + \
507 decode_getacl_maxsz) 628 decode_getacl_maxsz)
508#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ 629#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \
630 encode_sequence_maxsz + \
509 encode_putfh_maxsz + \ 631 encode_putfh_maxsz + \
510 encode_setacl_maxsz) 632 encode_setacl_maxsz)
511#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ 633#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
634 decode_sequence_maxsz + \
512 decode_putfh_maxsz + \ 635 decode_putfh_maxsz + \
513 decode_setacl_maxsz) 636 decode_setacl_maxsz)
514#define NFS4_enc_fs_locations_sz \ 637#define NFS4_enc_fs_locations_sz \
515 (compound_encode_hdr_maxsz + \ 638 (compound_encode_hdr_maxsz + \
639 encode_sequence_maxsz + \
516 encode_putfh_maxsz + \ 640 encode_putfh_maxsz + \
517 encode_lookup_maxsz + \ 641 encode_lookup_maxsz + \
518 encode_fs_locations_maxsz) 642 encode_fs_locations_maxsz)
519#define NFS4_dec_fs_locations_sz \ 643#define NFS4_dec_fs_locations_sz \
520 (compound_decode_hdr_maxsz + \ 644 (compound_decode_hdr_maxsz + \
645 decode_sequence_maxsz + \
521 decode_putfh_maxsz + \ 646 decode_putfh_maxsz + \
522 decode_lookup_maxsz + \ 647 decode_lookup_maxsz + \
523 decode_fs_locations_maxsz) 648 decode_fs_locations_maxsz)
649#if defined(CONFIG_NFS_V4_1)
650#define NFS4_enc_exchange_id_sz \
651 (compound_encode_hdr_maxsz + \
652 encode_exchange_id_maxsz)
653#define NFS4_dec_exchange_id_sz \
654 (compound_decode_hdr_maxsz + \
655 decode_exchange_id_maxsz)
656#define NFS4_enc_create_session_sz \
657 (compound_encode_hdr_maxsz + \
658 encode_create_session_maxsz)
659#define NFS4_dec_create_session_sz \
660 (compound_decode_hdr_maxsz + \
661 decode_create_session_maxsz)
662#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \
663 encode_destroy_session_maxsz)
664#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \
665 decode_destroy_session_maxsz)
666#define NFS4_enc_sequence_sz \
667 (compound_encode_hdr_maxsz + \
668 encode_sequence_maxsz)
669#define NFS4_dec_sequence_sz \
670 (compound_decode_hdr_maxsz + \
671 decode_sequence_maxsz)
672#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \
673 encode_sequence_maxsz + \
674 encode_putrootfh_maxsz + \
675 encode_fsinfo_maxsz)
676#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \
677 decode_sequence_maxsz + \
678 decode_putrootfh_maxsz + \
679 decode_fsinfo_maxsz)
680#endif /* CONFIG_NFS_V4_1 */
524 681
525static const umode_t nfs_type2fmt[] = { 682static const umode_t nfs_type2fmt[] = {
526 [NF4BAD] = 0, 683 [NF4BAD] = 0,
@@ -541,6 +698,8 @@ struct compound_hdr {
541 __be32 * nops_p; 698 __be32 * nops_p;
542 uint32_t taglen; 699 uint32_t taglen;
543 char * tag; 700 char * tag;
701 uint32_t replen; /* expected reply words */
702 u32 minorversion; /* 0 for v4.0; set from the session for v4.1 */
544}; 703};
545 704
546/* 705/*
@@ -576,22 +735,31 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
576 xdr_encode_opaque(p, str, len); 735 xdr_encode_opaque(p, str, len);
577} 736}
578 737
579static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 738static void encode_compound_hdr(struct xdr_stream *xdr,
739 struct rpc_rqst *req,
740 struct compound_hdr *hdr)
580{ 741{
581 __be32 *p; 742 __be32 *p;
743 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
744
745 /* initialize running count of expected words in the reply.
746 * NOTE: the replied tag SHOULD be the same as the one sent,
747 * but the server is not required (no MUST) to do so. */
748 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
582 749
583 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 750 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
584 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 751 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
585 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); 752 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
586 WRITE32(hdr->taglen); 753 WRITE32(hdr->taglen);
587 WRITEMEM(hdr->tag, hdr->taglen); 754 WRITEMEM(hdr->tag, hdr->taglen);
588 WRITE32(NFS4_MINOR_VERSION); 755 WRITE32(hdr->minorversion);
589 hdr->nops_p = p; 756 hdr->nops_p = p;
590 WRITE32(hdr->nops); 757 WRITE32(hdr->nops);
591} 758}
592 759
593static void encode_nops(struct compound_hdr *hdr) 760static void encode_nops(struct compound_hdr *hdr)
594{ 761{
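	/* Backpatch the final op count into the slot reserved by encode_compound_hdr(). */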
762 BUG_ON(hdr->nops > NFS4_MAX_OPS);
595 *hdr->nops_p = htonl(hdr->nops); 763 *hdr->nops_p = htonl(hdr->nops);
596} 764}
597 765
@@ -736,6 +904,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
736 WRITE32(OP_ACCESS); 904 WRITE32(OP_ACCESS);
737 WRITE32(access); 905 WRITE32(access);
738 hdr->nops++; 906 hdr->nops++;
907 hdr->replen += decode_access_maxsz;
739} 908}
740 909
741static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 910static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
@@ -747,6 +916,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
747 WRITE32(arg->seqid->sequence->counter); 916 WRITE32(arg->seqid->sequence->counter);
748 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 917 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
749 hdr->nops++; 918 hdr->nops++;
919 hdr->replen += decode_close_maxsz;
750} 920}
751 921
752static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 922static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
@@ -758,6 +928,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
758 WRITE64(args->offset); 928 WRITE64(args->offset);
759 WRITE32(args->count); 929 WRITE32(args->count);
760 hdr->nops++; 930 hdr->nops++;
931 hdr->replen += decode_commit_maxsz;
761} 932}
762 933
763static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) 934static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
@@ -789,6 +960,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
789 WRITE32(create->name->len); 960 WRITE32(create->name->len);
790 WRITEMEM(create->name->name, create->name->len); 961 WRITEMEM(create->name->name, create->name->len);
791 hdr->nops++; 962 hdr->nops++;
963 hdr->replen += decode_create_maxsz;
792 964
793 encode_attrs(xdr, create->attrs, create->server); 965 encode_attrs(xdr, create->attrs, create->server);
794} 966}
@@ -802,6 +974,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
802 WRITE32(1); 974 WRITE32(1);
803 WRITE32(bitmap); 975 WRITE32(bitmap);
804 hdr->nops++; 976 hdr->nops++;
977 hdr->replen += decode_getattr_maxsz;
805} 978}
806 979
807static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) 980static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
@@ -814,6 +987,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
814 WRITE32(bm0); 987 WRITE32(bm0);
815 WRITE32(bm1); 988 WRITE32(bm1);
816 hdr->nops++; 989 hdr->nops++;
990 hdr->replen += decode_getattr_maxsz;
817} 991}
818 992
819static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 993static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -841,6 +1015,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
841 RESERVE_SPACE(4); 1015 RESERVE_SPACE(4);
842 WRITE32(OP_GETFH); 1016 WRITE32(OP_GETFH);
843 hdr->nops++; 1017 hdr->nops++;
1018 hdr->replen += decode_getfh_maxsz;
844} 1019}
845 1020
846static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1021static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -852,6 +1027,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
852 WRITE32(name->len); 1027 WRITE32(name->len);
853 WRITEMEM(name->name, name->len); 1028 WRITEMEM(name->name, name->len);
854 hdr->nops++; 1029 hdr->nops++;
1030 hdr->replen += decode_link_maxsz;
855} 1031}
856 1032
857static inline int nfs4_lock_type(struct file_lock *fl, int block) 1033static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -899,6 +1075,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
899 WRITE32(args->lock_seqid->sequence->counter); 1075 WRITE32(args->lock_seqid->sequence->counter);
900 } 1076 }
901 hdr->nops++; 1077 hdr->nops++;
1078 hdr->replen += decode_lock_maxsz;
902} 1079}
903 1080
904static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) 1081static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
@@ -915,6 +1092,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
915 WRITEMEM("lock id:", 8); 1092 WRITEMEM("lock id:", 8);
916 WRITE64(args->lock_owner.id); 1093 WRITE64(args->lock_owner.id);
917 hdr->nops++; 1094 hdr->nops++;
1095 hdr->replen += decode_lockt_maxsz;
918} 1096}
919 1097
920static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) 1098static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
@@ -929,6 +1107,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
929 WRITE64(args->fl->fl_start); 1107 WRITE64(args->fl->fl_start);
930 WRITE64(nfs4_lock_length(args->fl)); 1108 WRITE64(nfs4_lock_length(args->fl));
931 hdr->nops++; 1109 hdr->nops++;
1110 hdr->replen += decode_locku_maxsz;
932} 1111}
933 1112
934static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1113static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -941,6 +1120,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
941 WRITE32(len); 1120 WRITE32(len);
942 WRITEMEM(name->name, len); 1121 WRITEMEM(name->name, len);
943 hdr->nops++; 1122 hdr->nops++;
1123 hdr->replen += decode_lookup_maxsz;
944} 1124}
945 1125
946static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1126static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
@@ -1080,6 +1260,7 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
1080 BUG(); 1260 BUG();
1081 } 1261 }
1082 hdr->nops++; 1262 hdr->nops++;
1263 hdr->replen += decode_open_maxsz;
1083} 1264}
1084 1265
1085static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) 1266static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
@@ -1091,6 +1272,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1091 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1272 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1092 WRITE32(arg->seqid->sequence->counter); 1273 WRITE32(arg->seqid->sequence->counter);
1093 hdr->nops++; 1274 hdr->nops++;
1275 hdr->replen += decode_open_confirm_maxsz;
1094} 1276}
1095 1277
1096static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1278static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
@@ -1103,6 +1285,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
1103 WRITE32(arg->seqid->sequence->counter); 1285 WRITE32(arg->seqid->sequence->counter);
1104 encode_share_access(xdr, arg->fmode); 1286 encode_share_access(xdr, arg->fmode);
1105 hdr->nops++; 1287 hdr->nops++;
1288 hdr->replen += decode_open_downgrade_maxsz;
1106} 1289}
1107 1290
1108static void 1291static void
@@ -1116,6 +1299,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
1116 WRITE32(len); 1299 WRITE32(len);
1117 WRITEMEM(fh->data, len); 1300 WRITEMEM(fh->data, len);
1118 hdr->nops++; 1301 hdr->nops++;
1302 hdr->replen += decode_putfh_maxsz;
1119} 1303}
1120 1304
1121static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1305static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
@@ -1125,6 +1309,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1125 RESERVE_SPACE(4); 1309 RESERVE_SPACE(4);
1126 WRITE32(OP_PUTROOTFH); 1310 WRITE32(OP_PUTROOTFH);
1127 hdr->nops++; 1311 hdr->nops++;
1312 hdr->replen += decode_putrootfh_maxsz;
1128} 1313}
1129 1314
1130static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1315static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,6 +1338,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1153 WRITE64(args->offset); 1338 WRITE64(args->offset);
1154 WRITE32(args->count); 1339 WRITE32(args->count);
1155 hdr->nops++; 1340 hdr->nops++;
1341 hdr->replen += decode_read_maxsz;
1156} 1342}
1157 1343
1158static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1344static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1178,6 +1364,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1178 WRITE32(attrs[0] & readdir->bitmask[0]); 1364 WRITE32(attrs[0] & readdir->bitmask[0]);
1179 WRITE32(attrs[1] & readdir->bitmask[1]); 1365 WRITE32(attrs[1] & readdir->bitmask[1]);
1180 hdr->nops++; 1366 hdr->nops++;
1367 hdr->replen += decode_readdir_maxsz;
1181 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1368 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1182 __func__, 1369 __func__,
1183 (unsigned long long)readdir->cookie, 1370 (unsigned long long)readdir->cookie,
@@ -1194,6 +1381,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
1194 RESERVE_SPACE(4); 1381 RESERVE_SPACE(4);
1195 WRITE32(OP_READLINK); 1382 WRITE32(OP_READLINK);
1196 hdr->nops++; 1383 hdr->nops++;
1384 hdr->replen += decode_readlink_maxsz;
1197} 1385}
1198 1386
1199static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1387static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -1205,6 +1393,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
1205 WRITE32(name->len); 1393 WRITE32(name->len);
1206 WRITEMEM(name->name, name->len); 1394 WRITEMEM(name->name, name->len);
1207 hdr->nops++; 1395 hdr->nops++;
1396 hdr->replen += decode_remove_maxsz;
1208} 1397}
1209 1398
1210static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) 1399static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
@@ -1220,6 +1409,7 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
1220 WRITE32(newname->len); 1409 WRITE32(newname->len);
1221 WRITEMEM(newname->name, newname->len); 1410 WRITEMEM(newname->name, newname->len);
1222 hdr->nops++; 1411 hdr->nops++;
1412 hdr->replen += decode_rename_maxsz;
1223} 1413}
1224 1414
1225static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) 1415static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
@@ -1230,6 +1420,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
1230 WRITE32(OP_RENEW); 1420 WRITE32(OP_RENEW);
1231 WRITE64(client_stateid->cl_clientid); 1421 WRITE64(client_stateid->cl_clientid);
1232 hdr->nops++; 1422 hdr->nops++;
1423 hdr->replen += decode_renew_maxsz;
1233} 1424}
1234 1425
1235static void 1426static void
@@ -1240,6 +1431,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1240 RESERVE_SPACE(4); 1431 RESERVE_SPACE(4);
1241 WRITE32(OP_RESTOREFH); 1432 WRITE32(OP_RESTOREFH);
1242 hdr->nops++; 1433 hdr->nops++;
1434 hdr->replen += decode_restorefh_maxsz;
1243} 1435}
1244 1436
1245static int 1437static int
@@ -1259,6 +1451,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1259 WRITE32(arg->acl_len); 1451 WRITE32(arg->acl_len);
1260 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1452 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1261 hdr->nops++; 1453 hdr->nops++;
1454 hdr->replen += decode_setacl_maxsz;
1262 return 0; 1455 return 0;
1263} 1456}
1264 1457
@@ -1270,6 +1463,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1270 RESERVE_SPACE(4); 1463 RESERVE_SPACE(4);
1271 WRITE32(OP_SAVEFH); 1464 WRITE32(OP_SAVEFH);
1272 hdr->nops++; 1465 hdr->nops++;
1466 hdr->replen += decode_savefh_maxsz;
1273} 1467}
1274 1468
1275static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) 1469static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
@@ -1280,6 +1474,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1280 WRITE32(OP_SETATTR); 1474 WRITE32(OP_SETATTR);
1281 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1475 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1282 hdr->nops++; 1476 hdr->nops++;
1477 hdr->replen += decode_setattr_maxsz;
1283 encode_attrs(xdr, arg->iap, server); 1478 encode_attrs(xdr, arg->iap, server);
1284} 1479}
1285 1480
@@ -1299,6 +1494,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1299 RESERVE_SPACE(4); 1494 RESERVE_SPACE(4);
1300 WRITE32(setclientid->sc_cb_ident); 1495 WRITE32(setclientid->sc_cb_ident);
1301 hdr->nops++; 1496 hdr->nops++;
1497 hdr->replen += decode_setclientid_maxsz;
1302} 1498}
1303 1499
1304static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) 1500static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
@@ -1310,6 +1506,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
1310 WRITE64(client_state->cl_clientid); 1506 WRITE64(client_state->cl_clientid);
1311 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1507 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1312 hdr->nops++; 1508 hdr->nops++;
1509 hdr->replen += decode_setclientid_confirm_maxsz;
1313} 1510}
1314 1511
1315static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 1512static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
@@ -1328,6 +1525,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1328 1525
1329 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1526 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1330 hdr->nops++; 1527 hdr->nops++;
1528 hdr->replen += decode_write_maxsz;
1331} 1529}
1332 1530
1333static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) 1531static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
@@ -1339,11 +1537,163 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1339 WRITE32(OP_DELEGRETURN); 1537 WRITE32(OP_DELEGRETURN);
1340 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1538 WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
1341 hdr->nops++; 1539 hdr->nops++;
1540 hdr->replen += decode_delegreturn_maxsz;
1541}
1542
1543#if defined(CONFIG_NFS_V4_1)
1544/* NFSv4.1 operations */
1545static void encode_exchange_id(struct xdr_stream *xdr,
1546 struct nfs41_exchange_id_args *args,
1547 struct compound_hdr *hdr)
1548{
1549 __be32 *p;
1550
1551 RESERVE_SPACE(4 + sizeof(args->verifier->data));
1552 WRITE32(OP_EXCHANGE_ID);
1553 WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
1554
1555 encode_string(xdr, args->id_len, args->id);
1556
1557 RESERVE_SPACE(12);
1558 WRITE32(args->flags);
1559 WRITE32(0); /* zero length state_protect4_a */
1560 WRITE32(0); /* zero length implementation id array */
1561 hdr->nops++;
1562 hdr->replen += decode_exchange_id_maxsz;
1563}
1564
1565static void encode_create_session(struct xdr_stream *xdr,
1566 struct nfs41_create_session_args *args,
1567 struct compound_hdr *hdr)
1568{
1569 __be32 *p;
1570 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1571 uint32_t len;
1572 struct nfs_client *clp = args->client;
1573
1574 RESERVE_SPACE(4);
1575 WRITE32(OP_CREATE_SESSION);
1576
1577 RESERVE_SPACE(8);
1578 WRITE64(clp->cl_ex_clid);
1579
1580 RESERVE_SPACE(8);
1581 WRITE32(clp->cl_seqid); /* sequence id */
1582 WRITE32(args->flags); /* flags */
1583
1584 RESERVE_SPACE(2*28); /* 2 channel_attrs */
1585 /* Fore Channel */
1586 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */
1587 WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */
1588 WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */
1589 WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1590 WRITE32(args->fc_attrs.max_ops); /* max operations */
1591 WRITE32(args->fc_attrs.max_reqs); /* max requests */
1592 WRITE32(0); /* rdmachannel_attrs */
1593
1594 /* Back Channel */
1595 WRITE32(args->bc_attrs.headerpadsz); /* header padding size */
1596 WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */
1597 WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */
1598 WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1599 WRITE32(args->bc_attrs.max_ops); /* max operations */
1600 WRITE32(args->bc_attrs.max_reqs); /* max requests */
1601 WRITE32(0); /* rdmachannel_attrs */
1602
1603 RESERVE_SPACE(4);
1604 WRITE32(args->cb_program); /* cb_program */
1605
1606 RESERVE_SPACE(4); /* # of security flavors */
1607 WRITE32(1);
1608
1609 RESERVE_SPACE(4);
1610 WRITE32(RPC_AUTH_UNIX); /* auth_sys */
1611
1612 /* authsys_parms rfc1831 */
1613 RESERVE_SPACE(4);
1614 WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
1615 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1616 clp->cl_ipaddr);
1617 RESERVE_SPACE(16 + len);
1618 WRITE32(len);
1619 WRITEMEM(machine_name, len);
1620 WRITE32(0); /* UID */
1621 WRITE32(0); /* GID */
1622 WRITE32(0); /* No more gids */
1623 hdr->nops++;
1624 hdr->replen += decode_create_session_maxsz;
1625}
1626
1627static void encode_destroy_session(struct xdr_stream *xdr,
1628 struct nfs4_session *session,
1629 struct compound_hdr *hdr)
1630{
1631 __be32 *p;
1632 RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
1633 WRITE32(OP_DESTROY_SESSION);
1634 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1635 hdr->nops++;
1636 hdr->replen += decode_destroy_session_maxsz;
1342} 1637}
1638#endif /* CONFIG_NFS_V4_1 */
1639
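/* Compiled for both minor versions; without CONFIG_NFS_V4_1 the body below reduces to a no-op. */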
1640static void encode_sequence(struct xdr_stream *xdr,
1641 const struct nfs4_sequence_args *args,
1642 struct compound_hdr *hdr)
1643{
1644#if defined(CONFIG_NFS_V4_1)
1645 struct nfs4_session *session = args->sa_session;
1646 struct nfs4_slot_table *tp;
1647 struct nfs4_slot *slot;
1648 __be32 *p;
1649
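	/* A NULL session means a v4.0 compound: no SEQUENCE op to emit. */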
1650 if (!session)
1651 return;
1652
1653 tp = &session->fc_slot_table;
1654
1655 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1656 slot = tp->slots + args->sa_slotid;
1657
1658 RESERVE_SPACE(4);
1659 WRITE32(OP_SEQUENCE);
1660
1661 /*
1662 * Sessionid + seqid + slotid + max slotid + cache_this
1663 */
1664 dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d "
1665 "max_slotid=%d cache_this=%d\n",
1666 __func__,
1667 ((u32 *)session->sess_id.data)[0],
1668 ((u32 *)session->sess_id.data)[1],
1669 ((u32 *)session->sess_id.data)[2],
1670 ((u32 *)session->sess_id.data)[3],
1671 slot->seq_nr, args->sa_slotid,
1672 tp->highest_used_slotid, args->sa_cache_this);
1673 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
1674 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1675 WRITE32(slot->seq_nr);
1676 WRITE32(args->sa_slotid);
1677 WRITE32(tp->highest_used_slotid);
1678 WRITE32(args->sa_cache_this);
1679 hdr->nops++;
1680 hdr->replen += decode_sequence_maxsz;
1681#endif /* CONFIG_NFS_V4_1 */
1682}
1683
1343/* 1684/*
1344 * END OF "GENERIC" ENCODE ROUTINES. 1685 * END OF "GENERIC" ENCODE ROUTINES.
1345 */ 1686 */
1346 1687
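/* Derive the compound's minorversion from the session when one is in use; plain v4.0 requests get 0. */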
1688static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1689{
1690#if defined(CONFIG_NFS_V4_1)
1691 if (args->sa_session)
1692 return args->sa_session->clp->cl_minorversion;
1693#endif /* CONFIG_NFS_V4_1 */
1694 return 0;
1695}
1696
1347/* 1697/*
1348 * Encode an ACCESS request 1698 * Encode an ACCESS request
1349 */ 1699 */
@@ -1351,11 +1701,12 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1351{ 1701{
1352 struct xdr_stream xdr; 1702 struct xdr_stream xdr;
1353 struct compound_hdr hdr = { 1703 struct compound_hdr hdr = {
1354 .nops = 0, 1704 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1355 }; 1705 };
1356 1706
1357 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1707 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1358 encode_compound_hdr(&xdr, &hdr); 1708 encode_compound_hdr(&xdr, req, &hdr);
1709 encode_sequence(&xdr, &args->seq_args, &hdr);
1359 encode_putfh(&xdr, args->fh, &hdr); 1710 encode_putfh(&xdr, args->fh, &hdr);
1360 encode_access(&xdr, args->access, &hdr); 1711 encode_access(&xdr, args->access, &hdr);
1361 encode_getfattr(&xdr, args->bitmask, &hdr); 1712 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1370,11 +1721,12 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
1370{ 1721{
1371 struct xdr_stream xdr; 1722 struct xdr_stream xdr;
1372 struct compound_hdr hdr = { 1723 struct compound_hdr hdr = {
1373 .nops = 0, 1724 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1374 }; 1725 };
1375 1726
1376 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1727 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1377 encode_compound_hdr(&xdr, &hdr); 1728 encode_compound_hdr(&xdr, req, &hdr);
1729 encode_sequence(&xdr, &args->seq_args, &hdr);
1378 encode_putfh(&xdr, args->dir_fh, &hdr); 1730 encode_putfh(&xdr, args->dir_fh, &hdr);
1379 encode_lookup(&xdr, args->name, &hdr); 1731 encode_lookup(&xdr, args->name, &hdr);
1380 encode_getfh(&xdr, &hdr); 1732 encode_getfh(&xdr, &hdr);
@@ -1390,11 +1742,12 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
1390{ 1742{
1391 struct xdr_stream xdr; 1743 struct xdr_stream xdr;
1392 struct compound_hdr hdr = { 1744 struct compound_hdr hdr = {
1393 .nops = 0, 1745 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1394 }; 1746 };
1395 1747
1396 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1748 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1397 encode_compound_hdr(&xdr, &hdr); 1749 encode_compound_hdr(&xdr, req, &hdr);
1750 encode_sequence(&xdr, &args->seq_args, &hdr);
1398 encode_putrootfh(&xdr, &hdr); 1751 encode_putrootfh(&xdr, &hdr);
1399 encode_getfh(&xdr, &hdr); 1752 encode_getfh(&xdr, &hdr);
1400 encode_getfattr(&xdr, args->bitmask, &hdr); 1753 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1409,11 +1762,12 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1409{ 1762{
1410 struct xdr_stream xdr; 1763 struct xdr_stream xdr;
1411 struct compound_hdr hdr = { 1764 struct compound_hdr hdr = {
1412 .nops = 0, 1765 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1413 }; 1766 };
1414 1767
1415 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1768 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1416 encode_compound_hdr(&xdr, &hdr); 1769 encode_compound_hdr(&xdr, req, &hdr);
1770 encode_sequence(&xdr, &args->seq_args, &hdr);
1417 encode_putfh(&xdr, args->fh, &hdr); 1771 encode_putfh(&xdr, args->fh, &hdr);
1418 encode_remove(&xdr, &args->name, &hdr); 1772 encode_remove(&xdr, &args->name, &hdr);
1419 encode_getfattr(&xdr, args->bitmask, &hdr); 1773 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1428,11 +1782,12 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
1428{ 1782{
1429 struct xdr_stream xdr; 1783 struct xdr_stream xdr;
1430 struct compound_hdr hdr = { 1784 struct compound_hdr hdr = {
1431 .nops = 0, 1785 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1432 }; 1786 };
1433 1787
1434 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1788 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1435 encode_compound_hdr(&xdr, &hdr); 1789 encode_compound_hdr(&xdr, req, &hdr);
1790 encode_sequence(&xdr, &args->seq_args, &hdr);
1436 encode_putfh(&xdr, args->old_dir, &hdr); 1791 encode_putfh(&xdr, args->old_dir, &hdr);
1437 encode_savefh(&xdr, &hdr); 1792 encode_savefh(&xdr, &hdr);
1438 encode_putfh(&xdr, args->new_dir, &hdr); 1793 encode_putfh(&xdr, args->new_dir, &hdr);
@@ -1451,11 +1806,12 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
1451{ 1806{
1452 struct xdr_stream xdr; 1807 struct xdr_stream xdr;
1453 struct compound_hdr hdr = { 1808 struct compound_hdr hdr = {
1454 .nops = 0, 1809 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1455 }; 1810 };
1456 1811
1457 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1812 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1458 encode_compound_hdr(&xdr, &hdr); 1813 encode_compound_hdr(&xdr, req, &hdr);
1814 encode_sequence(&xdr, &args->seq_args, &hdr);
1459 encode_putfh(&xdr, args->fh, &hdr); 1815 encode_putfh(&xdr, args->fh, &hdr);
1460 encode_savefh(&xdr, &hdr); 1816 encode_savefh(&xdr, &hdr);
1461 encode_putfh(&xdr, args->dir_fh, &hdr); 1817 encode_putfh(&xdr, args->dir_fh, &hdr);
@@ -1474,11 +1830,12 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
1474{ 1830{
1475 struct xdr_stream xdr; 1831 struct xdr_stream xdr;
1476 struct compound_hdr hdr = { 1832 struct compound_hdr hdr = {
1477 .nops = 0, 1833 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1478 }; 1834 };
1479 1835
1480 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1836 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1481 encode_compound_hdr(&xdr, &hdr); 1837 encode_compound_hdr(&xdr, req, &hdr);
1838 encode_sequence(&xdr, &args->seq_args, &hdr);
1482 encode_putfh(&xdr, args->dir_fh, &hdr); 1839 encode_putfh(&xdr, args->dir_fh, &hdr);
1483 encode_savefh(&xdr, &hdr); 1840 encode_savefh(&xdr, &hdr);
1484 encode_create(&xdr, args, &hdr); 1841 encode_create(&xdr, args, &hdr);
@@ -1505,11 +1862,12 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1505{ 1862{
1506 struct xdr_stream xdr; 1863 struct xdr_stream xdr;
1507 struct compound_hdr hdr = { 1864 struct compound_hdr hdr = {
1508 .nops = 0, 1865 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1509 }; 1866 };
1510 1867
1511 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1868 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1512 encode_compound_hdr(&xdr, &hdr); 1869 encode_compound_hdr(&xdr, req, &hdr);
1870 encode_sequence(&xdr, &args->seq_args, &hdr);
1513 encode_putfh(&xdr, args->fh, &hdr); 1871 encode_putfh(&xdr, args->fh, &hdr);
1514 encode_getfattr(&xdr, args->bitmask, &hdr); 1872 encode_getfattr(&xdr, args->bitmask, &hdr);
1515 encode_nops(&hdr); 1873 encode_nops(&hdr);
@@ -1523,11 +1881,12 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
1523{ 1881{
1524 struct xdr_stream xdr; 1882 struct xdr_stream xdr;
1525 struct compound_hdr hdr = { 1883 struct compound_hdr hdr = {
1526 .nops = 0, 1884 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1527 }; 1885 };
1528 1886
1529 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1887 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1530 encode_compound_hdr(&xdr, &hdr); 1888 encode_compound_hdr(&xdr, req, &hdr);
1889 encode_sequence(&xdr, &args->seq_args, &hdr);
1531 encode_putfh(&xdr, args->fh, &hdr); 1890 encode_putfh(&xdr, args->fh, &hdr);
1532 encode_close(&xdr, args, &hdr); 1891 encode_close(&xdr, args, &hdr);
1533 encode_getfattr(&xdr, args->bitmask, &hdr); 1892 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1542,11 +1901,12 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
1542{ 1901{
1543 struct xdr_stream xdr; 1902 struct xdr_stream xdr;
1544 struct compound_hdr hdr = { 1903 struct compound_hdr hdr = {
1545 .nops = 0, 1904 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1546 }; 1905 };
1547 1906
1548 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1907 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1549 encode_compound_hdr(&xdr, &hdr); 1908 encode_compound_hdr(&xdr, req, &hdr);
1909 encode_sequence(&xdr, &args->seq_args, &hdr);
1550 encode_putfh(&xdr, args->fh, &hdr); 1910 encode_putfh(&xdr, args->fh, &hdr);
1551 encode_savefh(&xdr, &hdr); 1911 encode_savefh(&xdr, &hdr);
1552 encode_open(&xdr, args, &hdr); 1912 encode_open(&xdr, args, &hdr);
@@ -1569,7 +1929,7 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
1569 }; 1929 };
1570 1930
1571 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1931 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1572 encode_compound_hdr(&xdr, &hdr); 1932 encode_compound_hdr(&xdr, req, &hdr);
1573 encode_putfh(&xdr, args->fh, &hdr); 1933 encode_putfh(&xdr, args->fh, &hdr);
1574 encode_open_confirm(&xdr, args, &hdr); 1934 encode_open_confirm(&xdr, args, &hdr);
1575 encode_nops(&hdr); 1935 encode_nops(&hdr);
@@ -1583,11 +1943,12 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
1583{ 1943{
1584 struct xdr_stream xdr; 1944 struct xdr_stream xdr;
1585 struct compound_hdr hdr = { 1945 struct compound_hdr hdr = {
1586 .nops = 0, 1946 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1587 }; 1947 };
1588 1948
1589 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1949 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1590 encode_compound_hdr(&xdr, &hdr); 1950 encode_compound_hdr(&xdr, req, &hdr);
1951 encode_sequence(&xdr, &args->seq_args, &hdr);
1591 encode_putfh(&xdr, args->fh, &hdr); 1952 encode_putfh(&xdr, args->fh, &hdr);
1592 encode_open(&xdr, args, &hdr); 1953 encode_open(&xdr, args, &hdr);
1593 encode_getfattr(&xdr, args->bitmask, &hdr); 1954 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1602,11 +1963,12 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
1602{ 1963{
1603 struct xdr_stream xdr; 1964 struct xdr_stream xdr;
1604 struct compound_hdr hdr = { 1965 struct compound_hdr hdr = {
1605 .nops = 0, 1966 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1606 }; 1967 };
1607 1968
1608 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1969 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1609 encode_compound_hdr(&xdr, &hdr); 1970 encode_compound_hdr(&xdr, req, &hdr);
1971 encode_sequence(&xdr, &args->seq_args, &hdr);
1610 encode_putfh(&xdr, args->fh, &hdr); 1972 encode_putfh(&xdr, args->fh, &hdr);
1611 encode_open_downgrade(&xdr, args, &hdr); 1973 encode_open_downgrade(&xdr, args, &hdr);
1612 encode_getfattr(&xdr, args->bitmask, &hdr); 1974 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1621,11 +1983,12 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
1621{ 1983{
1622 struct xdr_stream xdr; 1984 struct xdr_stream xdr;
1623 struct compound_hdr hdr = { 1985 struct compound_hdr hdr = {
1624 .nops = 0, 1986 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1625 }; 1987 };
1626 1988
1627 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1989 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1628 encode_compound_hdr(&xdr, &hdr); 1990 encode_compound_hdr(&xdr, req, &hdr);
1991 encode_sequence(&xdr, &args->seq_args, &hdr);
1629 encode_putfh(&xdr, args->fh, &hdr); 1992 encode_putfh(&xdr, args->fh, &hdr);
1630 encode_lock(&xdr, args, &hdr); 1993 encode_lock(&xdr, args, &hdr);
1631 encode_nops(&hdr); 1994 encode_nops(&hdr);
@@ -1639,11 +2002,12 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
1639{ 2002{
1640 struct xdr_stream xdr; 2003 struct xdr_stream xdr;
1641 struct compound_hdr hdr = { 2004 struct compound_hdr hdr = {
1642 .nops = 0, 2005 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1643 }; 2006 };
1644 2007
1645 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2008 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1646 encode_compound_hdr(&xdr, &hdr); 2009 encode_compound_hdr(&xdr, req, &hdr);
2010 encode_sequence(&xdr, &args->seq_args, &hdr);
1647 encode_putfh(&xdr, args->fh, &hdr); 2011 encode_putfh(&xdr, args->fh, &hdr);
1648 encode_lockt(&xdr, args, &hdr); 2012 encode_lockt(&xdr, args, &hdr);
1649 encode_nops(&hdr); 2013 encode_nops(&hdr);
@@ -1657,11 +2021,12 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
1657{ 2021{
1658 struct xdr_stream xdr; 2022 struct xdr_stream xdr;
1659 struct compound_hdr hdr = { 2023 struct compound_hdr hdr = {
1660 .nops = 0, 2024 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1661 }; 2025 };
1662 2026
1663 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2027 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1664 encode_compound_hdr(&xdr, &hdr); 2028 encode_compound_hdr(&xdr, req, &hdr);
2029 encode_sequence(&xdr, &args->seq_args, &hdr);
1665 encode_putfh(&xdr, args->fh, &hdr); 2030 encode_putfh(&xdr, args->fh, &hdr);
1666 encode_locku(&xdr, args, &hdr); 2031 encode_locku(&xdr, args, &hdr);
1667 encode_nops(&hdr); 2032 encode_nops(&hdr);
@@ -1675,22 +2040,16 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1675{ 2040{
1676 struct xdr_stream xdr; 2041 struct xdr_stream xdr;
1677 struct compound_hdr hdr = { 2042 struct compound_hdr hdr = {
1678 .nops = 0, 2043 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1679 }; 2044 };
1680 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1681 unsigned int replen;
1682 2045
1683 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2046 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1684 encode_compound_hdr(&xdr, &hdr); 2047 encode_compound_hdr(&xdr, req, &hdr);
2048 encode_sequence(&xdr, &args->seq_args, &hdr);
1685 encode_putfh(&xdr, args->fh, &hdr); 2049 encode_putfh(&xdr, args->fh, &hdr);
1686 encode_readlink(&xdr, args, req, &hdr); 2050 encode_readlink(&xdr, args, req, &hdr);
1687 2051
1688 /* set up reply kvec 2052 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
1689 * toplevel_status + taglen + rescount + OP_PUTFH + status
1690 * + OP_READLINK + status + string length = 8
1691 */
1692 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
1693 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1694 args->pgbase, args->pglen); 2053 args->pgbase, args->pglen);
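	/* hdr.replen counts 32-bit words, so << 2 gives the byte offset at which the page data begins. */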
1695 encode_nops(&hdr); 2054 encode_nops(&hdr);
1696 return 0; 2055 return 0;
@@ -1703,25 +2062,19 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1703{ 2062{
1704 struct xdr_stream xdr; 2063 struct xdr_stream xdr;
1705 struct compound_hdr hdr = { 2064 struct compound_hdr hdr = {
1706 .nops = 0, 2065 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1707 }; 2066 };
1708 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1709 int replen;
1710 2067
1711 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2068 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1712 encode_compound_hdr(&xdr, &hdr); 2069 encode_compound_hdr(&xdr, req, &hdr);
2070 encode_sequence(&xdr, &args->seq_args, &hdr);
1713 encode_putfh(&xdr, args->fh, &hdr); 2071 encode_putfh(&xdr, args->fh, &hdr);
1714 encode_readdir(&xdr, args, req, &hdr); 2072 encode_readdir(&xdr, args, req, &hdr);
1715 2073
1716 /* set up reply kvec 2074 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
1717 * toplevel_status + taglen + rescount + OP_PUTFH + status
1718 * + OP_READDIR + status + verifier(2) = 9
1719 */
1720 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readdir_sz) << 2;
1721 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1722 args->pgbase, args->count); 2075 args->pgbase, args->count);
1723 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", 2076 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
1724 __func__, replen, args->pages, 2077 __func__, hdr.replen << 2, args->pages,
1725 args->pgbase, args->count); 2078 args->pgbase, args->count);
1726 encode_nops(&hdr); 2079 encode_nops(&hdr);
1727 return 0; 2080 return 0;
@@ -1732,24 +2085,18 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1732 */ 2085 */
1733static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 2086static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
1734{ 2087{
1735 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1736 struct xdr_stream xdr; 2088 struct xdr_stream xdr;
1737 struct compound_hdr hdr = { 2089 struct compound_hdr hdr = {
1738 .nops = 0, 2090 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1739 }; 2091 };
1740 int replen;
1741 2092
1742 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2093 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1743 encode_compound_hdr(&xdr, &hdr); 2094 encode_compound_hdr(&xdr, req, &hdr);
2095 encode_sequence(&xdr, &args->seq_args, &hdr);
1744 encode_putfh(&xdr, args->fh, &hdr); 2096 encode_putfh(&xdr, args->fh, &hdr);
1745 encode_read(&xdr, args, &hdr); 2097 encode_read(&xdr, args, &hdr);
1746 2098
1747 /* set up reply kvec 2099 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
1748 * toplevel status + taglen=0 + rescount + OP_PUTFH + status
1749 * + OP_READ + status + eof + datalen = 9
1750 */
1751 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
1752 xdr_inline_pages(&req->rq_rcv_buf, replen,
1753 args->pages, args->pgbase, args->count); 2100 args->pages, args->pgbase, args->count);
1754 req->rq_rcv_buf.flags |= XDRBUF_READ; 2101 req->rq_rcv_buf.flags |= XDRBUF_READ;
1755 encode_nops(&hdr); 2102 encode_nops(&hdr);
@@ -1763,11 +2110,12 @@ static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_seta
1763{ 2110{
1764 struct xdr_stream xdr; 2111 struct xdr_stream xdr;
1765 struct compound_hdr hdr = { 2112 struct compound_hdr hdr = {
1766 .nops = 0, 2113 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1767 }; 2114 };
1768 2115
1769 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2116 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1770 encode_compound_hdr(&xdr, &hdr); 2117 encode_compound_hdr(&xdr, req, &hdr);
2118 encode_sequence(&xdr, &args->seq_args, &hdr);
1771 encode_putfh(&xdr, args->fh, &hdr); 2119 encode_putfh(&xdr, args->fh, &hdr);
1772 encode_setattr(&xdr, args, args->server, &hdr); 2120 encode_setattr(&xdr, args, args->server, &hdr);
1773 encode_getfattr(&xdr, args->bitmask, &hdr); 2121 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1783,20 +2131,19 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
 			struct nfs_getaclargs *args)
 {
 	struct xdr_stream xdr;
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	int replen;
+	uint32_t replen;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
+	replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1;
 	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
-	/* set up reply buffer: */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen,
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
 	encode_nops(&hdr);
 	return 0;
@@ -1809,11 +2156,12 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_write(&xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
@@ -1829,11 +2177,12 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_commit(&xdr, args, &hdr);
 	encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1848,11 +2197,12 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_fsinfo(&xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
@@ -1866,11 +2216,12 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
 			   &hdr);
@@ -1885,11 +2236,12 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
 			   args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
@@ -1900,16 +2252,18 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
+static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
+				    struct nfs4_server_caps_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
-	encode_putfh(&xdr, fhandle, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_putfh(&xdr, args->fhandle, &hdr);
 	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
@@ -1929,7 +2283,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
 	encode_renew(&xdr, clp, &hdr);
 	encode_nops(&hdr);
 	return 0;
@@ -1946,7 +2300,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
 	encode_setclientid(&xdr, sc, &hdr);
 	encode_nops(&hdr);
 	return 0;
@@ -1964,7 +2318,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
 	encode_setclientid_confirm(&xdr, clp, &hdr);
 	encode_putrootfh(&xdr, &hdr);
 	encode_fsinfo(&xdr, lease_bitmap, &hdr);
@@ -1979,11 +2333,12 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fhandle, &hdr);
 	encode_delegreturn(&xdr, args->stateid, &hdr);
 	encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1998,28 +2353,119 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
-	int replen;
+	uint32_t replen;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->dir_fh, &hdr);
 	encode_lookup(&xdr, args->name, &hdr);
+	replen = hdr.replen;	/* get the attribute into args->page */
 	encode_fs_locations(&xdr, args->bitmask, &hdr);
 
-	/* set up reply
-	 *   toplevel_status + OP_PUTFH + status
-	 *   + OP_LOOKUP + status + OP_GETATTR + status = 7
-	 */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			 0, PAGE_SIZE);
 	encode_nops(&hdr);
 	return 0;
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * EXCHANGE_ID request
+ */
+static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
+				    struct nfs41_exchange_id_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_exchange_id(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a CREATE_SESSION request
+ */
+static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
+				       struct nfs41_create_session_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_create_session(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a DESTROY_SESSION request
+ */
+static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
+					struct nfs4_session *session)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = session->clp->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_destroy_session(&xdr, session, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a SEQUENCE request
+ */
+static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
+				 struct nfs4_sequence_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+				       struct nfs4_get_lease_time_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+	};
+	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(&xdr, &hdr);
+	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
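
All five NFSv4.1 encoders added above share one skeleton: initialize the stream, emit the compound header, emit the single operation, then backfill the op count. Only the operation encoder and the source of the minor version vary; EXCHANGE_ID and CREATE_SESSION read it straight from the nfs_client, since no session exists yet. As a placeholder sketch (nfs41_foo_args and encode_foo are stand-ins, not kernel symbols):

static int nfs4_xdr_enc_foo(struct rpc_rqst *req, uint32_t *p,
			    struct nfs41_foo_args *args)
{
	struct xdr_stream xdr;
	struct compound_hdr hdr = {
		.minorversion = args->client->cl_minorversion,
	};

	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
	encode_compound_hdr(&xdr, req, &hdr);	/* COMPOUND tag + minorversion */
	encode_foo(&xdr, args, &hdr);		/* the one operation */
	encode_nops(&hdr);			/* backfill numops */
	return 0;
}
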
 /*
  * START OF "GENERIC" DECODE ROUTINES.
  * These may look a little ugly since they are imported from a "generic"
@@ -3657,7 +4103,7 @@ decode_savefh(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_SAVEFH);
 }
 
-static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
+static int decode_setattr(struct xdr_stream *xdr)
 {
 	__be32 *p;
 	uint32_t bmlen;
@@ -3735,6 +4181,169 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_DELEGRETURN);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+static int decode_exchange_id(struct xdr_stream *xdr,
+			      struct nfs41_exchange_id_res *res)
+{
+	__be32 *p;
+	uint32_t dummy;
+	int status;
+	struct nfs_client *clp = res->client;
+
+	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+	if (status)
+		return status;
+
+	READ_BUF(8);
+	READ64(clp->cl_ex_clid);
+	READ_BUF(12);
+	READ32(clp->cl_seqid);
+	READ32(clp->cl_exchange_flags);
+
+	/* We ask for SP4_NONE */
+	READ32(dummy);
+	if (dummy != SP4_NONE)
+		return -EIO;
+
+	/* Throw away minor_id */
+	READ_BUF(8);
+
+	/* Throw away Major id */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	/* Throw away server_scope */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	/* Throw away Implementation id array */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	return 0;
+}
+
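
READ_BUF(), READ32(), READ64() and COPYMEM() in decode_exchange_id() above and the routines below are local decoding macros in nfs4xdr.c that advance the cursor p through the receive buffer; roughly (simplified, with the usual error reporting collapsed to a bare -EIO):

#define READ_BUF(nbytes) do {				\
	p = xdr_inline_decode(xdr, nbytes);		\
	if (unlikely(!p))				\
		return -EIO;				\
} while (0)

#define READ32(x)	((x) = ntohl(*p++))

#define READ64(x) do {					\
	(x) = (u64)ntohl(*p++) << 32;			\
	(x) |= ntohl(*p++);				\
} while (0)

#define COPYMEM(x, nbytes) do {				\
	memcpy((x), p, nbytes);				\
	p += XDR_QUADLEN(nbytes);			\
} while (0)

xdr_inline_decode() rounds its byte count up to a four-byte boundary, which is why the READ32(dummy); READ_BUF(dummy); pairs above suffice to skip variable-length XDR opaques such as the major owner id and server scope.
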
+static int decode_chan_attrs(struct xdr_stream *xdr,
+			     struct nfs4_channel_attrs *attrs)
+{
+	__be32 *p;
+	u32 nr_attrs;
+
+	READ_BUF(28);
+	READ32(attrs->headerpadsz);
+	READ32(attrs->max_rqst_sz);
+	READ32(attrs->max_resp_sz);
+	READ32(attrs->max_resp_sz_cached);
+	READ32(attrs->max_ops);
+	READ32(attrs->max_reqs);
+	READ32(nr_attrs);
+	if (unlikely(nr_attrs > 1)) {
+		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
+		       __func__, nr_attrs);
+		return -EINVAL;
+	}
+	if (nr_attrs == 1)
+		READ_BUF(4);	/* skip rdma_attrs */
+	return 0;
+}
+
+static int decode_create_session(struct xdr_stream *xdr,
+				 struct nfs41_create_session_res *res)
+{
+	__be32 *p;
+	int status;
+	struct nfs_client *clp = res->client;
+	struct nfs4_session *session = clp->cl_session;
+
+	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+
+	if (status)
+		return status;
+
+	/* sessionid */
+	READ_BUF(NFS4_MAX_SESSIONID_LEN);
+	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
+
+	/* seqid, flags */
+	READ_BUF(8);
+	READ32(clp->cl_seqid);
+	READ32(session->flags);
+
+	/* Channel attributes */
+	status = decode_chan_attrs(xdr, &session->fc_attrs);
+	if (!status)
+		status = decode_chan_attrs(xdr, &session->bc_attrs);
+	return status;
+}
+
+static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_DESTROY_SESSION);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int decode_sequence(struct xdr_stream *xdr,
+			   struct nfs4_sequence_res *res,
+			   struct rpc_rqst *rqstp)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_slot *slot;
+	struct nfs4_sessionid id;
+	u32 dummy;
+	int status;
+	__be32 *p;
+
+	if (!res->sr_session)
+		return 0;
+
+	status = decode_op_hdr(xdr, OP_SEQUENCE);
+	if (status)
+		goto out_err;
+
+	/*
+	 * If the server returns different values for sessionID, slotID or
+	 * sequence number, the server is looney tunes.
+	 */
+	status = -ESERVERFAULT;
+
+	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
+	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
+	if (memcmp(id.data, res->sr_session->sess_id.data,
+		   NFS4_MAX_SESSIONID_LEN)) {
+		dprintk("%s Invalid session id\n", __func__);
+		goto out_err;
+	}
+	/* seqid */
+	READ32(dummy);
+	if (dummy != slot->seq_nr) {
+		dprintk("%s Invalid sequence number\n", __func__);
+		goto out_err;
+	}
+	/* slot id */
+	READ32(dummy);
+	if (dummy != res->sr_slotid) {
+		dprintk("%s Invalid slot id\n", __func__);
+		goto out_err;
+	}
+	/* highest slot id - currently not processed */
+	READ32(dummy);
+	/* target highest slot id - currently not processed */
+	READ32(dummy);
+	/* result flags - currently not processed */
+	READ32(dummy);
+	status = 0;
+out_err:
+	res->sr_status = status;
+	return status;
+#else	/* CONFIG_NFS_V4_1 */
+	return 0;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
@@ -3752,6 +4361,9 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
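
The decode-side hunks below all graft the same step into the same place: every COMPOUND reply is now unwrapped as header, then SEQUENCE, then PUTFH, then the operation proper. The shared shape, with dec_foo/decode_foo and nfs4_foo_res as placeholders (decode_sequence() is a no-op on v4.0, where res->sr_session is NULL):

static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, __be32 *p,
			    struct nfs4_foo_res *res)
{
	struct xdr_stream xdr;
	struct compound_hdr hdr;
	int status;

	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
	status = decode_compound_hdr(&xdr, &hdr);
	if (status)
		goto out;
	status = decode_sequence(&xdr, &res->seq_res, rqstp);	/* returns 0 for v4.0 */
	if (status)
		goto out;
	status = decode_putfh(&xdr);
	if (status)
		goto out;
	status = decode_foo(&xdr, res);
out:
	return status;
}
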
@@ -3773,7 +4385,11 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	status = decode_putfh(&xdr);
 	if (status != 0)
@@ -3796,7 +4412,11 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3819,7 +4439,11 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putrootfh(&xdr)) != 0)
 		goto out;
@@ -3839,7 +4463,11 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3860,7 +4488,11 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3890,7 +4522,11 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3923,7 +4559,11 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3963,6 +4603,9 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -3979,12 +4622,13 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	int status;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	status = encode_setacl(&xdr, args, &hdr);
 	encode_nops(&hdr);
@@ -3995,7 +4639,8 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+		    struct nfs_setaclres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4005,10 +4650,13 @@ nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr, res);
+	status = decode_setattr(&xdr);
 out:
 	return status;
 }
@@ -4017,7 +4665,8 @@ out:
 /*
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+		    struct nfs_getaclres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4027,10 +4676,13 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, acl_len);
+	status = decode_getacl(&xdr, rqstp, &res->acl_len);
 
 out:
 	return status;
@@ -4049,6 +4701,9 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4079,6 +4734,9 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4133,6 +4791,9 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4157,10 +4818,13 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr, res);
+	status = decode_setattr(&xdr);
 	if (status)
 		goto out;
 	decode_getfattr(&xdr, res->fattr, res->server);
@@ -4181,6 +4845,9 @@ static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4202,6 +4869,9 @@ static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4223,6 +4893,9 @@ static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4234,7 +4907,8 @@ out:
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+				 struct nfs4_readlink_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4244,6 +4918,9 @@ static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4265,6 +4942,9 @@ static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_r
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4286,6 +4966,9 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readr
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4309,6 +4992,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4335,6 +5021,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4349,7 +5038,8 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+			       struct nfs4_fsinfo_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4358,16 +5048,19 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, req);
+	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, fsinfo);
+		status = decode_fsinfo(&xdr, res->fsinfo);
 	return status;
 }
 
 /*
  * PATHCONF request
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+				 struct nfs4_pathconf_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4376,16 +5069,19 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, req);
+	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_pathconf(&xdr, pathconf);
+		status = decode_pathconf(&xdr, res->pathconf);
 	return status;
 }
 
 /*
  * STATFS request
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+			       struct nfs4_statfs_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4394,9 +5090,11 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fssta
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, req);
+	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_statfs(&xdr, fsstat);
+		status = decode_statfs(&xdr, res->fsstat);
 	return status;
 }
 
@@ -4410,7 +5108,11 @@ static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4
 	int status;
 
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, req);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -4483,7 +5185,10 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
-	if (status != 0)
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	status = decode_putfh(&xdr);
 	if (status != 0)
@@ -4497,7 +5202,8 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+				     struct nfs4_fs_locations_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4505,18 +5211,113 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
-	if (status != 0)
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, req);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
 	if ((status = decode_lookup(&xdr)) != 0)
 		goto out;
 	xdr_enter_page(&xdr, PAGE_SIZE);
-	status = decode_getfattr(&xdr, &res->fattr, res->server);
+	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+				 res->fs_locations->server);
 out:
 	return status;
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * EXCHANGE_ID request
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+				    void *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_exchange_id(&xdr, res);
+	return status;
+}
+
+/*
+ * a CREATE_SESSION request
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+				       struct nfs41_create_session_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_create_session(&xdr, res);
+	return status;
+}
+
+/*
+ * a DESTROY_SESSION request
+ */
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
+					void *dummy)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_destroy_session(&xdr, dummy);
+	return status;
+}
+
+/*
+ * a SEQUENCE request
+ */
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+				 struct nfs4_sequence_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_sequence(&xdr, res, rqstp);
+	return status;
+}
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+				       struct nfs4_get_lease_time_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+	if (!status)
+		status = decode_putrootfh(&xdr);
+	if (!status)
+		status = decode_fsinfo(&xdr, res->lr_fsinfo);
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4686,6 +5487,13 @@ struct rpc_procinfo nfs4_procedures[] = {
 	PROC(GETACL,		enc_getacl,		dec_getacl),
 	PROC(SETACL,		enc_setacl,		dec_setacl),
 	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),
+#if defined(CONFIG_NFS_V4_1)
+	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
+	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
+	PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
+	PROC(SEQUENCE,		enc_sequence,		dec_sequence),
+	PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
+#endif /* CONFIG_NFS_V4_1 */
 };
 
 struct rpc_version nfs_version4 = {
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index e3ed5908820b..8c55b27c0de4 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -92,6 +92,9 @@
 #undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
+/* Default port to use if server is not running a portmapper */
+#define NFS_MNT_PORT	627
+
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT "/tftpboot/%s"
 
@@ -487,6 +490,7 @@ static int __init root_nfs_get_handle(void)
 {
 	struct nfs_fh fh;
 	struct sockaddr_in sin;
+	unsigned int auth_flav_len = 0;
 	struct nfs_mount_request request = {
 		.sap		= (struct sockaddr *)&sin,
 		.salen		= sizeof(sin),
@@ -496,6 +500,7 @@ static int __init root_nfs_get_handle(void)
 		.protocol	= (nfs_data.flags & NFS_MOUNT_TCP) ?
 					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
 		.fh		= &fh,
+		.auth_flav_len	= &auth_flav_len,
 	};
 	int status;
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4ace3c50a8eb..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,10 +18,10 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
-#include <linux/smp_lock.h>
 
 #include <asm/system.h>
 
+#include "nfs4_fs.h"
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
@@ -46,6 +46,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	memset(p, 0, sizeof(*p));
 	INIT_LIST_HEAD(&p->pages);
 	p->npages = pagecount;
+	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	if (pagecount <= ARRAY_SIZE(p->page_array))
 		p->pagevec = p->page_array;
 	else {
@@ -59,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_rdata_mempool);
 }
 
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-	struct nfs_read_data *rdata = data;
-
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
@@ -357,19 +356,25 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	struct nfs_readres *resp = &data->res;
 
 	if (resp->eof || resp->count == argp->count)
-		return;
+		goto out;
 
 	/* This is a short read! */
 	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0)
-		return;
+		goto out;
 
 	/* Yes, so retry the read at the end of the data */
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
-	rpc_restart_call(task);
+	nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+	return;
+out:
+	nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client,
+				&data->res.seq_res);
+	return;
+
 }
 
 /*
@@ -406,7 +411,23 @@ static void nfs_readpage_release_partial(void *calldata)
 	nfs_readdata_release(calldata);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+void nfs_read_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_read_data *data = calldata;
+
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+				&data->args.seq_args, &data->res.seq_res,
+				0, task))
+		return;
+	rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static const struct rpc_call_ops nfs_read_partial_ops = {
+#if defined(CONFIG_NFS_V4_1)
+	.rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_partial,
 	.rpc_release = nfs_readpage_release_partial,
 };
@@ -470,6 +491,9 @@ static void nfs_readpage_release_full(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_read_full_ops = {
+#if defined(CONFIG_NFS_V4_1)
+	.rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_full,
 	.rpc_release = nfs_readpage_release_full,
 };
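
On NFSv4.1 a read RPC may not hit the wire until it owns a session slot, which is why both call-ops tables above gain an .rpc_call_prepare hook. A sketch of the slot lifecycle these hunks assume (function names are from the patch; the body restates the hook's contract, it is not new kernel code):

/* Illustrative slot lifecycle for one v4.1 read RPC. */
static void example_read_prepare(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;

	/* Nonzero return: no free slot; the task was queued on the
	 * slot table and this callback will run again when one frees. */
	if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
				&data->args.seq_args, &data->res.seq_res,
				0, task))
		return;
	rpc_call_start(task);		/* slot held: transmit now */
}

The matching release is visible in nfs_readpage_retry(): a completed or abandoned read frees its slot via nfs4_sequence_free_slot(), while a short-read retry keeps the slot and restarts the RPC with nfs4_restart_rpc(). The sr_slotid = NFS4_MAX_SLOT_TABLE initializer in nfs_readdata_alloc() is the "no slot held yet" sentinel.
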
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d67781c579..0b4cbdc60abd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -42,6 +42,8 @@
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include <linux/namei.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
@@ -90,6 +92,7 @@ enum {
 	Opt_mountport,
 	Opt_mountvers,
 	Opt_nfsvers,
+	Opt_minorversion,
 
 	/* Mount options that take string arguments */
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
@@ -139,22 +142,23 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
-	{ Opt_port, "port=%u" },
-	{ Opt_rsize, "rsize=%u" },
-	{ Opt_wsize, "wsize=%u" },
-	{ Opt_bsize, "bsize=%u" },
-	{ Opt_timeo, "timeo=%u" },
-	{ Opt_retrans, "retrans=%u" },
-	{ Opt_acregmin, "acregmin=%u" },
-	{ Opt_acregmax, "acregmax=%u" },
-	{ Opt_acdirmin, "acdirmin=%u" },
-	{ Opt_acdirmax, "acdirmax=%u" },
-	{ Opt_actimeo, "actimeo=%u" },
-	{ Opt_namelen, "namlen=%u" },
-	{ Opt_mountport, "mountport=%u" },
-	{ Opt_mountvers, "mountvers=%u" },
-	{ Opt_nfsvers, "nfsvers=%u" },
-	{ Opt_nfsvers, "vers=%u" },
+	{ Opt_port, "port=%s" },
+	{ Opt_rsize, "rsize=%s" },
+	{ Opt_wsize, "wsize=%s" },
+	{ Opt_bsize, "bsize=%s" },
+	{ Opt_timeo, "timeo=%s" },
+	{ Opt_retrans, "retrans=%s" },
+	{ Opt_acregmin, "acregmin=%s" },
+	{ Opt_acregmax, "acregmax=%s" },
+	{ Opt_acdirmin, "acdirmin=%s" },
+	{ Opt_acdirmax, "acdirmax=%s" },
+	{ Opt_actimeo, "actimeo=%s" },
+	{ Opt_namelen, "namlen=%s" },
+	{ Opt_mountport, "mountport=%s" },
+	{ Opt_mountvers, "mountvers=%s" },
+	{ Opt_nfsvers, "nfsvers=%s" },
+	{ Opt_nfsvers, "vers=%s" },
+	{ Opt_minorversion, "minorversion=%u" },
 
 	{ Opt_sec, "sec=%s" },
 	{ Opt_proto, "proto=%s" },
@@ -270,10 +274,14 @@ static const struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs4_kill_super(struct super_block *sb);
 
 static struct file_system_type nfs4_fs_type = {
@@ -284,6 +292,14 @@ static struct file_system_type nfs4_fs_type = {
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct file_system_type nfs4_remote_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs4_remote_get_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 struct file_system_type nfs4_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
@@ -292,6 +308,14 @@ struct file_system_type nfs4_xdev_fs_type = {
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct file_system_type nfs4_remote_referral_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs4_remote_referral_get_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 struct file_system_type nfs4_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
@@ -514,7 +538,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", ",nointr" },
 		{ NFS_MOUNT_POSIX, ",posix", "" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
@@ -943,11 +966,6 @@ static int nfs_parse_security_flavors(char *value,
 	return 1;
 }
 
-static void nfs_parse_invalid_value(const char *option)
-{
-	dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
-}
-
 /*
  * Error-check and convert a string of mount options from user space into
  * a data structure. The whole mount string is processed; bad options are
@@ -958,7 +976,7 @@ static int nfs_parse_mount_options(char *raw,
 			struct nfs_parsed_mount_data *mnt)
 {
 	char *p, *string, *secdata;
-	int rc, sloppy = 0, errors = 0;
+	int rc, sloppy = 0, invalid_option = 0;
 
 	if (!raw) {
 		dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -982,7 +1000,9 @@ static int nfs_parse_mount_options(char *raw,
 
 	while ((p = strsep(&raw, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
-		int option, token;
+		unsigned long option;
+		int int_option;
+		int token;
 
 		if (!*p)
 			continue;
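
Each numeric option case below now follows one pattern: duplicate the matched substring, parse it with strict_strtoul() (which, unlike match_int(), accepts the full unsigned range and rejects trailing junk), free the copy, and branch to a common error label. A hypothetical helper capturing the pattern the patch open-codes in every case:

static int nfs_get_option_ul(substring_t args[], unsigned long *option)
{
	char *string;
	int rc;

	string = match_strdup(args);	/* kmalloc'd copy of the "=value" part */
	if (string == NULL)
		return -ENOMEM;
	rc = strict_strtoul(string, 10, option);	/* strict base-10 parse */
	kfree(string);
	return rc;	/* 0 on success */
}

Callers still apply their own range checks (port > USHORT_MAX, timeo == 0, and so on) before assigning the value, and route parse failures to out_invalid_value instead of bumping a per-option error counter.
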
@@ -1091,114 +1111,156 @@ static int nfs_parse_mount_options(char *raw,
 		 * options that take numeric values
 		 */
 		case Opt_port:
-			if (match_int(args, &option) ||
-			    option < 0 || option > USHORT_MAX) {
-				errors++;
-				nfs_parse_invalid_value("port");
-			} else
-				mnt->nfs_server.port = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0 || option > USHORT_MAX)
+				goto out_invalid_value;
+			mnt->nfs_server.port = option;
 			break;
 		case Opt_rsize:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("rsize");
-			} else
-				mnt->rsize = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->rsize = option;
 			break;
 		case Opt_wsize:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("wsize");
-			} else
-				mnt->wsize = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->wsize = option;
 			break;
 		case Opt_bsize:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("bsize");
-			} else
-				mnt->bsize = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->bsize = option;
 			break;
 		case Opt_timeo:
-			if (match_int(args, &option) || option <= 0) {
-				errors++;
-				nfs_parse_invalid_value("timeo");
-			} else
-				mnt->timeo = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0 || option == 0)
+				goto out_invalid_value;
+			mnt->timeo = option;
 			break;
 		case Opt_retrans:
-			if (match_int(args, &option) || option <= 0) {
-				errors++;
-				nfs_parse_invalid_value("retrans");
-			} else
-				mnt->retrans = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0 || option == 0)
+				goto out_invalid_value;
+			mnt->retrans = option;
 			break;
 		case Opt_acregmin:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acregmin");
-			} else
-				mnt->acregmin = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acregmin = option;
 			break;
 		case Opt_acregmax:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acregmax");
-			} else
-				mnt->acregmax = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acregmax = option;
 			break;
 		case Opt_acdirmin:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acdirmin");
-			} else
-				mnt->acdirmin = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acdirmin = option;
 			break;
 		case Opt_acdirmax:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acdirmax");
-			} else
-				mnt->acdirmax = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acdirmax = option;
 			break;
 		case Opt_actimeo:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("actimeo");
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
1168 } else 1217 rc = strict_strtoul(string, 10, &option);
1169 mnt->acregmin = mnt->acregmax = 1218 kfree(string);
1170 mnt->acdirmin = mnt->acdirmax = option; 1219 if (rc != 0)
1220 goto out_invalid_value;
1221 mnt->acregmin = mnt->acregmax =
1222 mnt->acdirmin = mnt->acdirmax = option;
1171 break; 1223 break;
1172 case Opt_namelen: 1224 case Opt_namelen:
1173 if (match_int(args, &option) || option < 0) { 1225 string = match_strdup(args);
1174 errors++; 1226 if (string == NULL)
1175 nfs_parse_invalid_value("namlen"); 1227 goto out_nomem;
1176 } else 1228 rc = strict_strtoul(string, 10, &option);
1177 mnt->namlen = option; 1229 kfree(string);
1230 if (rc != 0)
1231 goto out_invalid_value;
1232 mnt->namlen = option;
1178 break; 1233 break;
1179 case Opt_mountport: 1234 case Opt_mountport:
1180 if (match_int(args, &option) || 1235 string = match_strdup(args);
1181 option < 0 || option > USHORT_MAX) { 1236 if (string == NULL)
1182 errors++; 1237 goto out_nomem;
1183 nfs_parse_invalid_value("mountport"); 1238 rc = strict_strtoul(string, 10, &option);
1184 } else 1239 kfree(string);
1185 mnt->mount_server.port = option; 1240 if (rc != 0 || option > USHORT_MAX)
1241 goto out_invalid_value;
1242 mnt->mount_server.port = option;
1186 break; 1243 break;
1187 case Opt_mountvers: 1244 case Opt_mountvers:
1188 if (match_int(args, &option) || 1245 string = match_strdup(args);
1246 if (string == NULL)
1247 goto out_nomem;
1248 rc = strict_strtoul(string, 10, &option);
1249 kfree(string);
1250 if (rc != 0 ||
1189 option < NFS_MNT_VERSION || 1251 option < NFS_MNT_VERSION ||
1190 option > NFS_MNT3_VERSION) { 1252 option > NFS_MNT3_VERSION)
1191 errors++; 1253 goto out_invalid_value;
1192 nfs_parse_invalid_value("mountvers"); 1254 mnt->mount_server.version = option;
1193 } else
1194 mnt->mount_server.version = option;
1195 break; 1255 break;
1196 case Opt_nfsvers: 1256 case Opt_nfsvers:
1197 if (match_int(args, &option)) { 1257 string = match_strdup(args);
1198 errors++; 1258 if (string == NULL)
1199 nfs_parse_invalid_value("nfsvers"); 1259 goto out_nomem;
1200 break; 1260 rc = strict_strtoul(string, 10, &option);
1201 } 1261 kfree(string);
1262 if (rc != 0)
1263 goto out_invalid_value;
1202 switch (option) { 1264 switch (option) {
1203 case NFS2_VERSION: 1265 case NFS2_VERSION:
1204 mnt->flags &= ~NFS_MOUNT_VER3; 1266 mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1207,10 +1269,16 @@ static int nfs_parse_mount_options(char *raw,
1207 mnt->flags |= NFS_MOUNT_VER3; 1269 mnt->flags |= NFS_MOUNT_VER3;
1208 break; 1270 break;
1209 default: 1271 default:
1210 errors++; 1272 goto out_invalid_value;
1211 nfs_parse_invalid_value("nfsvers");
1212 } 1273 }
1213 break; 1274 break;
1275 case Opt_minorversion:
1276 if (match_int(args, &int_option))
1277 return 0;
1278 if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
1279 return 0;
1280 mnt->minorversion = int_option;
1281 break;
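
The hunks above replace match_int() with match_strdup() plus strict_strtoul(), which rejects trailing junk, negative input, and values outside the option's range instead of silently truncating them. A rough userspace equivalent built on strtoul(3) (parse_port() and the local USHORT_MAX are stand-ins for illustration, not kernel APIs):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define USHORT_MAX 65535UL      /* mirrors the kernel's limit for port= */

    static int parse_port(const char *string, unsigned long *out)
    {
            char *end;
            unsigned long val;

            errno = 0;
            val = strtoul(string, &end, 10);
            if (errno != 0 || end == string || *end != '\0')
                    return -1;      /* not a clean decimal number */
            if (val > USHORT_MAX)
                    return -1;      /* out of range for a port */
            *out = val;
            return 0;
    }

    int main(void)
    {
            unsigned long port;

            if (parse_port("2049", &port) == 0)
                    printf("port = %lu\n", port);
            if (parse_port("99999", &port) != 0)
                    printf("99999 rejected, as expected\n");
            return 0;
    }
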
1214 1282
1215 /* 1283 /*
1216 * options that take text values 1284 * options that take text values
@@ -1222,9 +1290,9 @@ static int nfs_parse_mount_options(char *raw,
1222 rc = nfs_parse_security_flavors(string, mnt); 1290 rc = nfs_parse_security_flavors(string, mnt);
1223 kfree(string); 1291 kfree(string);
1224 if (!rc) { 1292 if (!rc) {
1225 errors++;
1226 dfprintk(MOUNT, "NFS: unrecognized " 1293 dfprintk(MOUNT, "NFS: unrecognized "
1227 "security flavor\n"); 1294 "security flavor\n");
1295 return 0;
1228 } 1296 }
1229 break; 1297 break;
1230 case Opt_proto: 1298 case Opt_proto:
@@ -1238,23 +1306,25 @@ static int nfs_parse_mount_options(char *raw,
1238 case Opt_xprt_udp: 1306 case Opt_xprt_udp:
1239 mnt->flags &= ~NFS_MOUNT_TCP; 1307 mnt->flags &= ~NFS_MOUNT_TCP;
1240 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1308 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1309 kfree(string);
1241 break; 1310 break;
1242 case Opt_xprt_tcp: 1311 case Opt_xprt_tcp:
1243 mnt->flags |= NFS_MOUNT_TCP; 1312 mnt->flags |= NFS_MOUNT_TCP;
1244 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1313 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1314 kfree(string);
1245 break; 1315 break;
1246 case Opt_xprt_rdma: 1316 case Opt_xprt_rdma:
1247 /* vector side protocols to TCP */ 1317 /* vector side protocols to TCP */
1248 mnt->flags |= NFS_MOUNT_TCP; 1318 mnt->flags |= NFS_MOUNT_TCP;
1249 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1319 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1250 xprt_load_transport(string); 1320 xprt_load_transport(string);
1321 kfree(string);
1251 break; 1322 break;
1252 default: 1323 default:
1253 errors++;
1254 dfprintk(MOUNT, "NFS: unrecognized " 1324 dfprintk(MOUNT, "NFS: unrecognized "
1255 "transport protocol\n"); 1325 "transport protocol\n");
1326 return 0;
1256 } 1327 }
1257 kfree(string);
1258 break; 1328 break;
1259 case Opt_mountproto: 1329 case Opt_mountproto:
1260 string = match_strdup(args); 1330 string = match_strdup(args);
@@ -1273,9 +1343,9 @@ static int nfs_parse_mount_options(char *raw,
1273 break; 1343 break;
1274 case Opt_xprt_rdma: /* not used for side protocols */ 1344 case Opt_xprt_rdma: /* not used for side protocols */
1275 default: 1345 default:
1276 errors++;
1277 dfprintk(MOUNT, "NFS: unrecognized " 1346 dfprintk(MOUNT, "NFS: unrecognized "
1278 "transport protocol\n"); 1347 "transport protocol\n");
1348 return 0;
1279 } 1349 }
1280 break; 1350 break;
1281 case Opt_addr: 1351 case Opt_addr:
@@ -1331,9 +1401,9 @@ static int nfs_parse_mount_options(char *raw,
1331 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; 1401 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
1332 break; 1402 break;
1333 default: 1403 default:
1334 errors++;
1335 dfprintk(MOUNT, "NFS: invalid " 1404 dfprintk(MOUNT, "NFS: invalid "
1336 "lookupcache argument\n"); 1405 "lookupcache argument\n");
1406 return 0;
1337 }; 1407 };
1338 break; 1408 break;
1339 1409
@@ -1351,20 +1421,20 @@ static int nfs_parse_mount_options(char *raw,
1351 break; 1421 break;
1352 1422
1353 default: 1423 default:
1354 errors++; 1424 invalid_option = 1;
1355 dfprintk(MOUNT, "NFS: unrecognized mount option " 1425 dfprintk(MOUNT, "NFS: unrecognized mount option "
1356 "'%s'\n", p); 1426 "'%s'\n", p);
1357 } 1427 }
1358 } 1428 }
1359 1429
1360 if (errors > 0) { 1430 if (!sloppy && invalid_option)
1361 dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", 1431 return 0;
1362 errors, (errors == 1 ? "" : "s")); 1432
1363 if (!sloppy)
1364 return 0;
1365 }
1366 return 1; 1433 return 1;
1367 1434
1435out_invalid_value:
1436 printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
1437 return 0;
1368out_nomem: 1438out_nomem:
1369 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1439 printk(KERN_INFO "NFS: not enough memory to parse option\n");
1370 return 0; 1440 return 0;
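
With the per-option error counting gone, the parser funnels every failure to the shared out_invalid_value/out_nomem labels, so the success path reads straight through and each error message lives in exactly one place. A compressed sketch of that single-exit style (parse_one() is invented for illustration):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_one(const char *p)
    {
            char *string = strdup(p);       /* stands in for match_strdup() */
            unsigned long val;
            char *end;

            if (string == NULL)
                    goto out_nomem;
            errno = 0;
            val = strtoul(string, &end, 10);
            if (errno != 0 || end == string || *end != '\0') {
                    free(string);
                    goto out_invalid_value;
            }
            free(string);
            printf("parsed %lu\n", val);
            return 1;

    out_invalid_value:
            fprintf(stderr, "bad option value specified: %s\n", p);
            return 0;
    out_nomem:
            fprintf(stderr, "not enough memory to parse option\n");
            return 0;
    }

    int main(void)
    {
            return (parse_one("1500") && !parse_one("15x0")) ? 0 : 1;
    }
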
@@ -1381,6 +1451,7 @@ out_security_failure:
1381static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1451static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1382 struct nfs_fh *root_fh) 1452 struct nfs_fh *root_fh)
1383{ 1453{
1454 unsigned int auth_flavor_len = 0;
1384 struct nfs_mount_request request = { 1455 struct nfs_mount_request request = {
1385 .sap = (struct sockaddr *) 1456 .sap = (struct sockaddr *)
1386 &args->mount_server.address, 1457 &args->mount_server.address,
@@ -1388,6 +1459,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1388 .protocol = args->mount_server.protocol, 1459 .protocol = args->mount_server.protocol,
1389 .fh = root_fh, 1460 .fh = root_fh,
1390 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1461 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1462 .auth_flav_len = &auth_flavor_len,
1391 }; 1463 };
1392 int status; 1464 int status;
1393 1465
@@ -1813,6 +1885,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1813 if (data == NULL) 1885 if (data == NULL)
1814 return -ENOMEM; 1886 return -ENOMEM;
1815 1887
1888 lock_kernel();
1816 /* fill out struct with values from existing mount */ 1889 /* fill out struct with values from existing mount */
1817 data->flags = nfss->flags; 1890 data->flags = nfss->flags;
1818 data->rsize = nfss->rsize; 1891 data->rsize = nfss->rsize;
@@ -1837,6 +1910,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1837 error = nfs_compare_remount_data(nfss, data); 1910 error = nfs_compare_remount_data(nfss, data);
1838out: 1911out:
1839 kfree(data); 1912 kfree(data);
1913 unlock_kernel();
1840 return error; 1914 return error;
1841} 1915}
1842 1916
@@ -2238,6 +2312,11 @@ static void nfs4_fill_super(struct super_block *sb)
2238 nfs_initialise_sb(sb); 2312 nfs_initialise_sb(sb);
2239} 2313}
2240 2314
2315static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2316{
2317 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
2318}
2319
2241/* 2320/*
2242 * Validate NFSv4 mount options 2321 * Validate NFSv4 mount options
2243 */ 2322 */
@@ -2261,6 +2340,7 @@ static int nfs4_validate_mount_data(void *options,
2261 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ 2340 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
2262 args->auth_flavors[0] = RPC_AUTH_UNIX; 2341 args->auth_flavors[0] = RPC_AUTH_UNIX;
2263 args->auth_flavor_len = 0; 2342 args->auth_flavor_len = 0;
2343 args->minorversion = 0;
2264 2344
2265 switch (data->version) { 2345 switch (data->version) {
2266 case 1: 2346 case 1:
@@ -2334,6 +2414,8 @@ static int nfs4_validate_mount_data(void *options,
2334 2414
2335 nfs_validate_transport_protocol(args); 2415 nfs_validate_transport_protocol(args);
2336 2416
2417 nfs4_validate_mount_flags(args);
2418
2337 if (args->auth_flavor_len > 1) 2419 if (args->auth_flavor_len > 1)
2338 goto out_inval_auth; 2420 goto out_inval_auth;
2339 2421
@@ -2373,12 +2455,12 @@ out_no_client_address:
2373} 2455}
2374 2456
2375/* 2457/*
2376 * Get the superblock for an NFS4 mountpoint 2458 * Get the superblock for the NFS4 root partition
2377 */ 2459 */
2378static int nfs4_get_sb(struct file_system_type *fs_type, 2460static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2379 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2461 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2380{ 2462{
2381 struct nfs_parsed_mount_data *data; 2463 struct nfs_parsed_mount_data *data = raw_data;
2382 struct super_block *s; 2464 struct super_block *s;
2383 struct nfs_server *server; 2465 struct nfs_server *server;
2384 struct nfs_fh *mntfh; 2466 struct nfs_fh *mntfh;
@@ -2389,18 +2471,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2389 }; 2471 };
2390 int error = -ENOMEM; 2472 int error = -ENOMEM;
2391 2473
2392 data = kzalloc(sizeof(*data), GFP_KERNEL);
2393 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2474 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
2394 if (data == NULL || mntfh == NULL) 2475 if (data == NULL || mntfh == NULL)
2395 goto out_free_fh; 2476 goto out_free_fh;
2396 2477
2397 security_init_mnt_opts(&data->lsm_opts); 2478 security_init_mnt_opts(&data->lsm_opts);
2398 2479
2399 /* Validate the mount data */
2400 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2401 if (error < 0)
2402 goto out;
2403
2404 /* Get a volume representation */ 2480 /* Get a volume representation */
2405 server = nfs4_create_server(data, mntfh); 2481 server = nfs4_create_server(data, mntfh);
2406 if (IS_ERR(server)) { 2482 if (IS_ERR(server)) {
@@ -2413,7 +2489,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2413 compare_super = NULL; 2489 compare_super = NULL;
2414 2490
2415 /* Get a superblock - note that we may end up sharing one that already exists */ 2491 /* Get a superblock - note that we may end up sharing one that already exists */
2416 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2492 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2417 if (IS_ERR(s)) { 2493 if (IS_ERR(s)) {
2418 error = PTR_ERR(s); 2494 error = PTR_ERR(s);
2419 goto out_free; 2495 goto out_free;
@@ -2450,14 +2526,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2450 error = 0; 2526 error = 0;
2451 2527
2452out: 2528out:
2453 kfree(data->client_address);
2454 kfree(data->nfs_server.export_path);
2455 kfree(data->nfs_server.hostname);
2456 kfree(data->fscache_uniq);
2457 security_free_mnt_opts(&data->lsm_opts); 2529 security_free_mnt_opts(&data->lsm_opts);
2458out_free_fh: 2530out_free_fh:
2459 kfree(mntfh); 2531 kfree(mntfh);
2460 kfree(data);
2461 return error; 2532 return error;
2462 2533
2463out_free: 2534out_free:
@@ -2471,16 +2542,137 @@ error_splat_super:
2471 goto out; 2542 goto out;
2472} 2543}
2473 2544
2545static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2546 int flags, void *data, const char *hostname)
2547{
2548 struct vfsmount *root_mnt;
2549 char *root_devname;
2550 size_t len;
2551
2552 len = strlen(hostname) + 3;
2553 root_devname = kmalloc(len, GFP_KERNEL);
2554 if (root_devname == NULL)
2555 return ERR_PTR(-ENOMEM);
2556 snprintf(root_devname, len, "%s:/", hostname);
2557 root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data);
2558 kfree(root_devname);
2559 return root_mnt;
2560}
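
nfs_do_root_mount() builds the device name "<hostname>:/"; the +3 in the length covers the ':', the '/', and the terminating NUL. The same arithmetic in standalone form (make_root_devname() is a made-up name for the sketch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *make_root_devname(const char *hostname)
    {
            size_t len = strlen(hostname) + 3;      /* host + ':' + '/' + NUL */
            char *devname = malloc(len);

            if (devname == NULL)
                    return NULL;
            snprintf(devname, len, "%s:/", hostname);
            return devname;
    }

    int main(void)
    {
            char *name = make_root_devname("server.example.com");

            if (name != NULL) {
                    printf("%s\n", name);           /* server.example.com:/ */
                    free(name);
            }
            return 0;
    }
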
2561
2562static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2563{
2564 char *page = (char *) __get_free_page(GFP_KERNEL);
2565 char *devname, *tmp;
2566
2567 if (page == NULL)
2568 return;
2569 devname = nfs_path(path->mnt->mnt_devname,
2570 path->mnt->mnt_root, path->dentry,
2571 page, PAGE_SIZE);
2572 if (devname == NULL)
2573 goto out_freepage;
2574 tmp = kstrdup(devname, GFP_KERNEL);
2575 if (tmp == NULL)
2576 goto out_freepage;
2577 kfree(mnt->mnt_devname);
2578 mnt->mnt_devname = tmp;
2579out_freepage:
2580 free_page((unsigned long)page);
2581}
2582
2583static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2584 const char *export_path, struct vfsmount *mnt_target)
2585{
2586 struct mnt_namespace *ns_private;
2587 struct nameidata nd;
2588 struct super_block *s;
2589 int ret;
2590
2591 ns_private = create_mnt_ns(root_mnt);
2592 ret = PTR_ERR(ns_private);
2593 if (IS_ERR(ns_private))
2594 goto out_mntput;
2595
2596 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2597 export_path, LOOKUP_FOLLOW, &nd);
2598
2599 put_mnt_ns(ns_private);
2600
2601 if (ret != 0)
2602 goto out_err;
2603
2604 s = nd.path.mnt->mnt_sb;
2605 atomic_inc(&s->s_active);
2606 mnt_target->mnt_sb = s;
2607 mnt_target->mnt_root = dget(nd.path.dentry);
2608
2609 /* Correct the device pathname */
2610 nfs_fix_devname(&nd.path, mnt_target);
2611
2612 path_put(&nd.path);
2613 down_write(&s->s_umount);
2614 return 0;
2615out_mntput:
2616 mntput(root_mnt);
2617out_err:
2618 return ret;
2619}
2620
2621/*
2622 * Get the superblock for an NFS4 mountpoint
2623 */
2624static int nfs4_get_sb(struct file_system_type *fs_type,
2625 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2626{
2627 struct nfs_parsed_mount_data *data;
2628 char *export_path;
2629 struct vfsmount *root_mnt;
2630 int error = -ENOMEM;
2631
2632 data = kzalloc(sizeof(*data), GFP_KERNEL);
2633 if (data == NULL)
2634 goto out_free_data;
2635
2636 /* Validate the mount data */
2637 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2638 if (error < 0)
2639 goto out;
2640
2641 export_path = data->nfs_server.export_path;
2642 data->nfs_server.export_path = "/";
2643 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2644 data->nfs_server.hostname);
2645 data->nfs_server.export_path = export_path;
2646
2647 error = PTR_ERR(root_mnt);
2648 if (IS_ERR(root_mnt))
2649 goto out;
2650
2651 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2652
2653out:
2654 kfree(data->client_address);
2655 kfree(data->nfs_server.export_path);
2656 kfree(data->nfs_server.hostname);
2657 kfree(data->fscache_uniq);
2658out_free_data:
2659 kfree(data);
2660 dprintk("<-- nfs4_get_sb() = %d%s\n", error,
2661 error != 0 ? " [error]" : "");
2662 return error;
2663}
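
nfs4_get_sb() temporarily swaps the parsed export path for "/" so the first mount lands on the server root, then restores it before following the remote path to the real export. The idiom reduced to a few lines (mount_with_path() is a placeholder, not a kernel function):

    #include <stdio.h>

    struct mount_data {
            const char *export_path;
    };

    static void mount_with_path(const struct mount_data *data)
    {
            printf("mounting with path '%s'\n", data->export_path);
    }

    int main(void)
    {
            struct mount_data data = { .export_path = "/exports/home" };
            const char *saved = data.export_path;

            data.export_path = "/";         /* mount the root of the server */
            mount_with_path(&data);
            data.export_path = saved;       /* restore for the follow-up walk */

            printf("now following '%s'\n", data.export_path);
            return 0;
    }
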
2664
2474static void nfs4_kill_super(struct super_block *sb) 2665static void nfs4_kill_super(struct super_block *sb)
2475{ 2666{
2476 struct nfs_server *server = NFS_SB(sb); 2667 struct nfs_server *server = NFS_SB(sb);
2477 2668
2669 dprintk("--> %s\n", __func__);
2478 nfs_super_return_all_delegations(sb); 2670 nfs_super_return_all_delegations(sb);
2479 kill_anon_super(sb); 2671 kill_anon_super(sb);
2480
2481 nfs4_renewd_prepare_shutdown(server); 2672 nfs4_renewd_prepare_shutdown(server);
2482 nfs_fscache_release_super_cookie(sb); 2673 nfs_fscache_release_super_cookie(sb);
2483 nfs_free_server(server); 2674 nfs_free_server(server);
2675 dprintk("<-- %s\n", __func__);
2484} 2676}
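
The added dprintk() pairs give nfs4_kill_super() entry/exit tracing keyed on __func__. A simplified userspace version of the pattern (the real dfprintk() is gated on per-facility debug flags; this one prints unconditionally):

    #include <stdio.h>

    #define dfprintk(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)

    static void teardown(void)
    {
            dfprintk("--> %s\n", __func__);
            /* ... release resources here ... */
            dfprintk("<-- %s\n", __func__);
    }

    int main(void)
    {
            teardown();
            return 0;
    }
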
2485 2677
2486/* 2678/*
@@ -2566,12 +2758,9 @@ error_splat_super:
2566 return error; 2758 return error;
2567} 2759}
2568 2760
2569/* 2761static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2570 * Create an NFS4 server record on referral traversal 2762 int flags, const char *dev_name, void *raw_data,
2571 */ 2763 struct vfsmount *mnt)
2572static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
2573 const char *dev_name, void *raw_data,
2574 struct vfsmount *mnt)
2575{ 2764{
2576 struct nfs_clone_mount *data = raw_data; 2765 struct nfs_clone_mount *data = raw_data;
2577 struct super_block *s; 2766 struct super_block *s;
@@ -2650,4 +2839,36 @@ error_splat_super:
2650 return error; 2839 return error;
2651} 2840}
2652 2841
2842/*
2843 * Create an NFS4 server record on referral traversal
2844 */
2845static int nfs4_referral_get_sb(struct file_system_type *fs_type,
2846 int flags, const char *dev_name, void *raw_data,
2847 struct vfsmount *mnt)
2848{
2849 struct nfs_clone_mount *data = raw_data;
2850 char *export_path;
2851 struct vfsmount *root_mnt;
2852 int error;
2853
2854 dprintk("--> nfs4_referral_get_sb()\n");
2855
2856 export_path = data->mnt_path;
2857 data->mnt_path = "/";
2858
2859 root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type,
2860 flags, data, data->hostname);
2861 data->mnt_path = export_path;
2862
2863 error = PTR_ERR(root_mnt);
2864 if (IS_ERR(root_mnt))
2865 goto out;
2866
2867 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2868out:
2869 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error,
2870 error != 0 ? " [error]" : "");
2871 return error;
2872}
2873
2653#endif /* CONFIG_NFS_V4 */ 2874#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index ecc295347775..1064c91ae810 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -15,6 +15,7 @@
15#include <linux/wait.h> 15#include <linux/wait.h>
16 16
17#include "internal.h" 17#include "internal.h"
18#include "nfs4_fs.h"
18 19
19struct nfs_unlinkdata { 20struct nfs_unlinkdata {
20 struct hlist_node list; 21 struct hlist_node list;
@@ -82,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
82 struct inode *dir = data->dir; 83 struct inode *dir = data->dir;
83 84
84 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 85 if (!NFS_PROTO(dir)->unlink_done(task, dir))
85 rpc_restart_call(task); 86 nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
86} 87}
87 88
88/** 89/**
@@ -102,9 +103,25 @@ static void nfs_async_unlink_release(void *calldata)
102 nfs_sb_deactive(sb); 103 nfs_sb_deactive(sb);
103} 104}
104 105
106#if defined(CONFIG_NFS_V4_1)
107void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
108{
109 struct nfs_unlinkdata *data = calldata;
110 struct nfs_server *server = NFS_SERVER(data->dir);
111
112 if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
113 &data->res.seq_res, 1, task))
114 return;
115 rpc_call_start(task);
116}
117#endif /* CONFIG_NFS_V4_1 */
118
105static const struct rpc_call_ops nfs_unlink_ops = { 119static const struct rpc_call_ops nfs_unlink_ops = {
106 .rpc_call_done = nfs_async_unlink_done, 120 .rpc_call_done = nfs_async_unlink_done,
107 .rpc_release = nfs_async_unlink_release, 121 .rpc_release = nfs_async_unlink_release,
122#if defined(CONFIG_NFS_V4_1)
123 .rpc_call_prepare = nfs_unlink_prepare,
124#endif /* CONFIG_NFS_V4_1 */
108}; 125};
109 126
110static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) 127static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
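
nfs_unlink_ops now supplies .rpc_call_prepare only when CONFIG_NFS_V4_1 is set; with designated initializers an omitted member is simply NULL, so callers can test the pointer before invoking it. A self-contained illustration (CONFIG_V4_1 and the ops layout are invented for the example):

    #include <stdio.h>

    struct call_ops {
            void (*prepare)(void);
            void (*done)(void);
    };

    #define CONFIG_V4_1 1

    #if defined(CONFIG_V4_1)
    static void my_prepare(void) { printf("prepare (v4.1 sessions)\n"); }
    #endif
    static void my_done(void) { printf("done\n"); }

    static const struct call_ops ops = {
    #if defined(CONFIG_V4_1)
            .prepare = my_prepare,
    #endif
            .done = my_done,
    };

    int main(void)
    {
            if (ops.prepare)        /* NULL when the config option is off */
                    ops.prepare();
            ops.done();
            return 0;
    }
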
@@ -241,6 +258,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
241 status = PTR_ERR(data->cred); 258 status = PTR_ERR(data->cred);
242 goto out_free; 259 goto out_free;
243 } 260 }
261 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
244 262
245 status = -EBUSY; 263 status = -EBUSY;
246 spin_lock(&dentry->d_lock); 264 spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e560a78995a3..a34fae21fe10 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -25,6 +25,7 @@
25#include "delegation.h" 25#include "delegation.h"
26#include "internal.h" 26#include "internal.h"
27#include "iostat.h" 27#include "iostat.h"
28#include "nfs4_fs.h"
28 29
29#define NFSDBG_FACILITY NFSDBG_PAGECACHE 30#define NFSDBG_FACILITY NFSDBG_PAGECACHE
30 31
@@ -52,6 +53,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
52 if (p) { 53 if (p) {
53 memset(p, 0, sizeof(*p)); 54 memset(p, 0, sizeof(*p));
54 INIT_LIST_HEAD(&p->pages); 55 INIT_LIST_HEAD(&p->pages);
56 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
55 } 57 }
56 return p; 58 return p;
57} 59}
@@ -71,6 +73,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
71 memset(p, 0, sizeof(*p)); 73 memset(p, 0, sizeof(*p));
72 INIT_LIST_HEAD(&p->pages); 74 INIT_LIST_HEAD(&p->pages);
73 p->npages = pagecount; 75 p->npages = pagecount;
76 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
74 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
75 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
76 else { 79 else {
@@ -84,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
84 return p; 87 return p;
85} 88}
86 89
87static void nfs_writedata_free(struct nfs_write_data *p) 90void nfs_writedata_free(struct nfs_write_data *p)
88{ 91{
89 if (p && (p->pagevec != &p->page_array[0])) 92 if (p && (p->pagevec != &p->page_array[0]))
90 kfree(p->pagevec); 93 kfree(p->pagevec);
91 mempool_free(p, nfs_wdata_mempool); 94 mempool_free(p, nfs_wdata_mempool);
92} 95}
93 96
94void nfs_writedata_release(void *data) 97static void nfs_writedata_release(struct nfs_write_data *wdata)
95{ 98{
96 struct nfs_write_data *wdata = data;
97
98 put_nfs_open_context(wdata->args.context); 99 put_nfs_open_context(wdata->args.context);
99 nfs_writedata_free(wdata); 100 nfs_writedata_free(wdata);
100} 101}
@@ -199,8 +200,10 @@ static int nfs_set_page_writeback(struct page *page)
199 struct nfs_server *nfss = NFS_SERVER(inode); 200 struct nfs_server *nfss = NFS_SERVER(inode);
200 201
201 if (atomic_long_inc_return(&nfss->writeback) > 202 if (atomic_long_inc_return(&nfss->writeback) >
202 NFS_CONGESTION_ON_THRESH) 203 NFS_CONGESTION_ON_THRESH) {
203 set_bdi_congested(&nfss->backing_dev_info, WRITE); 204 set_bdi_congested(&nfss->backing_dev_info,
205 BLK_RW_ASYNC);
206 }
204 } 207 }
205 return ret; 208 return ret;
206} 209}
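
The congestion hunks switch to BLK_RW_ASYNC but keep the two-watermark hysteresis: congestion is flagged above NFS_CONGESTION_ON_THRESH and cleared only after in-flight writes fall below NFS_CONGESTION_OFF_THRESH, so the flag cannot flap around a single boundary. The scheme in miniature (the thresholds here are arbitrary, not the kernel's values):

    #include <stdio.h>

    #define CONGESTION_ON_THRESH    64
    #define CONGESTION_OFF_THRESH   48

    static long writeback;
    static int congested;

    static void start_write(void)
    {
            if (++writeback > CONGESTION_ON_THRESH)
                    congested = 1;
    }

    static void end_write(void)
    {
            if (--writeback < CONGESTION_OFF_THRESH)
                    congested = 0;
    }

    int main(void)
    {
            for (int i = 0; i < 70; i++)
                    start_write();
            printf("congested=%d at %ld in flight\n", congested, writeback);
            for (int i = 0; i < 30; i++)
                    end_write();
            printf("congested=%d at %ld in flight\n", congested, writeback);
            return 0;
    }
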
@@ -212,7 +215,7 @@ static void nfs_end_page_writeback(struct page *page)
212 215
213 end_page_writeback(page); 216 end_page_writeback(page);
214 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 217 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
215 clear_bdi_congested(&nfss->backing_dev_info, WRITE); 218 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
216} 219}
217 220
218/* 221/*
@@ -1048,7 +1051,23 @@ out:
1048 nfs_writedata_release(calldata); 1051 nfs_writedata_release(calldata);
1049} 1052}
1050 1053
1054#if defined(CONFIG_NFS_V4_1)
1055void nfs_write_prepare(struct rpc_task *task, void *calldata)
1056{
1057 struct nfs_write_data *data = calldata;
1058 struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
1059
1060 if (nfs4_setup_sequence(clp, &data->args.seq_args,
1061 &data->res.seq_res, 1, task))
1062 return;
1063 rpc_call_start(task);
1064}
1065#endif /* CONFIG_NFS_V4_1 */
1066
1051static const struct rpc_call_ops nfs_write_partial_ops = { 1067static const struct rpc_call_ops nfs_write_partial_ops = {
1068#if defined(CONFIG_NFS_V4_1)
1069 .rpc_call_prepare = nfs_write_prepare,
1070#endif /* CONFIG_NFS_V4_1 */
1052 .rpc_call_done = nfs_writeback_done_partial, 1071 .rpc_call_done = nfs_writeback_done_partial,
1053 .rpc_release = nfs_writeback_release_partial, 1072 .rpc_release = nfs_writeback_release_partial,
1054}; 1073};
@@ -1111,6 +1130,9 @@ remove_request:
1111} 1130}
1112 1131
1113static const struct rpc_call_ops nfs_write_full_ops = { 1132static const struct rpc_call_ops nfs_write_full_ops = {
1133#if defined(CONFIG_NFS_V4_1)
1134 .rpc_call_prepare = nfs_write_prepare,
1135#endif /* CONFIG_NFS_V4_1 */
1114 .rpc_call_done = nfs_writeback_done_full, 1136 .rpc_call_done = nfs_writeback_done_full,
1115 .rpc_release = nfs_writeback_release_full, 1137 .rpc_release = nfs_writeback_release_full,
1116}; 1138};
@@ -1123,6 +1145,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1123{ 1145{
1124 struct nfs_writeargs *argp = &data->args; 1146 struct nfs_writeargs *argp = &data->args;
1125 struct nfs_writeres *resp = &data->res; 1147 struct nfs_writeres *resp = &data->res;
1148 struct nfs_server *server = NFS_SERVER(data->inode);
1126 int status; 1149 int status;
1127 1150
1128 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1151 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1155,7 +1178,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1155 if (time_before(complain, jiffies)) { 1178 if (time_before(complain, jiffies)) {
1156 dprintk("NFS: faulty NFS server %s:" 1179 dprintk("NFS: faulty NFS server %s:"
1157 " (committed = %d) != (stable = %d)\n", 1180 " (committed = %d) != (stable = %d)\n",
1158 NFS_SERVER(data->inode)->nfs_client->cl_hostname, 1181 server->nfs_client->cl_hostname,
1159 resp->verf->committed, argp->stable); 1182 resp->verf->committed, argp->stable);
1160 complain = jiffies + 300 * HZ; 1183 complain = jiffies + 300 * HZ;
1161 } 1184 }
@@ -1181,7 +1204,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1181 */ 1204 */
1182 argp->stable = NFS_FILE_SYNC; 1205 argp->stable = NFS_FILE_SYNC;
1183 } 1206 }
1184 rpc_restart_call(task); 1207 nfs4_restart_rpc(task, server->nfs_client);
1185 return -EAGAIN; 1208 return -EAGAIN;
1186 } 1209 }
1187 if (time_before(complain, jiffies)) { 1210 if (time_before(complain, jiffies)) {
@@ -1193,6 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1193 /* Can't do anything about it except throw an error. */ 1216 /* Can't do anything about it except throw an error. */
1194 task->tk_status = -EIO; 1217 task->tk_status = -EIO;
1195 } 1218 }
1219 nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
1196 return 0; 1220 return 0;
1197} 1221}
1198 1222
@@ -1349,6 +1373,9 @@ static void nfs_commit_release(void *calldata)
1349} 1373}
1350 1374
1351static const struct rpc_call_ops nfs_commit_ops = { 1375static const struct rpc_call_ops nfs_commit_ops = {
1376#if defined(CONFIG_NFS_V4_1)
1377 .rpc_call_prepare = nfs_write_prepare,
1378#endif /* CONFIG_NFS_V4_1 */
1352 .rpc_call_done = nfs_commit_done, 1379 .rpc_call_done = nfs_commit_done,
1353 .rpc_release = nfs_commit_release, 1380 .rpc_release = nfs_commit_release,
1354}; 1381};
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5839b229cd0e..b92a27629fb7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -464,16 +464,11 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
464 if (err) 464 if (err)
465 return err; 465 return err;
466 /* 466 /*
467 * Just a quick sanity check; we could also try to check 467 * XXX: It would be nice to also check whether this
468 * whether this pseudoflavor is supported, but at worst 468 * pseudoflavor is supported, so we can discover the
469 * an unsupported pseudoflavor on the export would just 469 * problem at export time instead of when a client fails
470 * be a pseudoflavor that won't match the flavor of any 470 * to authenticate.
471 * authenticated request. The administrator will
472 * probably discover the problem when someone fails to
473 * authenticate.
474 */ 471 */
475 if (f->pseudoflavor < 0)
476 return -EINVAL;
477 err = get_int(mesg, &f->flags); 472 err = get_int(mesg, &f->flags);
478 if (err) 473 if (err)
479 return err; 474 return err;
@@ -847,9 +842,8 @@ exp_get_fsid_key(svc_client *clp, int fsid)
847 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 842 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
848} 843}
849 844
850static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, 845static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
851 struct dentry *dentry, 846 struct cache_req *reqp)
852 struct cache_req *reqp)
853{ 847{
854 struct svc_export *exp, key; 848 struct svc_export *exp, key;
855 int err; 849 int err;
@@ -858,8 +852,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
858 return ERR_PTR(-ENOENT); 852 return ERR_PTR(-ENOENT);
859 853
860 key.ex_client = clp; 854 key.ex_client = clp;
861 key.ex_path.mnt = mnt; 855 key.ex_path = *path;
862 key.ex_path.dentry = dentry;
863 856
864 exp = svc_export_lookup(&key); 857 exp = svc_export_lookup(&key);
865 if (exp == NULL) 858 if (exp == NULL)
@@ -873,24 +866,19 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
873/* 866/*
874 * Find the export entry for a given dentry. 867 * Find the export entry for a given dentry.
875 */ 868 */
876static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, 869static struct svc_export *exp_parent(svc_client *clp, struct path *path)
877 struct dentry *dentry,
878 struct cache_req *reqp)
879{ 870{
880 svc_export *exp; 871 struct dentry *saved = dget(path->dentry);
881 872 svc_export *exp = exp_get_by_name(clp, path, NULL);
882 dget(dentry); 873
883 exp = exp_get_by_name(clp, mnt, dentry, reqp); 874 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
884 875 struct dentry *parent = dget_parent(path->dentry);
885 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { 876 dput(path->dentry);
886 struct dentry *parent; 877 path->dentry = parent;
887 878 exp = exp_get_by_name(clp, path, NULL);
888 parent = dget_parent(dentry);
889 dput(dentry);
890 dentry = parent;
891 exp = exp_get_by_name(clp, mnt, dentry, reqp);
892 } 879 }
893 dput(dentry); 880 dput(path->dentry);
881 path->dentry = saved;
894 return exp; 882 return exp;
895} 883}
896 884
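
The rewritten exp_parent() saves path->dentry, walks toward the root with dget_parent() until exp_get_by_name() stops returning -ENOENT, and restores the caller's path before returning. A string-based analogue of that upward walk (lookup_export() and the fixed-size buffer are illustrative only):

    #include <stdio.h>
    #include <string.h>

    static int lookup_export(const char *path)
    {
            return strcmp(path, "/srv/nfs") == 0;   /* pretend this is exported */
    }

    static int find_export(char *path)
    {
            char saved[256];
            int found;

            strncpy(saved, path, sizeof(saved) - 1);
            saved[sizeof(saved) - 1] = '\0';

            while (!lookup_export(path) && strcmp(path, "/") != 0) {
                    char *slash = strrchr(path, '/');
                    if (slash == path)
                            slash[1] = '\0';        /* trimmed down to "/" */
                    else
                            slash[0] = '\0';        /* drop last component */
            }
            found = lookup_export(path);
            strcpy(path, saved);                    /* restore, like exp_parent() */
            return found;
    }

    int main(void)
    {
            char path[256] = "/srv/nfs/exports/data";
            printf("export found: %d\n", find_export(path));
            return 0;
    }
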
@@ -1018,7 +1006,7 @@ exp_export(struct nfsctl_export *nxp)
1018 goto out_put_clp; 1006 goto out_put_clp;
1019 err = -EINVAL; 1007 err = -EINVAL;
1020 1008
1021 exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); 1009 exp = exp_get_by_name(clp, &path, NULL);
1022 1010
1023 memset(&new, 0, sizeof(new)); 1011 memset(&new, 0, sizeof(new));
1024 1012
@@ -1135,7 +1123,7 @@ exp_unexport(struct nfsctl_export *nxp)
1135 goto out_domain; 1123 goto out_domain;
1136 1124
1137 err = -EINVAL; 1125 err = -EINVAL;
1138 exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); 1126 exp = exp_get_by_name(dom, &path, NULL);
1139 path_put(&path); 1127 path_put(&path);
1140 if (IS_ERR(exp)) 1128 if (IS_ERR(exp))
1141 goto out_domain; 1129 goto out_domain;
@@ -1177,7 +1165,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
1177 dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", 1165 dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
1178 name, path.dentry, clp->name, 1166 name, path.dentry, clp->name,
1179 inode->i_sb->s_id, inode->i_ino); 1167 inode->i_sb->s_id, inode->i_ino);
1180 exp = exp_parent(clp, path.mnt, path.dentry, NULL); 1168 exp = exp_parent(clp, &path);
1181 if (IS_ERR(exp)) { 1169 if (IS_ERR(exp)) {
1182 err = PTR_ERR(exp); 1170 err = PTR_ERR(exp);
1183 goto out; 1171 goto out;
@@ -1207,7 +1195,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type,
1207 if (IS_ERR(ek)) 1195 if (IS_ERR(ek))
1208 return ERR_CAST(ek); 1196 return ERR_CAST(ek);
1209 1197
1210 exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); 1198 exp = exp_get_by_name(clp, &ek->ek_path, reqp);
1211 cache_put(&ek->h, &svc_expkey_cache); 1199 cache_put(&ek->h, &svc_expkey_cache);
1212 1200
1213 if (IS_ERR(exp)) 1201 if (IS_ERR(exp))
@@ -1247,8 +1235,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
1247 * use exp_get_by_name() or exp_find(). 1235 * use exp_get_by_name() or exp_find().
1248 */ 1236 */
1249struct svc_export * 1237struct svc_export *
1250rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, 1238rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
1251 struct dentry *dentry)
1252{ 1239{
1253 struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); 1240 struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
1254 1241
@@ -1256,8 +1243,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
1256 goto gss; 1243 goto gss;
1257 1244
1258 /* First try the auth_unix client: */ 1245 /* First try the auth_unix client: */
1259 exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, 1246 exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle);
1260 &rqstp->rq_chandle);
1261 if (PTR_ERR(exp) == -ENOENT) 1247 if (PTR_ERR(exp) == -ENOENT)
1262 goto gss; 1248 goto gss;
1263 if (IS_ERR(exp)) 1249 if (IS_ERR(exp))
@@ -1269,8 +1255,7 @@ gss:
1269 /* Otherwise, try falling back on gss client */ 1255 /* Otherwise, try falling back on gss client */
1270 if (rqstp->rq_gssclient == NULL) 1256 if (rqstp->rq_gssclient == NULL)
1271 return exp; 1257 return exp;
1272 gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, 1258 gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle);
1273 &rqstp->rq_chandle);
1274 if (PTR_ERR(gssexp) == -ENOENT) 1259 if (PTR_ERR(gssexp) == -ENOENT)
1275 return exp; 1260 return exp;
1276 if (!IS_ERR(exp)) 1261 if (!IS_ERR(exp))
@@ -1309,23 +1294,19 @@ gss:
1309} 1294}
1310 1295
1311struct svc_export * 1296struct svc_export *
1312rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, 1297rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1313 struct dentry *dentry)
1314{ 1298{
1315 struct svc_export *exp; 1299 struct dentry *saved = dget(path->dentry);
1316 1300 struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
1317 dget(dentry); 1301
1318 exp = rqst_exp_get_by_name(rqstp, mnt, dentry); 1302 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
1319 1303 struct dentry *parent = dget_parent(path->dentry);
1320 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { 1304 dput(path->dentry);
1321 struct dentry *parent; 1305 path->dentry = parent;
1322 1306 exp = rqst_exp_get_by_name(rqstp, path);
1323 parent = dget_parent(dentry);
1324 dput(dentry);
1325 dentry = parent;
1326 exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
1327 } 1307 }
1328 dput(dentry); 1308 dput(path->dentry);
1309 path->dentry = saved;
1329 return exp; 1310 return exp;
1330} 1311}
1331 1312
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7c9fe838f038..a713c418a922 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -652,8 +652,6 @@ nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
652 * NFSv3 Server procedures. 652 * NFSv3 Server procedures.
653 * Only the results of non-idempotent operations are cached. 653 * Only the results of non-idempotent operations are cached.
654 */ 654 */
655#define nfs3svc_decode_voidargs NULL
656#define nfs3svc_release_void NULL
657#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle 655#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle
658#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat 656#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat
659#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat 657#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat
@@ -686,28 +684,219 @@ struct nfsd3_voidargs { int dummy; };
686#define WC (7+pAT) /* WCC attributes */ 684#define WC (7+pAT) /* WCC attributes */
687 685
688static struct svc_procedure nfsd_procedures3[22] = { 686static struct svc_procedure nfsd_procedures3[22] = {
689 PROC(null, void, void, void, RC_NOCACHE, ST), 687 [NFS3PROC_NULL] = {
690 PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), 688 .pc_func = (svc_procfunc) nfsd3_proc_null,
691 PROC(setattr, sattr, wccstat, fhandle, RC_REPLBUFF, ST+WC), 689 .pc_encode = (kxdrproc_t) nfs3svc_encode_voidres,
692 PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), 690 .pc_argsize = sizeof(struct nfsd3_voidargs),
693 PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), 691 .pc_ressize = sizeof(struct nfsd3_voidres),
694 PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), 692 .pc_cachetype = RC_NOCACHE,
695 PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), 693 .pc_xdrressize = ST,
696 PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), 694 },
697 PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 695 [NFS3PROC_GETATTR] = {
698 PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 696 .pc_func = (svc_procfunc) nfsd3_proc_getattr,
699 PROC(symlink, symlink, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 697 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
700 PROC(mknod, mknod, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 698 .pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres,
701 PROC(remove, dirop, wccstat, fhandle, RC_REPLBUFF, ST+WC), 699 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
702 PROC(rmdir, dirop, wccstat, fhandle, RC_REPLBUFF, ST+WC), 700 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
703 PROC(rename, rename, rename, fhandle2, RC_REPLBUFF, ST+WC+WC), 701 .pc_ressize = sizeof(struct nfsd3_attrstatres),
704 PROC(link, link, link, fhandle2, RC_REPLBUFF, ST+pAT+WC), 702 .pc_cachetype = RC_NOCACHE,
705 PROC(readdir, readdir, readdir, fhandle, RC_NOCACHE, 0), 703 .pc_xdrressize = ST+AT,
706 PROC(readdirplus,readdirplus, readdir, fhandle, RC_NOCACHE, 0), 704 },
707 PROC(fsstat, fhandle, fsstat, void, RC_NOCACHE, ST+pAT+2*6+1), 705 [NFS3PROC_SETATTR] = {
708 PROC(fsinfo, fhandle, fsinfo, void, RC_NOCACHE, ST+pAT+12), 706 .pc_func = (svc_procfunc) nfsd3_proc_setattr,
709 PROC(pathconf, fhandle, pathconf, void, RC_NOCACHE, ST+pAT+6), 707 .pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs,
710 PROC(commit, commit, commit, fhandle, RC_NOCACHE, ST+WC+2), 708 .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
709 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
710 .pc_argsize = sizeof(struct nfsd3_sattrargs),
711 .pc_ressize = sizeof(struct nfsd3_wccstatres),
712 .pc_cachetype = RC_REPLBUFF,
713 .pc_xdrressize = ST+WC,
714 },
715 [NFS3PROC_LOOKUP] = {
716 .pc_func = (svc_procfunc) nfsd3_proc_lookup,
717 .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
718 .pc_encode = (kxdrproc_t) nfs3svc_encode_diropres,
719 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
720 .pc_argsize = sizeof(struct nfsd3_diropargs),
721 .pc_ressize = sizeof(struct nfsd3_diropres),
722 .pc_cachetype = RC_NOCACHE,
723 .pc_xdrressize = ST+FH+pAT+pAT,
724 },
725 [NFS3PROC_ACCESS] = {
726 .pc_func = (svc_procfunc) nfsd3_proc_access,
727 .pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs,
728 .pc_encode = (kxdrproc_t) nfs3svc_encode_accessres,
729 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
730 .pc_argsize = sizeof(struct nfsd3_accessargs),
731 .pc_ressize = sizeof(struct nfsd3_accessres),
732 .pc_cachetype = RC_NOCACHE,
733 .pc_xdrressize = ST+pAT+1,
734 },
735 [NFS3PROC_READLINK] = {
736 .pc_func = (svc_procfunc) nfsd3_proc_readlink,
737 .pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs,
738 .pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres,
739 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
740 .pc_argsize = sizeof(struct nfsd3_readlinkargs),
741 .pc_ressize = sizeof(struct nfsd3_readlinkres),
742 .pc_cachetype = RC_NOCACHE,
743 .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
744 },
745 [NFS3PROC_READ] = {
746 .pc_func = (svc_procfunc) nfsd3_proc_read,
747 .pc_decode = (kxdrproc_t) nfs3svc_decode_readargs,
748 .pc_encode = (kxdrproc_t) nfs3svc_encode_readres,
749 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
750 .pc_argsize = sizeof(struct nfsd3_readargs),
751 .pc_ressize = sizeof(struct nfsd3_readres),
752 .pc_cachetype = RC_NOCACHE,
753 .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
754 },
755 [NFS3PROC_WRITE] = {
756 .pc_func = (svc_procfunc) nfsd3_proc_write,
757 .pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs,
758 .pc_encode = (kxdrproc_t) nfs3svc_encode_writeres,
759 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
760 .pc_argsize = sizeof(struct nfsd3_writeargs),
761 .pc_ressize = sizeof(struct nfsd3_writeres),
762 .pc_cachetype = RC_REPLBUFF,
763 .pc_xdrressize = ST+WC+4,
764 },
765 [NFS3PROC_CREATE] = {
766 .pc_func = (svc_procfunc) nfsd3_proc_create,
767 .pc_decode = (kxdrproc_t) nfs3svc_decode_createargs,
768 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
769 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
770 .pc_argsize = sizeof(struct nfsd3_createargs),
771 .pc_ressize = sizeof(struct nfsd3_createres),
772 .pc_cachetype = RC_REPLBUFF,
773 .pc_xdrressize = ST+(1+FH+pAT)+WC,
774 },
775 [NFS3PROC_MKDIR] = {
776 .pc_func = (svc_procfunc) nfsd3_proc_mkdir,
777 .pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs,
778 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
779 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
780 .pc_argsize = sizeof(struct nfsd3_mkdirargs),
781 .pc_ressize = sizeof(struct nfsd3_createres),
782 .pc_cachetype = RC_REPLBUFF,
783 .pc_xdrressize = ST+(1+FH+pAT)+WC,
784 },
785 [NFS3PROC_SYMLINK] = {
786 .pc_func = (svc_procfunc) nfsd3_proc_symlink,
787 .pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs,
788 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
789 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
790 .pc_argsize = sizeof(struct nfsd3_symlinkargs),
791 .pc_ressize = sizeof(struct nfsd3_createres),
792 .pc_cachetype = RC_REPLBUFF,
793 .pc_xdrressize = ST+(1+FH+pAT)+WC,
794 },
795 [NFS3PROC_MKNOD] = {
796 .pc_func = (svc_procfunc) nfsd3_proc_mknod,
797 .pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs,
798 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
799 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
800 .pc_argsize = sizeof(struct nfsd3_mknodargs),
801 .pc_ressize = sizeof(struct nfsd3_createres),
802 .pc_cachetype = RC_REPLBUFF,
803 .pc_xdrressize = ST+(1+FH+pAT)+WC,
804 },
805 [NFS3PROC_REMOVE] = {
806 .pc_func = (svc_procfunc) nfsd3_proc_remove,
807 .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
808 .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
809 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
810 .pc_argsize = sizeof(struct nfsd3_diropargs),
811 .pc_ressize = sizeof(struct nfsd3_wccstatres),
812 .pc_cachetype = RC_REPLBUFF,
813 .pc_xdrressize = ST+WC,
814 },
815 [NFS3PROC_RMDIR] = {
816 .pc_func = (svc_procfunc) nfsd3_proc_rmdir,
817 .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
818 .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
819 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
820 .pc_argsize = sizeof(struct nfsd3_diropargs),
821 .pc_ressize = sizeof(struct nfsd3_wccstatres),
822 .pc_cachetype = RC_REPLBUFF,
823 .pc_xdrressize = ST+WC,
824 },
825 [NFS3PROC_RENAME] = {
826 .pc_func = (svc_procfunc) nfsd3_proc_rename,
827 .pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs,
828 .pc_encode = (kxdrproc_t) nfs3svc_encode_renameres,
829 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
830 .pc_argsize = sizeof(struct nfsd3_renameargs),
831 .pc_ressize = sizeof(struct nfsd3_renameres),
832 .pc_cachetype = RC_REPLBUFF,
833 .pc_xdrressize = ST+WC+WC,
834 },
835 [NFS3PROC_LINK] = {
836 .pc_func = (svc_procfunc) nfsd3_proc_link,
837 .pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs,
838 .pc_encode = (kxdrproc_t) nfs3svc_encode_linkres,
839 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
840 .pc_argsize = sizeof(struct nfsd3_linkargs),
841 .pc_ressize = sizeof(struct nfsd3_linkres),
842 .pc_cachetype = RC_REPLBUFF,
843 .pc_xdrressize = ST+pAT+WC,
844 },
845 [NFS3PROC_READDIR] = {
846 .pc_func = (svc_procfunc) nfsd3_proc_readdir,
847 .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs,
848 .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
849 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
850 .pc_argsize = sizeof(struct nfsd3_readdirargs),
851 .pc_ressize = sizeof(struct nfsd3_readdirres),
852 .pc_cachetype = RC_NOCACHE,
853 },
854 [NFS3PROC_READDIRPLUS] = {
855 .pc_func = (svc_procfunc) nfsd3_proc_readdirplus,
856 .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs,
857 .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
858 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
859 .pc_argsize = sizeof(struct nfsd3_readdirplusargs),
860 .pc_ressize = sizeof(struct nfsd3_readdirres),
861 .pc_cachetype = RC_NOCACHE,
862 },
863 [NFS3PROC_FSSTAT] = {
864 .pc_func = (svc_procfunc) nfsd3_proc_fsstat,
865 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
866 .pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres,
867 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
868 .pc_ressize = sizeof(struct nfsd3_fsstatres),
869 .pc_cachetype = RC_NOCACHE,
870 .pc_xdrressize = ST+pAT+2*6+1,
871 },
872 [NFS3PROC_FSINFO] = {
873 .pc_func = (svc_procfunc) nfsd3_proc_fsinfo,
874 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
875 .pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores,
876 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
877 .pc_ressize = sizeof(struct nfsd3_fsinfores),
878 .pc_cachetype = RC_NOCACHE,
879 .pc_xdrressize = ST+pAT+12,
880 },
881 [NFS3PROC_PATHCONF] = {
882 .pc_func = (svc_procfunc) nfsd3_proc_pathconf,
883 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
884 .pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres,
885 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
886 .pc_ressize = sizeof(struct nfsd3_pathconfres),
887 .pc_cachetype = RC_NOCACHE,
888 .pc_xdrressize = ST+pAT+6,
889 },
890 [NFS3PROC_COMMIT] = {
891 .pc_func = (svc_procfunc) nfsd3_proc_commit,
892 .pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs,
893 .pc_encode = (kxdrproc_t) nfs3svc_encode_commitres,
894 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
895 .pc_argsize = sizeof(struct nfsd3_commitargs),
896 .pc_ressize = sizeof(struct nfsd3_commitres),
897 .pc_cachetype = RC_NOCACHE,
898 .pc_xdrressize = ST+WC+2,
899 },
711}; 900};
712 901
713struct svc_version nfsd_version3 = { 902struct svc_version nfsd_version3 = {
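
The table rewrite above trades the positional PROC() macro for designated array initializers indexed by procedure number, so each entry names its fields explicitly and survives reordering or gaps. The construct in isolation (the enum and fields are invented for the example):

    #include <stdio.h>

    enum { PROC_NULL = 0, PROC_GETATTR = 1, PROC_SETATTR = 2, NPROCS };

    struct procedure {
            const char *name;
            unsigned int ressize;
    };

    static const struct procedure procedures[NPROCS] = {
            [PROC_NULL] = {
                    .name    = "NULL",
                    .ressize = 1,
            },
            [PROC_GETATTR] = {
                    .name    = "GETATTR",
                    .ressize = 21,
            },
            [PROC_SETATTR] = {
                    .name    = "SETATTR",
                    .ressize = 15,
            },
    };

    int main(void)
    {
            for (int i = 0; i < NPROCS; i++)
                    printf("%d: %s (ressize %u)\n", i,
                           procedures[i].name, procedures[i].ressize);
            return 0;
    }
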
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 17d0dd997204..01d4ec1c88e0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -272,6 +272,7 @@ void fill_post_wcc(struct svc_fh *fhp)
272 272
273 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 273 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
274 &fhp->fh_post_attr); 274 &fhp->fh_post_attr);
275 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
275 if (err) 276 if (err)
276 fhp->fh_post_saved = 0; 277 fhp->fh_post_saved = 0;
277 else 278 else
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 290289bd44f7..3fd23f7aceca 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -140,8 +140,10 @@ struct nfs4_cb_compound_hdr {
140 int status; 140 int status;
141 u32 ident; 141 u32 ident;
142 u32 nops; 142 u32 nops;
143 __be32 *nops_p;
144 u32 minorversion;
143 u32 taglen; 145 u32 taglen;
144 char * tag; 146 char *tag;
145}; 147};
146 148
147static struct { 149static struct {
@@ -201,33 +203,39 @@ nfs_cb_stat_to_errno(int stat)
201 * XDR encode 203 * XDR encode
202 */ 204 */
203 205
204static int 206static void
205encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 207encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
206{ 208{
207 __be32 * p; 209 __be32 * p;
208 210
209 RESERVE_SPACE(16); 211 RESERVE_SPACE(16);
210 WRITE32(0); /* tag length is always 0 */ 212 WRITE32(0); /* tag length is always 0 */
211 WRITE32(NFS4_MINOR_VERSION); 213 WRITE32(hdr->minorversion);
212 WRITE32(hdr->ident); 214 WRITE32(hdr->ident);
215 hdr->nops_p = p;
213 WRITE32(hdr->nops); 216 WRITE32(hdr->nops);
214 return 0;
215} 217}
216 218
217static int 219static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 220{
221 *hdr->nops_p = htonl(hdr->nops);
222}
223
224static void
225encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
226 struct nfs4_cb_compound_hdr *hdr)
219{ 227{
220 __be32 *p; 228 __be32 *p;
221 int len = cb_rec->cbr_fh.fh_size; 229 int len = dp->dl_fh.fh_size;
222 230
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 231 RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len);
224 WRITE32(OP_CB_RECALL); 232 WRITE32(OP_CB_RECALL);
225 WRITE32(cb_rec->cbr_stateid.si_generation); 233 WRITE32(dp->dl_stateid.si_generation);
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 234 WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 235 WRITE32(0); /* truncate optimization not implemented */
228 WRITE32(len); 236 WRITE32(len);
229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len); 237 WRITEMEM(&dp->dl_fh.fh_base, len);
230 return 0; 238 hdr->nops++;
231} 239}
232 240
233static int 241static int
@@ -241,17 +249,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
241} 249}
242 250
243static int 251static int
244nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args) 252nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args)
245{ 253{
246 struct xdr_stream xdr; 254 struct xdr_stream xdr;
247 struct nfs4_cb_compound_hdr hdr = { 255 struct nfs4_cb_compound_hdr hdr = {
248 .ident = args->cbr_ident, 256 .ident = args->dl_ident,
249 .nops = 1,
250 }; 257 };
251 258
252 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 259 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
253 encode_cb_compound_hdr(&xdr, &hdr); 260 encode_cb_compound_hdr(&xdr, &hdr);
254 return (encode_cb_recall(&xdr, args)); 261 encode_cb_recall(&xdr, args, &hdr);
262 encode_cb_nops(&hdr);
263 return 0;
255} 264}
256 265
257 266
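
encode_cb_compound_hdr() now stashes a pointer to the nops word and encode_cb_nops() patches the true count after the operations have been appended, so callers no longer have to predict nops up front. A small buffer-writer sketch of that back-patching (struct encoder is invented; the kernel operates on an xdr_stream):

    #include <arpa/inet.h>          /* htonl() */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct encoder {
            unsigned char buf[64];
            size_t pos;
            uint32_t *nops_p;       /* where the count word was reserved */
            uint32_t nops;
    };

    static void put32(struct encoder *enc, uint32_t val)
    {
            uint32_t be = htonl(val);
            memcpy(enc->buf + enc->pos, &be, 4);
            enc->pos += 4;
    }

    static void encode_hdr(struct encoder *enc)
    {
            enc->nops_p = (uint32_t *)(enc->buf + enc->pos);
            put32(enc, 0);                  /* placeholder, patched later */
    }

    static void encode_op(struct encoder *enc, uint32_t opcode)
    {
            put32(enc, opcode);
            enc->nops++;
    }

    static void encode_nops(struct encoder *enc)
    {
            uint32_t be = htonl(enc->nops);
            memcpy(enc->nops_p, &be, 4);    /* back-patch the real count */
    }

    int main(void)
    {
            struct encoder enc = { .pos = 0 };

            encode_hdr(&enc);
            encode_op(&enc, 4);             /* e.g. OP_CB_RECALL */
            encode_nops(&enc);
            printf("encoded %zu bytes, %u op(s)\n", enc.pos, enc.nops);
            return 0;
    }
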
@@ -358,18 +367,21 @@ static struct rpc_program cb_program = {
358 .pipe_dir_name = "/nfsd4_cb", 367 .pipe_dir_name = "/nfsd4_cb",
359}; 368};
360 369
370static int max_cb_time(void)
371{
372 return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
373}
374
361/* Reference counting, callback cleanup, etc., all look racy as heck. 375/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 376 * And why is cb_set an atomic? */
363 377
364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp) 378int setup_callback_client(struct nfs4_client *clp)
365{ 379{
366 struct sockaddr_in addr; 380 struct sockaddr_in addr;
367 struct nfs4_callback *cb = &clp->cl_callback; 381 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
368 struct rpc_timeout timeparms = { 382 struct rpc_timeout timeparms = {
369 .to_initval = (NFSD_LEASE_TIME/4) * HZ, 383 .to_initval = max_cb_time(),
370 .to_retries = 5, 384 .to_retries = 0,
371 .to_maxval = (NFSD_LEASE_TIME/2) * HZ,
372 .to_exponential = 1,
373 }; 385 };
374 struct rpc_create_args args = { 386 struct rpc_create_args args = {
375 .protocol = IPPROTO_TCP, 387 .protocol = IPPROTO_TCP,
@@ -386,7 +398,7 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
386 struct rpc_clnt *client; 398 struct rpc_clnt *client;
387 399
388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 400 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
389 return ERR_PTR(-EINVAL); 401 return -EINVAL;
390 402
391 /* Initialize address */ 403 /* Initialize address */
392 memset(&addr, 0, sizeof(addr)); 404 memset(&addr, 0, sizeof(addr));
@@ -396,48 +408,77 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
396 408
397 /* Create RPC client */ 409 /* Create RPC client */
398 client = rpc_create(&args); 410 client = rpc_create(&args);
399 if (IS_ERR(client)) 411 if (IS_ERR(client)) {
400 dprintk("NFSD: couldn't create callback client: %ld\n", 412 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client)); 413 PTR_ERR(client));
402 return client; 414 return PTR_ERR(client);
415 }
416 cb->cb_client = client;
417 return 0;
418
419}
420
421static void warn_no_callback_path(struct nfs4_client *clp, int reason)
422{
423 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
424 (int)clp->cl_name.len, clp->cl_name.data, reason);
425}
426
427static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
428{
429 struct nfs4_client *clp = calldata;
430
431 if (task->tk_status)
432 warn_no_callback_path(clp, task->tk_status);
433 else
434 atomic_set(&clp->cl_cb_conn.cb_set, 1);
435 put_nfs4_client(clp);
436}
437
438static const struct rpc_call_ops nfsd4_cb_probe_ops = {
439 .rpc_call_done = nfsd4_cb_probe_done,
440};
403 441
442static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
443{
444 struct auth_cred acred = {
445 .machine_cred = 1
446 };
447
448 /*
449 * Note in the gss case this doesn't actually have to wait for a
450 * gss upcall (or any calls to the client); this just creates a
451 * non-uptodate cred which the rpc state machine will fill in with
452 * a refresh_upcall later.
453 */
454 return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
455 RPCAUTH_LOOKUP_NEW);
404} 456}
405 457
406static int do_probe_callback(void *data) 458void do_probe_callback(struct nfs4_client *clp)
407{ 459{
408 struct nfs4_client *clp = data; 460 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = { 461 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 462 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp, 463 .rpc_argp = clp,
413 }; 464 };
414 struct rpc_clnt *client; 465 struct rpc_cred *cred;
415 int status; 466 int status;
416 467
417 client = setup_callback_client(clp); 468 cred = lookup_cb_cred(cb);
418 if (IS_ERR(client)) { 469 if (IS_ERR(cred)) {
419 status = PTR_ERR(client); 470 status = PTR_ERR(cred);
420 dprintk("NFSD: couldn't create callback client: %d\n", 471 goto out;
421 status); 472 }
422 goto out_err; 473 cb->cb_cred = cred;
474 msg.rpc_cred = cb->cb_cred;
475 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
476 &nfsd4_cb_probe_ops, (void *)clp);
477out:
478 if (status) {
479 warn_no_callback_path(clp, status);
480 put_nfs4_client(clp);
423 } 481 }
424
425 status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
426
427 if (status)
428 goto out_release_client;
429
430 cb->cb_client = client;
431 atomic_set(&cb->cb_set, 1);
432 put_nfs4_client(clp);
433 return 0;
434out_release_client:
435 rpc_shutdown_client(client);
436out_err:
437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
438 (int)clp->cl_name.len, clp->cl_name.data, status);
439 put_nfs4_client(clp);
440 return 0;
441} 482}
442 483
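The reference discipline in the new probe path is worth spelling out: nfsd4_probe_callback() bumps cl_count, and ownership of that reference passes to the async task once rpc_call_async() succeeds — nfsd4_cb_probe_done() is then the one that calls put_nfs4_client(). Only if submission (or the preceding cred lookup) fails does do_probe_callback() drop the reference itself. A toy, compilable model of the hand-off, with pthread_create() standing in for rpc_call_async():

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int refcount = 1;     /* the client's base reference */

    static void put(void)
    {
        if (atomic_fetch_sub(&refcount, 1) == 1)
            puts("last reference dropped");
    }

    static void *probe_task(void *arg)
    {
        (void)arg;
        puts("probe done");
        put();              /* the task owns the submitter's reference */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        atomic_fetch_add(&refcount, 1);        /* ref for the task */
        if (pthread_create(&t, NULL, probe_task, NULL) != 0)
            put();                             /* never ran: undo  */
        else
            pthread_join(t, NULL);
        put();                                 /* drop the base ref */
        return 0;
    }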
443/* 484/*
@@ -446,21 +487,65 @@ out_err:
446void 487void
447nfsd4_probe_callback(struct nfs4_client *clp) 488nfsd4_probe_callback(struct nfs4_client *clp)
448{ 489{
449 struct task_struct *t; 490 int status;
450 491
451 BUG_ON(atomic_read(&clp->cl_callback.cb_set)); 492 BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
493
494 status = setup_callback_client(clp);
495 if (status) {
496 warn_no_callback_path(clp, status);
497 return;
498 }
452 499
453 /* the task holds a reference to the nfs4_client struct */ 500 /* the task holds a reference to the nfs4_client struct */
454 atomic_inc(&clp->cl_count); 501 atomic_inc(&clp->cl_count);
455 502
456 t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); 503 do_probe_callback(clp);
504}
457 505
458 if (IS_ERR(t)) 506static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
459 atomic_dec(&clp->cl_count); 507{
508 struct nfs4_delegation *dp = calldata;
509 struct nfs4_client *clp = dp->dl_client;
460 510
461 return; 511 switch (task->tk_status) {
512 case -EIO:
513 /* Network partition? */
514 atomic_set(&clp->cl_cb_conn.cb_set, 0);
515 warn_no_callback_path(clp, task->tk_status);
516 case -EBADHANDLE:
517 case -NFS4ERR_BAD_STATEID:
518 /* Race: client probably got cb_recall
519 * before open reply granting delegation */
520 break;
521 default:
522 /* success, or error we can't handle */
523 return;
524 }
525 if (dp->dl_retries--) {
526 rpc_delay(task, 2*HZ);
527 task->tk_status = 0;
528 rpc_restart_call(task);
529 } else {
530 atomic_set(&clp->cl_cb_conn.cb_set, 0);
531 warn_no_callback_path(clp, task->tk_status);
532 }
533}
534
535static void nfsd4_cb_recall_release(void *calldata)
536{
537 struct nfs4_delegation *dp = calldata;
538 struct nfs4_client *clp = dp->dl_client;
539
540 nfs4_put_delegation(dp);
541 put_nfs4_client(clp);
462} 542}
463 543
544static const struct rpc_call_ops nfsd4_cb_recall_ops = {
545 .rpc_call_done = nfsd4_cb_recall_done,
546 .rpc_release = nfsd4_cb_recall_release,
547};
548
464/* 549/*
465 * called with dp->dl_count inc'ed. 550 * called with dp->dl_count inc'ed.
466 */ 551 */
@@ -468,41 +553,19 @@ void
468nfsd4_cb_recall(struct nfs4_delegation *dp) 553nfsd4_cb_recall(struct nfs4_delegation *dp)
469{ 554{
470 struct nfs4_client *clp = dp->dl_client; 555 struct nfs4_client *clp = dp->dl_client;
471 struct rpc_clnt *clnt = clp->cl_callback.cb_client; 556 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
472 struct nfs4_cb_recall *cbr = &dp->dl_recall;
473 struct rpc_message msg = { 557 struct rpc_message msg = {
474 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 558 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
475 .rpc_argp = cbr, 559 .rpc_argp = dp,
560 .rpc_cred = clp->cl_cb_conn.cb_cred
476 }; 561 };
477 int retries = 1; 562 int status;
478 int status = 0; 563
479 564 dp->dl_retries = 1;
480 cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ 565 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
481 cbr->cbr_dp = dp; 566 &nfsd4_cb_recall_ops, dp);
482 567 if (status) {
483 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); 568 put_nfs4_client(clp);
484 while (retries--) { 569 nfs4_put_delegation(dp);
485 switch (status) {
486 case -EIO:
487 /* Network partition? */
488 atomic_set(&clp->cl_callback.cb_set, 0);
489 case -EBADHANDLE:
490 case -NFS4ERR_BAD_STATEID:
491 /* Race: client probably got cb_recall
492 * before open reply granting delegation */
493 break;
494 default:
495 goto out_put_cred;
496 }
497 ssleep(2);
498 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
499 } 570 }
500out_put_cred:
501 /*
502 * Success or failure, now we're either waiting for lease expiration
503 * or deleg_return.
504 */
505 put_nfs4_client(clp);
506 nfs4_put_delegation(dp);
507 return;
508} 571}
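The recall retry also moves out of thread context: where the old code ran in a kthread, slept with ssleep(2), and re-issued rpc_call_sync(), nfsd4_cb_recall_done() now re-arms the same task from the completion callback — rpc_delay(task, 2*HZ) parks it for the same two seconds, task->tk_status = 0 clears the soft error, and rpc_restart_call() re-queues it, all without tying up a thread. dp->dl_retries = 1 preserves the old single-retry behaviour, and the .rpc_release callback guarantees the client and delegation references are dropped exactly once however the task finishes.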
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b2883e9c6381..7c8801769a3c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -51,6 +51,78 @@
51 51
52#define NFSDDBG_FACILITY NFSDDBG_PROC 52#define NFSDDBG_FACILITY NFSDDBG_PROC
53 53
54static u32 nfsd_attrmask[] = {
55 NFSD_WRITEABLE_ATTRS_WORD0,
56 NFSD_WRITEABLE_ATTRS_WORD1,
57 NFSD_WRITEABLE_ATTRS_WORD2
58};
59
60static u32 nfsd41_ex_attrmask[] = {
61 NFSD_SUPPATTR_EXCLCREAT_WORD0,
62 NFSD_SUPPATTR_EXCLCREAT_WORD1,
63 NFSD_SUPPATTR_EXCLCREAT_WORD2
64};
65
66static __be32
67check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
68 u32 *bmval, u32 *writable)
69{
70 struct dentry *dentry = cstate->current_fh.fh_dentry;
71 struct svc_export *exp = cstate->current_fh.fh_export;
72
73 /*
 74	 * Check whether the requested attributes are supported by this NFSv4
 75	 * server. Per the spec, unsupported attributes return ERR_ATTRNOTSUPP.
76 */
77 if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) ||
78 (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) ||
79 (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
80 return nfserr_attrnotsupp;
81
82 /*
 83	 * Check whether FATTR4_WORD0_ACL and FATTR4_WORD0_FS_LOCATIONS can be
 84	 * supported in the current environment.
85 */
86 if (bmval[0] & FATTR4_WORD0_ACL) {
87 if (!IS_POSIXACL(dentry->d_inode))
88 return nfserr_attrnotsupp;
89 }
90 if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
91 if (exp->ex_fslocs.locations == NULL)
92 return nfserr_attrnotsupp;
93 }
94
95 /*
 96	 * Per the spec, attempts to set read-only attributes return ERR_INVAL.
97 */
98 if (writable) {
99 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
100 (bmval[2] & ~writable[2]))
101 return nfserr_inval;
102 }
103
104 return nfs_ok;
105}
106
107static __be32
108nfsd4_check_open_attributes(struct svc_rqst *rqstp,
109 struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
110{
111 __be32 status = nfs_ok;
112
113 if (open->op_create == NFS4_OPEN_CREATE) {
114 if (open->op_createmode == NFS4_CREATE_UNCHECKED
115 || open->op_createmode == NFS4_CREATE_GUARDED)
116 status = check_attr_support(rqstp, cstate,
117 open->op_bmval, nfsd_attrmask);
118 else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1)
119 status = check_attr_support(rqstp, cstate,
120 open->op_bmval, nfsd41_ex_attrmask);
121 }
122
123 return status;
124}
125
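check_attr_support() boils down to two mask tests: bits outside the supported set draw NFS4ERR_ATTRNOTSUPP, and bits inside it but outside the writable set draw NFS4ERR_INVAL — with writable == NULL (as the VERIFY path passes below) skipping the second test, since verifying a read-only attribute is legitimate. A toy, compilable illustration; the mask values are made up, the real ones come from nfsd's FATTR word definitions:

    #include <stdint.h>
    #include <stdio.h>

    #define SUPPORTED 0x00ffu   /* attributes this server knows about */
    #define WRITEABLE 0x000fu   /* subset a client may set            */

    static const char *check(uint32_t bmval, const uint32_t *writable)
    {
        if (bmval & ~SUPPORTED)
            return "NFS4ERR_ATTRNOTSUPP"; /* unknown attribute   */
        if (writable && (bmval & ~*writable))
            return "NFS4ERR_INVAL";       /* known but read-only */
        return "ok";
    }

    int main(void)
    {
        const uint32_t w = WRITEABLE;
        printf("%s\n", check(0x0003, &w));   /* ok                  */
        printf("%s\n", check(0x0010, &w));   /* NFS4ERR_INVAL       */
        printf("%s\n", check(0x1000, &w));   /* NFS4ERR_ATTRNOTSUPP */
        printf("%s\n", check(0x0010, NULL)); /* ok: verify-style    */
        return 0;
    }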
54static inline void 126static inline void
55fh_dup2(struct svc_fh *dst, struct svc_fh *src) 127fh_dup2(struct svc_fh *dst, struct svc_fh *src)
56{ 128{
@@ -225,6 +297,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
225 if (status) 297 if (status)
226 goto out; 298 goto out;
227 299
300 status = nfsd4_check_open_attributes(rqstp, cstate, open);
301 if (status)
302 goto out;
303
228 /* Openowner is now set, so sequence id will get bumped. Now we need 304 /* Openowner is now set, so sequence id will get bumped. Now we need
229 * these checks before we do any creates: */ 305 * these checks before we do any creates: */
230 status = nfserr_grace; 306 status = nfserr_grace;
@@ -395,6 +471,11 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
395 if (status) 471 if (status)
396 return status; 472 return status;
397 473
474 status = check_attr_support(rqstp, cstate, create->cr_bmval,
475 nfsd_attrmask);
476 if (status)
477 return status;
478
398 switch (create->cr_type) { 479 switch (create->cr_type) {
399 case NF4LNK: 480 case NF4LNK:
400 /* ugh! we have to null-terminate the linktext, or 481 /* ugh! we have to null-terminate the linktext, or
@@ -689,6 +770,12 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
689 if (status) 770 if (status)
690 return status; 771 return status;
691 status = nfs_ok; 772 status = nfs_ok;
773
774 status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
775 nfsd_attrmask);
776 if (status)
777 goto out;
778
692 if (setattr->sa_acl != NULL) 779 if (setattr->sa_acl != NULL)
693 status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, 780 status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
694 setattr->sa_acl); 781 setattr->sa_acl);
@@ -763,10 +850,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
763 if (status) 850 if (status)
764 return status; 851 return status;
765 852
766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) 853 status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL);
767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) 854 if (status)
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) 855 return status;
769 return nfserr_attrnotsupp; 856
770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 857 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 858 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
772 return nfserr_inval; 859 return nfserr_inval;
@@ -1226,24 +1313,9 @@ static const char *nfsd4_op_name(unsigned opnum)
1226 return "unknown_operation"; 1313 return "unknown_operation";
1227} 1314}
1228 1315
1229#define nfs4svc_decode_voidargs NULL
1230#define nfs4svc_release_void NULL
1231#define nfsd4_voidres nfsd4_voidargs 1316#define nfsd4_voidres nfsd4_voidargs
1232#define nfs4svc_release_compound NULL
1233struct nfsd4_voidargs { int dummy; }; 1317struct nfsd4_voidargs { int dummy; };
1234 1318
1235#define PROC(name, argt, rest, relt, cache, respsize) \
1236 { (svc_procfunc) nfsd4_proc_##name, \
1237 (kxdrproc_t) nfs4svc_decode_##argt##args, \
1238 (kxdrproc_t) nfs4svc_encode_##rest##res, \
1239 (kxdrproc_t) nfs4svc_release_##relt, \
1240 sizeof(struct nfsd4_##argt##args), \
1241 sizeof(struct nfsd4_##rest##res), \
1242 0, \
1243 cache, \
1244 respsize, \
1245 }
1246
1247/* 1319/*
1248 * TODO: At the present time, the NFSv4 server does not do XID caching 1320 * TODO: At the present time, the NFSv4 server does not do XID caching
1249 * of requests. Implementing XID caching would not be a serious problem, 1321 * of requests. Implementing XID caching would not be a serious problem,
@@ -1255,8 +1327,23 @@ struct nfsd4_voidargs { int dummy; };
1255 * better XID's. 1327 * better XID's.
1256 */ 1328 */
1257static struct svc_procedure nfsd_procedures4[2] = { 1329static struct svc_procedure nfsd_procedures4[2] = {
1258 PROC(null, void, void, void, RC_NOCACHE, 1), 1330 [NFSPROC4_NULL] = {
1259 PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) 1331 .pc_func = (svc_procfunc) nfsd4_proc_null,
1332 .pc_encode = (kxdrproc_t) nfs4svc_encode_voidres,
1333 .pc_argsize = sizeof(struct nfsd4_voidargs),
1334 .pc_ressize = sizeof(struct nfsd4_voidres),
1335 .pc_cachetype = RC_NOCACHE,
1336 .pc_xdrressize = 1,
1337 },
1338 [NFSPROC4_COMPOUND] = {
1339 .pc_func = (svc_procfunc) nfsd4_proc_compound,
1340 .pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs,
1341 .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres,
1342 .pc_argsize = sizeof(struct nfsd4_compoundargs),
1343 .pc_ressize = sizeof(struct nfsd4_compoundres),
1344 .pc_cachetype = RC_NOCACHE,
1345 .pc_xdrressize = NFSD_BUFSIZE/4,
1346 },
1260}; 1347};
1261 1348
1262struct svc_version nfsd_version4 = { 1349struct svc_version nfsd_version4 = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3b711f5147a7..980a216a48c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -182,7 +182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
182{ 182{
183 struct nfs4_delegation *dp; 183 struct nfs4_delegation *dp;
184 struct nfs4_file *fp = stp->st_file; 184 struct nfs4_file *fp = stp->st_file;
185 struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; 185 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
186 186
187 dprintk("NFSD alloc_init_deleg\n"); 187 dprintk("NFSD alloc_init_deleg\n");
188 if (fp->fi_had_conflict) 188 if (fp->fi_had_conflict)
@@ -203,10 +203,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
203 get_file(stp->st_vfs_file); 203 get_file(stp->st_vfs_file);
204 dp->dl_vfs_file = stp->st_vfs_file; 204 dp->dl_vfs_file = stp->st_vfs_file;
205 dp->dl_type = type; 205 dp->dl_type = type;
206 dp->dl_recall.cbr_dp = NULL; 206 dp->dl_ident = cb->cb_ident;
207 dp->dl_recall.cbr_ident = cb->cb_ident; 207 dp->dl_stateid.si_boot = get_seconds();
208 dp->dl_recall.cbr_trunc = 0;
209 dp->dl_stateid.si_boot = boot_time;
210 dp->dl_stateid.si_stateownerid = current_delegid++; 208 dp->dl_stateid.si_stateownerid = current_delegid++;
211 dp->dl_stateid.si_fileid = 0; 209 dp->dl_stateid.si_fileid = 0;
212 dp->dl_stateid.si_generation = 0; 210 dp->dl_stateid.si_generation = 0;
@@ -427,6 +425,11 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{ 425{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; 426 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429 427
428 if (fchan->maxreqs < 1)
429 return nfserr_inval;
430 else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
431 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
432
430 spin_lock(&nfsd_serv->sv_lock); 433 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) 434 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; 435 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
@@ -446,8 +449,8 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
446 * fchan holds the client values on input, and the server values on output 449 * fchan holds the client values on input, and the server values on output
447 */ 450 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp, 451static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session, 452 struct nfsd4_channel_attrs *session_fchan,
450 struct nfsd4_channel_attrs *fchan) 453 struct nfsd4_channel_attrs *fchan)
451{ 454{
452 int status = 0; 455 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp); 456 __u32 maxcount = svc_max_payload(rqstp);
@@ -457,21 +460,21 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
457 /* Use the client's max request and max response size if possible */ 460 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount) 461 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount; 462 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz; 463 session_fchan->maxreq_sz = fchan->maxreq_sz;
461 464
462 if (fchan->maxresp_sz > maxcount) 465 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount; 466 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz; 467 session_fchan->maxresp_sz = fchan->maxresp_sz;
465 468
466 /* Set the max response cached size our default which is 469 /* Set the max response cached size our default which is
467 * a multiple of PAGE_SIZE and small */ 470 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; 471 session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached; 472 fchan->maxresp_cached = session_fchan->maxresp_cached;
470 473
471 /* Use the client's maxops if possible */ 474 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 475 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 476 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops; 477 session_fchan->maxops = fchan->maxops;
475 478
476 /* try to use the client requested number of slots */ 479 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) 480 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
@@ -483,7 +486,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
483 */ 486 */
484 status = set_forechannel_maxreqs(fchan); 487 status = set_forechannel_maxreqs(fchan);
485 488
486 session->se_fnumslots = fchan->maxreqs; 489 session_fchan->maxreqs = fchan->maxreqs;
487 return status; 490 return status;
488} 491}
489 492
@@ -497,12 +500,14 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
497 memset(&tmp, 0, sizeof(tmp)); 500 memset(&tmp, 0, sizeof(tmp));
498 501
499 /* FIXME: For now, we just accept the client back channel attributes. */ 502 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel); 503 tmp.se_bchannel = cses->back_channel;
504 status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
505 &cses->fore_channel);
501 if (status) 506 if (status)
502 goto out; 507 goto out;
503 508
504 /* allocate struct nfsd4_session and slot table in one piece */ 509 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot); 510 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); 511 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new) 512 if (!new)
508 goto out; 513 goto out;
@@ -576,7 +581,7 @@ free_session(struct kref *kref)
576 int i; 581 int i;
577 582
578 ses = container_of(kref, struct nfsd4_session, se_ref); 583 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) { 584 for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; 585 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused); 586 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 } 587 }
@@ -632,16 +637,20 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
632static void 637static void
633shutdown_callback_client(struct nfs4_client *clp) 638shutdown_callback_client(struct nfs4_client *clp)
634{ 639{
635 struct rpc_clnt *clnt = clp->cl_callback.cb_client; 640 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
636 641
637 if (clnt) { 642 if (clnt) {
638 /* 643 /*
639 * Callback threads take a reference on the client, so there 644 * Callback threads take a reference on the client, so there
640 * should be no outstanding callbacks at this point. 645 * should be no outstanding callbacks at this point.
641 */ 646 */
642 clp->cl_callback.cb_client = NULL; 647 clp->cl_cb_conn.cb_client = NULL;
643 rpc_shutdown_client(clnt); 648 rpc_shutdown_client(clnt);
644 } 649 }
650 if (clp->cl_cb_conn.cb_cred) {
651 put_rpccred(clp->cl_cb_conn.cb_cred);
652 clp->cl_cb_conn.cb_cred = NULL;
653 }
645} 654}
646 655
647static inline void 656static inline void
@@ -714,7 +723,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
714 return NULL; 723 return NULL;
715 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 724 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
716 atomic_set(&clp->cl_count, 1); 725 atomic_set(&clp->cl_count, 1);
717 atomic_set(&clp->cl_callback.cb_set, 0); 726 atomic_set(&clp->cl_cb_conn.cb_set, 0);
718 INIT_LIST_HEAD(&clp->cl_idhash); 727 INIT_LIST_HEAD(&clp->cl_idhash);
719 INIT_LIST_HEAD(&clp->cl_strhash); 728 INIT_LIST_HEAD(&clp->cl_strhash);
720 INIT_LIST_HEAD(&clp->cl_openowners); 729 INIT_LIST_HEAD(&clp->cl_openowners);
@@ -966,7 +975,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne
966static void 975static void
967gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) 976gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
968{ 977{
969 struct nfs4_callback *cb = &clp->cl_callback; 978 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
970 979
971 /* Currently, we only support tcp for the callback channel */ 980 /* Currently, we only support tcp for the callback channel */
972 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) 981 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
@@ -975,6 +984,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
975 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, 984 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
976 &cb->cb_addr, &cb->cb_port))) 985 &cb->cb_addr, &cb->cb_port)))
977 goto out_err; 986 goto out_err;
987 cb->cb_minorversion = 0;
978 cb->cb_prog = se->se_callback_prog; 988 cb->cb_prog = se->se_callback_prog;
979 cb->cb_ident = se->se_callback_ident; 989 cb->cb_ident = se->se_callback_ident;
980 return; 990 return;
@@ -1128,7 +1138,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1128 * is sent (lease renewal). 1138 * is sent (lease renewal).
1129 */ 1139 */
1130 if (seq && nfsd4_not_cached(resp)) { 1140 if (seq && nfsd4_not_cached(resp)) {
1131 seq->maxslots = resp->cstate.session->se_fnumslots; 1141 seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
1132 return nfs_ok; 1142 return nfs_ok;
1133 } 1143 }
1134 1144
@@ -1238,12 +1248,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1238 expire_client(conf); 1248 expire_client(conf);
1239 goto out_new; 1249 goto out_new;
1240 } 1250 }
1241 if (ip_addr != conf->cl_addr &&
1242 !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
1243 /* Client collision. 18.35.4 case 3 */
1244 status = nfserr_clid_inuse;
1245 goto out;
1246 }
1247 /* 1251 /*
1248 * Set bit when the owner id and verifier map to an already 1252 * Set bit when the owner id and verifier map to an already
1249 * confirmed client id (18.35.3). 1253 * confirmed client id (18.35.3).
@@ -1257,12 +1261,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1257 copy_verf(conf, &verf); 1261 copy_verf(conf, &verf);
1258 new = conf; 1262 new = conf;
1259 goto out_copy; 1263 goto out_copy;
1260 } else { 1264 }
1261 /* 18.35.4 case 7 */ 1265
1262 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1266 /* 18.35.4 case 7 */
1263 status = nfserr_noent; 1267 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1264 goto out; 1268 status = nfserr_noent;
1265 } 1269 goto out;
1266 } 1270 }
1267 1271
1268 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1272 unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
@@ -1471,7 +1475,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1471 goto out; 1475 goto out;
1472 1476
1473 status = nfserr_badslot; 1477 status = nfserr_badslot;
1474 if (seq->slotid >= session->se_fnumslots) 1478 if (seq->slotid >= session->se_fchannel.maxreqs)
1475 goto out; 1479 goto out;
1476 1480
1477 slot = &session->se_slots[seq->slotid]; 1481 slot = &session->se_slots[seq->slotid];
@@ -1686,9 +1690,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1686 else { 1690 else {
1687 /* XXX: We just turn off callbacks until we can handle 1691 /* XXX: We just turn off callbacks until we can handle
1688 * change request correctly. */ 1692 * change request correctly. */
1689 atomic_set(&conf->cl_callback.cb_set, 0); 1693 atomic_set(&conf->cl_cb_conn.cb_set, 0);
1690 gen_confirm(conf);
1691 nfsd4_remove_clid_dir(unconf);
1692 expire_client(unconf); 1694 expire_client(unconf);
1693 status = nfs_ok; 1695 status = nfs_ok;
1694 1696
@@ -1882,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1882 stp->st_stateowner = sop; 1884 stp->st_stateowner = sop;
1883 get_nfs4_file(fp); 1885 get_nfs4_file(fp);
1884 stp->st_file = fp; 1886 stp->st_file = fp;
1885 stp->st_stateid.si_boot = boot_time; 1887 stp->st_stateid.si_boot = get_seconds();
1886 stp->st_stateid.si_stateownerid = sop->so_id; 1888 stp->st_stateid.si_stateownerid = sop->so_id;
1887 stp->st_stateid.si_fileid = fp->fi_id; 1889 stp->st_stateid.si_fileid = fp->fi_id;
1888 stp->st_stateid.si_generation = 0; 1890 stp->st_stateid.si_generation = 0;
@@ -2059,19 +2061,6 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
2059} 2061}
2060 2062
2061/* 2063/*
2062 * Recall a delegation
2063 */
2064static int
2065do_recall(void *__dp)
2066{
2067 struct nfs4_delegation *dp = __dp;
2068
2069 dp->dl_file->fi_had_conflict = true;
2070 nfsd4_cb_recall(dp);
2071 return 0;
2072}
2073
2074/*
2075 * Spawn a thread to perform a recall on the delegation represented 2064 * Spawn a thread to perform a recall on the delegation represented
2076 * by the lease (file_lock) 2065 * by the lease (file_lock)
2077 * 2066 *
@@ -2082,8 +2071,7 @@ do_recall(void *__dp)
2082static 2071static
2083void nfsd_break_deleg_cb(struct file_lock *fl) 2072void nfsd_break_deleg_cb(struct file_lock *fl)
2084{ 2073{
2085 struct nfs4_delegation *dp= (struct nfs4_delegation *)fl->fl_owner; 2074 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2086 struct task_struct *t;
2087 2075
2088 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl); 2076 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
2089 if (!dp) 2077 if (!dp)
@@ -2111,16 +2099,8 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2111 */ 2099 */
2112 fl->fl_break_time = 0; 2100 fl->fl_break_time = 0;
2113 2101
2114 t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall"); 2102 dp->dl_file->fi_had_conflict = true;
2115 if (IS_ERR(t)) { 2103 nfsd4_cb_recall(dp);
2116 struct nfs4_client *clp = dp->dl_client;
2117
2118 printk(KERN_INFO "NFSD: Callback thread failed for "
2119 "for client (clientid %08x/%08x)\n",
2120 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
2121 put_nfs4_client(dp->dl_client);
2122 nfs4_put_delegation(dp);
2123 }
2124} 2104}
2125 2105
2126/* 2106/*
@@ -2422,7 +2402,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2422{ 2402{
2423 struct nfs4_delegation *dp; 2403 struct nfs4_delegation *dp;
2424 struct nfs4_stateowner *sop = stp->st_stateowner; 2404 struct nfs4_stateowner *sop = stp->st_stateowner;
2425 struct nfs4_callback *cb = &sop->so_client->cl_callback; 2405 struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
2426 struct file_lock fl, *flp = &fl; 2406 struct file_lock fl, *flp = &fl;
2427 int status, flag = 0; 2407 int status, flag = 0;
2428 2408
@@ -2614,7 +2594,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2614 renew_client(clp); 2594 renew_client(clp);
2615 status = nfserr_cb_path_down; 2595 status = nfserr_cb_path_down;
2616 if (!list_empty(&clp->cl_delegations) 2596 if (!list_empty(&clp->cl_delegations)
2617 && !atomic_read(&clp->cl_callback.cb_set)) 2597 && !atomic_read(&clp->cl_cb_conn.cb_set))
2618 goto out; 2598 goto out;
2619 status = nfs_ok; 2599 status = nfs_ok;
2620out: 2600out:
@@ -2738,12 +2718,42 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
2738static int 2718static int
2739STALE_STATEID(stateid_t *stateid) 2719STALE_STATEID(stateid_t *stateid)
2740{ 2720{
2741 if (stateid->si_boot == boot_time) 2721 if (time_after((unsigned long)boot_time,
2742 return 0; 2722 (unsigned long)stateid->si_boot)) {
2743 dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", 2723 dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n",
2744 stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid, 2724 stateid->si_boot, stateid->si_stateownerid,
2745 stateid->si_generation); 2725 stateid->si_fileid, stateid->si_generation);
2746 return 1; 2726 return 1;
2727 }
2728 return 0;
2729}
2730
2731static int
2732EXPIRED_STATEID(stateid_t *stateid)
2733{
2734 if (time_before((unsigned long)boot_time,
2735 ((unsigned long)stateid->si_boot)) &&
2736 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
2737 dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n",
2738 stateid->si_boot, stateid->si_stateownerid,
2739 stateid->si_fileid, stateid->si_generation);
2740 return 1;
2741 }
2742 return 0;
2743}
2744
2745static __be32
2746stateid_error_map(stateid_t *stateid)
2747{
2748 if (STALE_STATEID(stateid))
2749 return nfserr_stale_stateid;
2750 if (EXPIRED_STATEID(stateid))
2751 return nfserr_expired;
2752
2753 dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n",
2754 stateid->si_boot, stateid->si_stateownerid,
2755 stateid->si_fileid, stateid->si_generation);
2756 return nfserr_bad_stateid;
2747} 2757}
2748 2758
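Since si_boot is now stamped with get_seconds() at stateid-creation time rather than copied from the server's boot time, validity becomes a pair of ordering tests: a stateid minted before this server instance booted is stale, one minted during this instance whose lease has since lapsed is expired, and anything else that still failed lookup is simply bad. The comparisons use the kernel's wraparound-safe time_after()/time_before(); a compilable sketch with made-up timestamps (the macros defined as the kernel defines them):

    #include <stdio.h>

    #define time_after(a, b)  ((long)((b) - (a)) < 0)
    #define time_before(a, b) time_after(b, a)

    static const char *classify(unsigned long si_boot, unsigned long boot,
                                unsigned long lease, unsigned long now)
    {
        if (time_after(boot, si_boot))
            return "stale";     /* minted before this server instance */
        if (time_before(boot, si_boot) &&
            time_before(si_boot + lease, now))
            return "expired";   /* minted here, but the lease lapsed  */
        return "bad_stateid";   /* lookup already failed              */
    }

    int main(void)
    {
        unsigned long boot = 1000, lease = 90, now = 1500;
        printf("%s\n", classify( 900, boot, lease, now)); /* stale       */
        printf("%s\n", classify(1100, boot, lease, now)); /* expired     */
        printf("%s\n", classify(1499, boot, lease, now)); /* bad_stateid */
        return 0;
    }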
2749static inline int 2759static inline int
@@ -2867,8 +2877,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2867 status = nfserr_bad_stateid; 2877 status = nfserr_bad_stateid;
2868 if (is_delegation_stateid(stateid)) { 2878 if (is_delegation_stateid(stateid)) {
2869 dp = find_delegation_stateid(ino, stateid); 2879 dp = find_delegation_stateid(ino, stateid);
2870 if (!dp) 2880 if (!dp) {
2881 status = stateid_error_map(stateid);
2871 goto out; 2882 goto out;
2883 }
2872 status = check_stateid_generation(stateid, &dp->dl_stateid, 2884 status = check_stateid_generation(stateid, &dp->dl_stateid,
2873 flags); 2885 flags);
2874 if (status) 2886 if (status)
@@ -2881,8 +2893,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2881 *filpp = dp->dl_vfs_file; 2893 *filpp = dp->dl_vfs_file;
2882 } else { /* open or lock stateid */ 2894 } else { /* open or lock stateid */
2883 stp = find_stateid(stateid, flags); 2895 stp = find_stateid(stateid, flags);
2884 if (!stp) 2896 if (!stp) {
2897 status = stateid_error_map(stateid);
2885 goto out; 2898 goto out;
2899 }
2886 if (nfs4_check_fh(current_fh, stp)) 2900 if (nfs4_check_fh(current_fh, stp))
2887 goto out; 2901 goto out;
2888 if (!stp->st_stateowner->so_confirmed) 2902 if (!stp->st_stateowner->so_confirmed)
@@ -2956,7 +2970,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2956 */ 2970 */
2957 sop = search_close_lru(stateid->si_stateownerid, flags); 2971 sop = search_close_lru(stateid->si_stateownerid, flags);
2958 if (sop == NULL) 2972 if (sop == NULL)
2959 return nfserr_bad_stateid; 2973 return stateid_error_map(stateid);
2960 *sopp = sop; 2974 *sopp = sop;
2961 goto check_replay; 2975 goto check_replay;
2962 } 2976 }
@@ -3227,8 +3241,10 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3227 if (!is_delegation_stateid(stateid)) 3241 if (!is_delegation_stateid(stateid))
3228 goto out; 3242 goto out;
3229 dp = find_delegation_stateid(inode, stateid); 3243 dp = find_delegation_stateid(inode, stateid);
3230 if (!dp) 3244 if (!dp) {
3245 status = stateid_error_map(stateid);
3231 goto out; 3246 goto out;
3247 }
3232 status = check_stateid_generation(stateid, &dp->dl_stateid, flags); 3248 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3233 if (status) 3249 if (status)
3234 goto out; 3250 goto out;
@@ -3455,7 +3471,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3455 stp->st_stateowner = sop; 3471 stp->st_stateowner = sop;
3456 get_nfs4_file(fp); 3472 get_nfs4_file(fp);
3457 stp->st_file = fp; 3473 stp->st_file = fp;
3458 stp->st_stateid.si_boot = boot_time; 3474 stp->st_stateid.si_boot = get_seconds();
3459 stp->st_stateid.si_stateownerid = sop->so_id; 3475 stp->st_stateid.si_stateownerid = sop->so_id;
3460 stp->st_stateid.si_fileid = fp->fi_id; 3476 stp->st_stateid.si_fileid = fp->fi_id;
3461 stp->st_stateid.si_generation = 0; 3477 stp->st_stateid.si_generation = 0;
@@ -3987,6 +4003,7 @@ nfs4_state_init(void)
3987 INIT_LIST_HEAD(&conf_str_hashtbl[i]); 4003 INIT_LIST_HEAD(&conf_str_hashtbl[i]);
3988 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 4004 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3989 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 4005 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
4006 INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
3990 } 4007 }
3991 for (i = 0; i < SESSION_HASH_SIZE; i++) 4008 for (i = 0; i < SESSION_HASH_SIZE; i++)
3992 INIT_LIST_HEAD(&sessionid_hashtbl[i]); 4009 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
@@ -4009,8 +4026,6 @@ nfs4_state_init(void)
4009 INIT_LIST_HEAD(&close_lru); 4026 INIT_LIST_HEAD(&close_lru);
4010 INIT_LIST_HEAD(&client_lru); 4027 INIT_LIST_HEAD(&client_lru);
4011 INIT_LIST_HEAD(&del_recall_lru); 4028 INIT_LIST_HEAD(&del_recall_lru);
4012 for (i = 0; i < CLIENT_HASH_SIZE; i++)
4013 INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
4014 reclaim_str_hashtbl_size = 0; 4029 reclaim_str_hashtbl_size = 0;
4015 return 0; 4030 return 0;
4016} 4031}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b73549d293be..2dcc7feaa6ff 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -83,16 +83,6 @@ check_filename(char *str, int len, __be32 err)
83 return 0; 83 return 0;
84} 84}
85 85
86/*
87 * START OF "GENERIC" DECODE ROUTINES.
88 * These may look a little ugly since they are imported from a "generic"
89 * set of XDR encode/decode routines which are intended to be shared by
90 * all of our NFSv4 implementations (OpenBSD, MacOS X...).
91 *
92 * If the pain of reading these is too great, it should be a straightforward
93 * task to translate them into Linux-specific versions which are more
94 * consistent with the style used in NFSv2/v3...
95 */
96#define DECODE_HEAD \ 86#define DECODE_HEAD \
97 __be32 *p; \ 87 __be32 *p; \
98 __be32 status 88 __be32 status
@@ -254,20 +244,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
254 DECODE_TAIL; 244 DECODE_TAIL;
255} 245}
256 246
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
269static __be32 247static __be32
270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, 248nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
271 struct iattr *iattr, struct nfs4_acl **acl) 249 struct iattr *iattr, struct nfs4_acl **acl)
272{ 250{
273 int expected_len, len = 0; 251 int expected_len, len = 0;
@@ -280,18 +258,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
280 if ((status = nfsd4_decode_bitmap(argp, bmval))) 258 if ((status = nfsd4_decode_bitmap(argp, bmval)))
281 return status; 259 return status;
282 260
283 /*
284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
285 * read-only attributes return ERR_INVAL.
286 */
287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
290 return nfserr_attrnotsupp;
291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
293 return nfserr_inval;
294
295 READ_BUF(4); 261 READ_BUF(4);
296 READ32(expected_len); 262 READ32(expected_len);
297 263
@@ -424,8 +390,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
424 goto xdr_error; 390 goto xdr_error;
425 } 391 }
426 } 392 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */ 393 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
428 if (len != expected_len) 394 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
395 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
396 READ_BUF(expected_len - len);
397 else if (len != expected_len)
429 goto xdr_error; 398 goto xdr_error;
430 399
431 DECODE_TAIL; 400 DECODE_TAIL;
@@ -518,8 +487,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 487 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
519 return status; 488 return status;
520 489
521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask, 490 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
522 &create->cr_iattr, &create->cr_acl); 491 &create->cr_acl);
523 if (status) 492 if (status)
524 goto out; 493 goto out;
525 494
@@ -682,7 +651,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
682 case NFS4_CREATE_UNCHECKED: 651 case NFS4_CREATE_UNCHECKED:
683 case NFS4_CREATE_GUARDED: 652 case NFS4_CREATE_GUARDED:
684 status = nfsd4_decode_fattr(argp, open->op_bmval, 653 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl); 654 &open->op_iattr, &open->op_acl);
686 if (status) 655 if (status)
687 goto out; 656 goto out;
688 break; 657 break;
@@ -696,8 +665,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
696 READ_BUF(8); 665 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8); 666 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval, 667 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr, 668 &open->op_iattr, &open->op_acl);
700 &open->op_acl);
701 if (status) 669 if (status)
702 goto out; 670 goto out;
703 break; 671 break;
@@ -893,8 +861,8 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 861 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
894 if (status) 862 if (status)
895 return status; 863 return status;
896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask, 864 return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
897 &setattr->sa_iattr, &setattr->sa_acl); 865 &setattr->sa_acl);
898} 866}
899 867
900static __be32 868static __be32
@@ -1328,64 +1296,64 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1328}; 1296};
1329 1297
1330static nfsd4_dec nfsd41_dec_ops[] = { 1298static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, 1299 [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, 1300 [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, 1301 [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, 1302 [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, 1303 [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, 1304 [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, 1305 [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, 1306 [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link, 1307 [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, 1308 [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, 1309 [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, 1310 [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, 1311 [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, 1313 [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, 1314 [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, 1315 [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp, 1316 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, 1317 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, 1318 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, 1319 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, 1320 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read, 1321 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, 1322 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, 1323 [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, 1324 [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, 1325 [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp, 1326 [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, 1327 [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, 1328 [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, 1329 [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, 1330 [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, 1331 [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp, 1332 [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, 1333 [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, 1334 [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp, 1335 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
1368 1336
1369 /* new operations for NFSv4.1 */ 1337 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp, 1338 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp, 1339 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id, 1340 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session, 1341 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session, 1342 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, 1343 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, 1344 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp, 1345 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp, 1346 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp, 1347 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp, 1348 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp, 1349 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp, 1350 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence, 1351 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp, 1352 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, 1353 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, 1354 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, 1355 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp, 1356 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp,
1389}; 1357};
1390 1358
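Same cleanup as nfsd_procedures4 above, but here the fix is purely syntactic: `[OP_ACCESS] (nfsd4_dec)fn` uses an obsolete GCC extension (a designator with no `=`), whereas `[OP_ACCESS] = (nfsd4_dec)fn` is standard C99. The two mean exactly the same thing:

    int main(void)
    {
        /* obsolete GCC spelling:  int tbl[4] = { [2] 42 };  (no '=') */
        int tbl[4] = { [2] = 42 };  /* standard C99: {0, 0, 42, 0}    */
        return tbl[2] == 42 ? 0 : 1;
    }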
1391struct nfsd4_minorversion_ops { 1359struct nfsd4_minorversion_ops {
@@ -1489,21 +1457,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1489 1457
1490 DECODE_TAIL; 1458 DECODE_TAIL;
1491} 1459}
1492/*
1493 * END OF "GENERIC" DECODE ROUTINES.
1494 */
1495
1496/*
1497 * START OF "GENERIC" ENCODE ROUTINES.
1498 * These may look a little ugly since they are imported from a "generic"
1499 * set of XDR encode/decode routines which are intended to be shared by
1500 * all of our NFSv4 implementations (OpenBSD, MacOS X...).
1501 *
1502 * If the pain of reading these is too great, it should be a straightforward
1503 * task to translate them into Linux-specific versions which are more
1504 * consistent with the style used in NFSv2/v3...
1505 */
1506#define ENCODE_HEAD __be32 *p
1507 1460
1508#define WRITE32(n) *p++ = htonl(n) 1461#define WRITE32(n) *p++ = htonl(n)
1509#define WRITE64(n) do { \ 1462#define WRITE64(n) do { \
@@ -1515,13 +1468,41 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1515 memcpy(p, ptr, nbytes); \ 1468 memcpy(p, ptr, nbytes); \
1516 p += XDR_QUADLEN(nbytes); \ 1469 p += XDR_QUADLEN(nbytes); \
1517}} while (0) 1470}} while (0)
1518#define WRITECINFO(c) do { \ 1471
1519 *p++ = htonl(c.atomic); \ 1472static void write32(__be32 **p, u32 n)
1520 *p++ = htonl(c.before_ctime_sec); \ 1473{
1521 *p++ = htonl(c.before_ctime_nsec); \ 1474 *(*p)++ = n;
1522 *p++ = htonl(c.after_ctime_sec); \ 1475}
1523 *p++ = htonl(c.after_ctime_nsec); \ 1476
1524} while (0) 1477static void write64(__be32 **p, u64 n)
1478{
1479 write32(p, (u32)(n >> 32));
1480 write32(p, (u32)n);
1481}
1482
1483static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
1484{
1485 if (IS_I_VERSION(inode)) {
1486 write64(p, inode->i_version);
1487 } else {
1488 write32(p, stat->ctime.tv_sec);
1489 write32(p, stat->ctime.tv_nsec);
1490 }
1491}
1492
1493static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1494{
1495 write32(p, c->atomic);
1496 if (c->change_supported) {
1497 write64(p, c->before_change);
1498 write64(p, c->after_change);
1499 } else {
1500 write32(p, c->before_ctime_sec);
1501 write32(p, c->before_ctime_nsec);
1502 write32(p, c->after_ctime_sec);
1503 write32(p, c->after_ctime_nsec);
1504 }
1505}
1525 1506
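The WRITECINFO macro becomes the write32()/write64()/write_cinfo() helpers, and change_info can now carry a genuine 64-bit before/after change attribute when the filesystem maintains one (c->change_supported, matching write_change()'s use of i_version), falling back to the old ctime sec/nsec pairs otherwise. An XDR hyper is simply two 32-bit words, most significant first; a compilable sketch of that split (byte-swapping omitted — the real helpers deal in __be32):

    #include <stdint.h>
    #include <stdio.h>

    static void put32(uint32_t **p, uint32_t n) { *(*p)++ = n; }

    static void put64(uint32_t **p, uint64_t n)
    {
        put32(p, (uint32_t)(n >> 32)); /* high word first */
        put32(p, (uint32_t)n);         /* then low word   */
    }

    int main(void)
    {
        uint32_t buf[2], *p = buf;
        put64(&p, 0x1122334455667788ULL);
        printf("%08x %08x\n", buf[0], buf[1]); /* 11223344 55667788 */
        return 0;
    }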
1526#define RESERVE_SPACE(nbytes) do { \ 1507#define RESERVE_SPACE(nbytes) do { \
1527 p = resp->p; \ 1508 p = resp->p; \
@@ -1874,16 +1855,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1874 WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); 1855 WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
1875 } 1856 }
1876 if (bmval0 & FATTR4_WORD0_CHANGE) { 1857 if (bmval0 & FATTR4_WORD0_CHANGE) {
1877 /*
1878 * Note: This _must_ be consistent with the scheme for writing
1879 * change_info, so any changes made here must be reflected there
1880 * as well. (See xdr4.h:set_change_info() and the WRITECINFO()
1881 * macro above.)
1882 */
1883 if ((buflen -= 8) < 0) 1858 if ((buflen -= 8) < 0)
1884 goto out_resource; 1859 goto out_resource;
1885 WRITE32(stat.ctime.tv_sec); 1860 write_change(&p, &stat, dentry->d_inode);
1886 WRITE32(stat.ctime.tv_nsec);
1887 } 1861 }
1888 if (bmval0 & FATTR4_WORD0_SIZE) { 1862 if (bmval0 & FATTR4_WORD0_SIZE) {
1889 if ((buflen -= 8) < 0) 1863 if ((buflen -= 8) < 0)
@@ -2348,7 +2322,7 @@ fail:
2348static void 2322static void
2349nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) 2323nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
2350{ 2324{
2351 ENCODE_HEAD; 2325 __be32 *p;
2352 2326
2353 RESERVE_SPACE(sizeof(stateid_t)); 2327 RESERVE_SPACE(sizeof(stateid_t));
2354 WRITE32(sid->si_generation); 2328 WRITE32(sid->si_generation);
@@ -2359,7 +2333,7 @@ nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
2359static __be32 2333static __be32
2360nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) 2334nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
2361{ 2335{
2362 ENCODE_HEAD; 2336 __be32 *p;
2363 2337
2364 if (!nfserr) { 2338 if (!nfserr) {
2365 RESERVE_SPACE(8); 2339 RESERVE_SPACE(8);
@@ -2386,7 +2360,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
2386static __be32 2360static __be32
2387nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) 2361nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
2388{ 2362{
2389 ENCODE_HEAD; 2363 __be32 *p;
2390 2364
2391 if (!nfserr) { 2365 if (!nfserr) {
2392 RESERVE_SPACE(8); 2366 RESERVE_SPACE(8);
@@ -2399,11 +2373,11 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2399static __be32 2373static __be32
2400nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) 2374nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
2401{ 2375{
2402 ENCODE_HEAD; 2376 __be32 *p;
2403 2377
2404 if (!nfserr) { 2378 if (!nfserr) {
2405 RESERVE_SPACE(32); 2379 RESERVE_SPACE(32);
2406 WRITECINFO(create->cr_cinfo); 2380 write_cinfo(&p, &create->cr_cinfo);
2407 WRITE32(2); 2381 WRITE32(2);
2408 WRITE32(create->cr_bmval[0]); 2382 WRITE32(create->cr_bmval[0]);
2409 WRITE32(create->cr_bmval[1]); 2383 WRITE32(create->cr_bmval[1]);
@@ -2435,7 +2409,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
2435{ 2409{
2436 struct svc_fh *fhp = *fhpp; 2410 struct svc_fh *fhp = *fhpp;
2437 unsigned int len; 2411 unsigned int len;
2438 ENCODE_HEAD; 2412 __be32 *p;
2439 2413
2440 if (!nfserr) { 2414 if (!nfserr) {
2441 len = fhp->fh_handle.fh_size; 2415 len = fhp->fh_handle.fh_size;
@@ -2454,7 +2428,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
2454static void 2428static void
2455nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) 2429nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
2456{ 2430{
2457 ENCODE_HEAD; 2431 __be32 *p;
2458 2432
2459 RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); 2433 RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0));
2460 WRITE64(ld->ld_start); 2434 WRITE64(ld->ld_start);
@@ -2510,11 +2484,11 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2510static __be32 2484static __be32
2511nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) 2485nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
2512{ 2486{
2513 ENCODE_HEAD; 2487 __be32 *p;
2514 2488
2515 if (!nfserr) { 2489 if (!nfserr) {
2516 RESERVE_SPACE(20); 2490 RESERVE_SPACE(20);
2517 WRITECINFO(link->li_cinfo); 2491 write_cinfo(&p, &link->li_cinfo);
2518 ADJUST_ARGS(); 2492 ADJUST_ARGS();
2519 } 2493 }
2520 return nfserr; 2494 return nfserr;
@@ -2524,7 +2498,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
2524static __be32 2498static __be32
2525nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) 2499nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
2526{ 2500{
2527 ENCODE_HEAD; 2501 __be32 *p;
2528 ENCODE_SEQID_OP_HEAD; 2502 ENCODE_SEQID_OP_HEAD;
2529 2503
2530 if (nfserr) 2504 if (nfserr)
@@ -2532,7 +2506,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2532 2506
2533 nfsd4_encode_stateid(resp, &open->op_stateid); 2507 nfsd4_encode_stateid(resp, &open->op_stateid);
2534 RESERVE_SPACE(40); 2508 RESERVE_SPACE(40);
2535 WRITECINFO(open->op_cinfo); 2509 write_cinfo(&p, &open->op_cinfo);
2536 WRITE32(open->op_rflags); 2510 WRITE32(open->op_rflags);
2537 WRITE32(2); 2511 WRITE32(2);
2538 WRITE32(open->op_bmval[0]); 2512 WRITE32(open->op_bmval[0]);
@@ -2619,7 +2593,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2619 int v, pn; 2593 int v, pn;
2620 unsigned long maxcount; 2594 unsigned long maxcount;
2621 long len; 2595 long len;
2622 ENCODE_HEAD; 2596 __be32 *p;
2623 2597
2624 if (nfserr) 2598 if (nfserr)
2625 return nfserr; 2599 return nfserr;
@@ -2681,7 +2655,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
2681{ 2655{
2682 int maxcount; 2656 int maxcount;
2683 char *page; 2657 char *page;
2684 ENCODE_HEAD; 2658 __be32 *p;
2685 2659
2686 if (nfserr) 2660 if (nfserr)
2687 return nfserr; 2661 return nfserr;
@@ -2730,7 +2704,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2730 int maxcount; 2704 int maxcount;
2731 loff_t offset; 2705 loff_t offset;
2732 __be32 *page, *savep, *tailbase; 2706 __be32 *page, *savep, *tailbase;
2733 ENCODE_HEAD; 2707 __be32 *p;
2734 2708
2735 if (nfserr) 2709 if (nfserr)
2736 return nfserr; 2710 return nfserr;
@@ -2806,11 +2780,11 @@ err_no_verf:
2806static __be32 2780static __be32
2807nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) 2781nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
2808{ 2782{
2809 ENCODE_HEAD; 2783 __be32 *p;
2810 2784
2811 if (!nfserr) { 2785 if (!nfserr) {
2812 RESERVE_SPACE(20); 2786 RESERVE_SPACE(20);
2813 WRITECINFO(remove->rm_cinfo); 2787 write_cinfo(&p, &remove->rm_cinfo);
2814 ADJUST_ARGS(); 2788 ADJUST_ARGS();
2815 } 2789 }
2816 return nfserr; 2790 return nfserr;
@@ -2819,12 +2793,12 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2819static __be32 2793static __be32
2820nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) 2794nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
2821{ 2795{
2822 ENCODE_HEAD; 2796 __be32 *p;
2823 2797
2824 if (!nfserr) { 2798 if (!nfserr) {
2825 RESERVE_SPACE(40); 2799 RESERVE_SPACE(40);
2826 WRITECINFO(rename->rn_sinfo); 2800 write_cinfo(&p, &rename->rn_sinfo);
2827 WRITECINFO(rename->rn_tinfo); 2801 write_cinfo(&p, &rename->rn_tinfo);
2828 ADJUST_ARGS(); 2802 ADJUST_ARGS();
2829 } 2803 }
2830 return nfserr; 2804 return nfserr;
@@ -2839,7 +2813,7 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2839 u32 nflavs; 2813 u32 nflavs;
2840 struct exp_flavor_info *flavs; 2814 struct exp_flavor_info *flavs;
2841 struct exp_flavor_info def_flavs[2]; 2815 struct exp_flavor_info def_flavs[2];
2842 ENCODE_HEAD; 2816 __be32 *p;
2843 2817
2844 if (nfserr) 2818 if (nfserr)
2845 goto out; 2819 goto out;
@@ -2904,7 +2878,7 @@ out:
2904static __be32 2878static __be32
2905nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) 2879nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
2906{ 2880{
2907 ENCODE_HEAD; 2881 __be32 *p;
2908 2882
2909 RESERVE_SPACE(12); 2883 RESERVE_SPACE(12);
2910 if (nfserr) { 2884 if (nfserr) {
@@ -2924,7 +2898,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2924static __be32 2898static __be32
2925nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) 2899nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
2926{ 2900{
2927 ENCODE_HEAD; 2901 __be32 *p;
2928 2902
2929 if (!nfserr) { 2903 if (!nfserr) {
2930 RESERVE_SPACE(8 + sizeof(nfs4_verifier)); 2904 RESERVE_SPACE(8 + sizeof(nfs4_verifier));
@@ -2944,7 +2918,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
2944static __be32 2918static __be32
2945nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) 2919nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
2946{ 2920{
2947 ENCODE_HEAD; 2921 __be32 *p;
2948 2922
2949 if (!nfserr) { 2923 if (!nfserr) {
2950 RESERVE_SPACE(16); 2924 RESERVE_SPACE(16);
@@ -2960,7 +2934,7 @@ static __be32
2960nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, 2934nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2961 struct nfsd4_exchange_id *exid) 2935 struct nfsd4_exchange_id *exid)
2962{ 2936{
2963 ENCODE_HEAD; 2937 __be32 *p;
2964 char *major_id; 2938 char *major_id;
2965 char *server_scope; 2939 char *server_scope;
2966 int major_id_sz; 2940 int major_id_sz;
@@ -3015,7 +2989,7 @@ static __be32
3015nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, 2989nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3016 struct nfsd4_create_session *sess) 2990 struct nfsd4_create_session *sess)
3017{ 2991{
3018 ENCODE_HEAD; 2992 __be32 *p;
3019 2993
3020 if (nfserr) 2994 if (nfserr)
3021 return nfserr; 2995 return nfserr;
@@ -3071,7 +3045,7 @@ __be32
3071nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, 3045nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3072 struct nfsd4_sequence *seq) 3046 struct nfsd4_sequence *seq)
3073{ 3047{
3074 ENCODE_HEAD; 3048 __be32 *p;
3075 3049
3076 if (nfserr) 3050 if (nfserr)
3077 return nfserr; 3051 return nfserr;
@@ -3209,7 +3183,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3209 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, 3183 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3210 length, xb->page_len, tlen, pad); 3184 length, xb->page_len, tlen, pad);
3211 3185
3212 if (length <= session->se_fmaxresp_cached) 3186 if (length <= session->se_fchannel.maxresp_cached)
3213 return status; 3187 return status;
3214 else 3188 else
3215 return nfserr_rep_too_big_to_cache; 3189 return nfserr_rep_too_big_to_cache;
@@ -3219,7 +3193,7 @@ void
3219nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3193nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3220{ 3194{
3221 __be32 *statp; 3195 __be32 *statp;
3222 ENCODE_HEAD; 3196 __be32 *p;
3223 3197
3224 RESERVE_SPACE(8); 3198 RESERVE_SPACE(8);
3225 WRITE32(op->opnum); 3199 WRITE32(op->opnum);
@@ -3253,7 +3227,7 @@ status:
3253void 3227void
3254nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3228nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3255{ 3229{
3256 ENCODE_HEAD; 3230 __be32 *p;
3257 struct nfs4_replay *rp = op->replay; 3231 struct nfs4_replay *rp = op->replay;
3258 3232
3259 BUG_ON(!rp); 3233 BUG_ON(!rp);
@@ -3268,10 +3242,6 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3268 ADJUST_ARGS(); 3242 ADJUST_ARGS();
3269} 3243}
3270 3244
3271/*
3272 * END OF "GENERIC" ENCODE ROUTINES.
3273 */
3274
3275int 3245int
3276nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) 3246nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
3277{ 3247{
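
The nfs4xdr.c hunks above mechanically expand two macros: ENCODE_HEAD becomes an explicit "__be32 *p;" local, and WRITECINFO(c) becomes a call to the write_cinfo(&p, &c) helper. For illustration only, the shared reserve/write/adjust shape of these encoders can be modelled in stand-alone C (a plain array stands in for struct nfsd4_compoundres, and htonl() for the kernel's byte-order conversion; this sketch is not part of the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    static uint32_t xdrbuf[64];

    static int encode_example(uint32_t rflags, uint32_t bmval0)
    {
            uint32_t *p = xdrbuf;     /* was hidden inside ENCODE_HEAD */

            *p++ = htonl(rflags);     /* WRITE32(open->op_rflags) */
            *p++ = htonl(2);          /* bitmap word count, as in the hunk */
            *p++ = htonl(bmval0);     /* WRITE32(open->op_bmval[0]) */
            return (int)(p - xdrbuf); /* ADJUST_ARGS() advances resp->p */
    }

    int main(void)
    {
            printf("encoded %d words\n", encode_example(0x1, 0x7));
            return 0;
    }
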
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5bfc2ac60d54..4638635c5d87 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,15 +29,24 @@
29 */ 29 */
30#define CACHESIZE 1024 30#define CACHESIZE 1024
31#define HASHSIZE 64 31#define HASHSIZE 64
32#define REQHASH(xid) (((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
33 32
34static struct hlist_head * hash_list; 33static struct hlist_head * cache_hash;
35static struct list_head lru_head; 34static struct list_head lru_head;
36static int cache_disabled = 1; 35static int cache_disabled = 1;
37 36
37/*
38 * Calculate the hash index from an XID.
39 */
40static inline u32 request_hash(u32 xid)
41{
42 u32 h = xid;
43 h ^= (xid >> 24);
44 return h & (HASHSIZE-1);
45}
46
38static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); 47static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
39 48
40/* 49/*
41 * locking for the reply cache: 50 * locking for the reply cache:
42 * A cache entry is "single use" if c_state == RC_INPROG 51 * A cache entry is "single use" if c_state == RC_INPROG
43 * Otherwise, when accessing _prev or _next, the lock must be held. 52 * Otherwise, when accessing _prev or _next, the lock must be held.
@@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void)
62 i--; 71 i--;
63 } 72 }
64 73
65 hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); 74 cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
66 if (!hash_list) 75 if (!cache_hash)
67 goto out_nomem; 76 goto out_nomem;
68 77
69 cache_disabled = 0; 78 cache_disabled = 0;
@@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void)
88 97
89 cache_disabled = 1; 98 cache_disabled = 1;
90 99
91 kfree (hash_list); 100 kfree (cache_hash);
92 hash_list = NULL; 101 cache_hash = NULL;
93} 102}
94 103
95/* 104/*
@@ -108,7 +117,7 @@ static void
108hash_refile(struct svc_cacherep *rp) 117hash_refile(struct svc_cacherep *rp)
109{ 118{
110 hlist_del_init(&rp->c_hash); 119 hlist_del_init(&rp->c_hash);
111 hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid)); 120 hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
112} 121}
113 122
114/* 123/*
@@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
138 spin_lock(&cache_lock); 147 spin_lock(&cache_lock);
139 rtn = RC_DOIT; 148 rtn = RC_DOIT;
140 149
141 rh = &hash_list[REQHASH(xid)]; 150 rh = &cache_hash[request_hash(xid)];
142 hlist_for_each_entry(rp, hn, rh, c_hash) { 151 hlist_for_each_entry(rp, hn, rh, c_hash) {
143 if (rp->c_state != RC_UNUSED && 152 if (rp->c_state != RC_UNUSED &&
144 xid == rp->c_xid && proc == rp->c_proc && 153 xid == rp->c_xid && proc == rp->c_proc &&
@@ -165,8 +174,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
165 } 174 }
166 } 175 }
167 176
168 /* This should not happen */ 177 /* All entries on the LRU are in-progress. This should not happen */
169 if (rp == NULL) { 178 if (&rp->c_lru == &lru_head) {
170 static int complaints; 179 static int complaints;
171 180
172 printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); 181 printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
@@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
264 273
265 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); 274 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
266 len >>= 2; 275 len >>= 2;
267 276
268 /* Don't cache excessive amounts of data and XDR failures */ 277 /* Don't cache excessive amounts of data and XDR failures */
269 if (!statp || len > (256 >> 2)) { 278 if (!statp || len > (256 >> 2)) {
270 rp->c_state = RC_UNUSED; 279 rp->c_state = RC_UNUSED;
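
The nfscache.c change replaces the REQHASH() macro with a typed inline function and renames hash_list to cache_hash. The hash itself is unchanged and small enough to compile stand-alone; a sketch (uint32_t standing in for the kernel's u32):

    #include <stdint.h>
    #include <stdio.h>

    #define HASHSIZE 64    /* same value as in fs/nfsd/nfscache.c */

    static inline uint32_t request_hash(uint32_t xid)
    {
            uint32_t h = xid;

            h ^= (xid >> 24);          /* fold the high byte in */
            return h & (HASHSIZE - 1);
    }

    int main(void)
    {
            printf("%u\n", request_hash(0xdeadbeefU)); /* bucket index < 64 */
            return 0;
    }
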
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index af16849d243a..6d0847562d87 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/inet.h> 26#include <linux/inet.h>
27#include <linux/string.h> 27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/ctype.h> 28#include <linux/ctype.h>
30 29
31#include <linux/nfs.h> 30#include <linux/nfs.h>
@@ -207,10 +206,14 @@ static struct file_operations pool_stats_operations = {
207static ssize_t write_svc(struct file *file, char *buf, size_t size) 206static ssize_t write_svc(struct file *file, char *buf, size_t size)
208{ 207{
209 struct nfsctl_svc *data; 208 struct nfsctl_svc *data;
209 int err;
210 if (size < sizeof(*data)) 210 if (size < sizeof(*data))
211 return -EINVAL; 211 return -EINVAL;
212 data = (struct nfsctl_svc*) buf; 212 data = (struct nfsctl_svc*) buf;
213 return nfsd_svc(data->svc_port, data->svc_nthreads); 213 err = nfsd_svc(data->svc_port, data->svc_nthreads);
214 if (err < 0)
215 return err;
216 return 0;
214} 217}
215 218
216/** 219/**
@@ -692,11 +695,12 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
692 if (newthreads < 0) 695 if (newthreads < 0)
693 return -EINVAL; 696 return -EINVAL;
694 rv = nfsd_svc(NFS_PORT, newthreads); 697 rv = nfsd_svc(NFS_PORT, newthreads);
695 if (rv) 698 if (rv < 0)
696 return rv; 699 return rv;
697 } 700 } else
698 sprintf(buf, "%d\n", nfsd_nrthreads()); 701 rv = nfsd_nrthreads();
699 return strlen(buf); 702
703 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
700} 704}
701 705
702/** 706/**
@@ -793,7 +797,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
793{ 797{
794 char *mesg = buf; 798 char *mesg = buf;
795 char *vers, *minorp, sign; 799 char *vers, *minorp, sign;
796 int len, num; 800 int len, num, remaining;
797 unsigned minor; 801 unsigned minor;
798 ssize_t tlen = 0; 802 ssize_t tlen = 0;
799 char *sep; 803 char *sep;
@@ -840,32 +844,50 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
840 } 844 }
841 next: 845 next:
842 vers += len + 1; 846 vers += len + 1;
843 tlen += len;
844 } while ((len = qword_get(&mesg, vers, size)) > 0); 847 } while ((len = qword_get(&mesg, vers, size)) > 0);
845 /* If all get turned off, turn them back on, as 848 /* If all get turned off, turn them back on, as
846 * having no versions is BAD 849 * having no versions is BAD
847 */ 850 */
848 nfsd_reset_versions(); 851 nfsd_reset_versions();
849 } 852 }
853
850 /* Now write current state into reply buffer */ 854 /* Now write current state into reply buffer */
851 len = 0; 855 len = 0;
852 sep = ""; 856 sep = "";
857 remaining = SIMPLE_TRANSACTION_LIMIT;
853 for (num=2 ; num <= 4 ; num++) 858 for (num=2 ; num <= 4 ; num++)
854 if (nfsd_vers(num, NFSD_AVAIL)) { 859 if (nfsd_vers(num, NFSD_AVAIL)) {
855 len += sprintf(buf+len, "%s%c%d", sep, 860 len = snprintf(buf, remaining, "%s%c%d", sep,
856 nfsd_vers(num, NFSD_TEST)?'+':'-', 861 nfsd_vers(num, NFSD_TEST)?'+':'-',
857 num); 862 num);
858 sep = " "; 863 sep = " ";
864
865 if (len > remaining)
866 break;
867 remaining -= len;
868 buf += len;
869 tlen += len;
859 } 870 }
860 if (nfsd_vers(4, NFSD_AVAIL)) 871 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) 872 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
862 len += sprintf(buf+len, " %c4.%u", 873 minor++) {
874 len = snprintf(buf, remaining, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) && 875 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ? 876 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-', 877 '+' : '-',
866 minor); 878 minor);
867 len += sprintf(buf+len, "\n"); 879
868 return len; 880 if (len > remaining)
881 break;
882 remaining -= len;
883 buf += len;
884 tlen += len;
885 }
886
887 len = snprintf(buf, remaining, "\n");
888 if (len > remaining)
889 return -EINVAL;
890 return tlen + len;
869} 891}
870 892
871/** 893/**
@@ -910,104 +932,143 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
910 return rv; 932 return rv;
911} 933}
912 934
913static ssize_t __write_ports(struct file *file, char *buf, size_t size) 935/*
936 * Zero-length write. Return a list of NFSD's current listener
937 * transports.
938 */
939static ssize_t __write_ports_names(char *buf)
914{ 940{
915 if (size == 0) { 941 if (nfsd_serv == NULL)
916 int len = 0; 942 return 0;
943 return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
944}
917 945
918 if (nfsd_serv) 946/*
919 len = svc_xprt_names(nfsd_serv, buf, 0); 947 * A single 'fd' number was written, in which case it must be for
920 return len; 948 * a socket of a supported family/protocol, and we use it as an
921 } 949 * nfsd listener.
922 /* Either a single 'fd' number is written, in which 950 */
923 * case it must be for a socket of a supported family/protocol, 951static ssize_t __write_ports_addfd(char *buf)
924 * and we use it as an nfsd socket, or 952{
925 * A '-' followed by the 'name' of a socket in which case 953 char *mesg = buf;
926 * we close the socket. 954 int fd, err;
927 */ 955
928 if (isdigit(buf[0])) { 956 err = get_int(&mesg, &fd);
929 char *mesg = buf; 957 if (err != 0 || fd < 0)
930 int fd; 958 return -EINVAL;
931 int err; 959
932 err = get_int(&mesg, &fd); 960 err = nfsd_create_serv();
933 if (err) 961 if (err != 0)
934 return -EINVAL; 962 return err;
935 if (fd < 0) 963
936 return -EINVAL; 964 err = lockd_up();
937 err = nfsd_create_serv(); 965 if (err != 0)
938 if (!err) { 966 goto out;
939 err = svc_addsock(nfsd_serv, fd, buf); 967
940 if (err >= 0) { 968 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
941 err = lockd_up(); 969 if (err < 0)
942 if (err < 0) 970 lockd_down();
943 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); 971
944 } 972out:
945 /* Decrease the count, but don't shutdown the 973 /* Decrease the count, but don't shut down the service */
946 * the service 974 nfsd_serv->sv_nrthreads--;
947 */ 975 return err;
948 nfsd_serv->sv_nrthreads--; 976}
949 } 977
950 return err < 0 ? err : 0; 978/*
951 } 979 * A '-' followed by the 'name' of a socket means we close the socket.
952 if (buf[0] == '-' && isdigit(buf[1])) { 980 */
953 char *toclose = kstrdup(buf+1, GFP_KERNEL); 981static ssize_t __write_ports_delfd(char *buf)
954 int len = 0; 982{
955 if (!toclose) 983 char *toclose;
956 return -ENOMEM; 984 int len = 0;
957 if (nfsd_serv) 985
958 len = svc_sock_names(buf, nfsd_serv, toclose); 986 toclose = kstrdup(buf + 1, GFP_KERNEL);
959 if (len >= 0) 987 if (toclose == NULL)
960 lockd_down(); 988 return -ENOMEM;
961 kfree(toclose); 989
962 return len; 990 if (nfsd_serv != NULL)
963 } 991 len = svc_sock_names(nfsd_serv, buf,
964 /* 992 SIMPLE_TRANSACTION_LIMIT, toclose);
965 * Add a transport listener by writing its transport name 993 if (len >= 0)
966 */ 994 lockd_down();
967 if (isalpha(buf[0])) { 995
968 int err; 996 kfree(toclose);
969 char transport[16]; 997 return len;
970 int port; 998}
971 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 999
972 if (port < 1 || port > 65535) 1000/*
973 return -EINVAL; 1001 * A transport listener is added by writing its transport name and
974 err = nfsd_create_serv(); 1002 * a port number.
975 if (!err) { 1003 */
976 err = svc_create_xprt(nfsd_serv, 1004static ssize_t __write_ports_addxprt(char *buf)
977 transport, PF_INET, port, 1005{
978 SVC_SOCK_ANONYMOUS); 1006 char transport[16];
979 if (err == -ENOENT) 1007 int port, err;
980 /* Give a reasonable perror msg for 1008
981 * bad transport string */ 1009 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
982 err = -EPROTONOSUPPORT; 1010 return -EINVAL;
983 } 1011
984 return err < 0 ? err : 0; 1012 if (port < 1 || port > USHORT_MAX)
985 } 1013 return -EINVAL;
986 } 1014
987 /* 1015 err = nfsd_create_serv();
988 * Remove a transport by writing its transport name and port number 1016 if (err != 0)
989 */ 1017 return err;
990 if (buf[0] == '-' && isalpha(buf[1])) { 1018
991 struct svc_xprt *xprt; 1019 err = svc_create_xprt(nfsd_serv, transport,
992 int err = -EINVAL; 1020 PF_INET, port, SVC_SOCK_ANONYMOUS);
993 char transport[16]; 1021 if (err < 0) {
994 int port; 1022 /* Give a reasonable perror msg for bad transport string */
995 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 1023 if (err == -ENOENT)
996 if (port < 1 || port > 65535) 1024 err = -EPROTONOSUPPORT;
997 return -EINVAL; 1025 return err;
998 if (nfsd_serv) {
999 xprt = svc_find_xprt(nfsd_serv, transport,
1000 AF_UNSPEC, port);
1001 if (xprt) {
1002 svc_close_xprt(xprt);
1003 svc_xprt_put(xprt);
1004 err = 0;
1005 } else
1006 err = -ENOTCONN;
1007 }
1008 return err < 0 ? err : 0;
1009 }
1010 } 1026 }
1027 return 0;
1028}
1029
1030/*
1031 * A transport listener is removed by writing a "-", its transport
1032 * name, and its port number.
1033 */
1034static ssize_t __write_ports_delxprt(char *buf)
1035{
1036 struct svc_xprt *xprt;
1037 char transport[16];
1038 int port;
1039
1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1041 return -EINVAL;
1042
1043 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
1044 return -EINVAL;
1045
1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
1047 if (xprt == NULL)
1048 return -ENOTCONN;
1049
1050 svc_close_xprt(xprt);
1051 svc_xprt_put(xprt);
1052 return 0;
1053}
1054
1055static ssize_t __write_ports(struct file *file, char *buf, size_t size)
1056{
1057 if (size == 0)
1058 return __write_ports_names(buf);
1059
1060 if (isdigit(buf[0]))
1061 return __write_ports_addfd(buf);
1062
1063 if (buf[0] == '-' && isdigit(buf[1]))
1064 return __write_ports_delfd(buf);
1065
1066 if (isalpha(buf[0]))
1067 return __write_ports_addxprt(buf);
1068
1069 if (buf[0] == '-' && isalpha(buf[1]))
1070 return __write_ports_delxprt(buf);
1071
1011 return -EINVAL; 1072 return -EINVAL;
1012} 1073}
1013 1074
@@ -1030,7 +1091,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
1030 * buf: C string containing an unsigned 1091 * buf: C string containing an unsigned
1031 * integer value representing a bound 1092 * integer value representing a bound
1032 * but unconnected socket that is to be 1093 * but unconnected socket that is to be
1033 * used as an NFSD listener 1094 * used as an NFSD listener; listen(3)
1095 * must be called for a SOCK_STREAM
1096 * socket, otherwise it is ignored
1034 * size: non-zero length of C string in @buf 1097 * size: non-zero length of C string in @buf
1035 * Output: 1098 * Output:
1036 * On success: NFS service is started; 1099 * On success: NFS service is started;
@@ -1138,7 +1201,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1138 nfsd_max_blksize = bsize; 1201 nfsd_max_blksize = bsize;
1139 mutex_unlock(&nfsd_mutex); 1202 mutex_unlock(&nfsd_mutex);
1140 } 1203 }
1141 return sprintf(buf, "%d\n", nfsd_max_blksize); 1204
1205 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n",
1206 nfsd_max_blksize);
1142} 1207}
1143 1208
1144#ifdef CONFIG_NFSD_V4 1209#ifdef CONFIG_NFSD_V4
@@ -1162,8 +1227,9 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1162 return -EINVAL; 1227 return -EINVAL;
1163 nfs4_reset_lease(lease); 1228 nfs4_reset_lease(lease);
1164 } 1229 }
1165 sprintf(buf, "%ld\n", nfs4_lease_time()); 1230
1166 return strlen(buf); 1231 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n",
1232 nfs4_lease_time());
1167} 1233}
1168 1234
1169/** 1235/**
@@ -1219,8 +1285,9 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1219 1285
1220 status = nfs4_reset_recoverydir(recdir); 1286 status = nfs4_reset_recoverydir(recdir);
1221 } 1287 }
1222 sprintf(buf, "%s\n", nfs4_recoverydir()); 1288
1223 return strlen(buf); 1289 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
1290 nfs4_recoverydir());
1224} 1291}
1225 1292
1226/** 1293/**
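
Throughout nfsctl.c the read-side formatting switches from unbounded sprintf() into the transaction buffer to snprintf()/scnprintf() bounded by SIMPLE_TRANSACTION_LIMIT. The distinction the __write_versions() loop relies on: snprintf() returns the length the output would have had, so "len > remaining" detects overflow, while the kernel's scnprintf() returns the bytes actually stored. A stand-alone model of that accumulation loop (toy LIMIT, user-space snprintf; for illustration only):

    #include <stdio.h>

    #define LIMIT 32    /* stands in for SIMPLE_TRANSACTION_LIMIT */

    int main(void)
    {
            char out[LIMIT], *buf = out;
            int remaining = LIMIT, tlen = 0, len, num;
            const char *sep = "";

            for (num = 2; num <= 4; num++) {
                    len = snprintf(buf, remaining, "%s+%d", sep, num);
                    sep = " ";
                    if (len > remaining)
                            break;    /* out of space: stop, as in the patch */
                    remaining -= len;
                    buf += len;
                    tlen += len;
            }
            printf("wrote %d bytes: %s\n", tlen, out);
            return 0;
    }
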
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 9f1ca17293d3..8847f3fbfc1e 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -27,9 +27,6 @@
27#define NFSDDBG_FACILITY NFSDDBG_FH 27#define NFSDDBG_FACILITY NFSDDBG_FH
28 28
29 29
30static int nfsd_nr_verified;
31static int nfsd_nr_put;
32
33/* 30/*
34 * our acceptability function. 31 * our acceptability function.
35 * if NOSUBTREECHECK, accept anything 32 * if NOSUBTREECHECK, accept anything
@@ -251,7 +248,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
251 248
252 fhp->fh_dentry = dentry; 249 fhp->fh_dentry = dentry;
253 fhp->fh_export = exp; 250 fhp->fh_export = exp;
254 nfsd_nr_verified++;
255 return 0; 251 return 0;
256out: 252out:
257 exp_put(exp); 253 exp_put(exp);
@@ -552,7 +548,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
552 return nfserr_opnotsupp; 548 return nfserr_opnotsupp;
553 } 549 }
554 550
555 nfsd_nr_verified++;
556 return 0; 551 return 0;
557} 552}
558 553
@@ -609,7 +604,6 @@ fh_put(struct svc_fh *fhp)
609 fhp->fh_pre_saved = 0; 604 fhp->fh_pre_saved = 0;
610 fhp->fh_post_saved = 0; 605 fhp->fh_post_saved = 0;
611#endif 606#endif
612 nfsd_nr_put++;
613 } 607 }
614 if (exp) { 608 if (exp) {
615 cache_put(&exp->h, &svc_export_cache); 609 cache_put(&exp->h, &svc_export_cache);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e298e260b5f1..0eb9c820b7a6 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -533,45 +533,179 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
533 * NFSv2 Server procedures. 533 * NFSv2 Server procedures.
534 * Only the results of non-idempotent operations are cached. 534 * Only the results of non-idempotent operations are cached.
535 */ 535 */
536#define nfsd_proc_none NULL
537#define nfssvc_release_none NULL
538struct nfsd_void { int dummy; }; 536struct nfsd_void { int dummy; };
539 537
540#define PROC(name, argt, rest, relt, cache, respsize) \
541 { (svc_procfunc) nfsd_proc_##name, \
542 (kxdrproc_t) nfssvc_decode_##argt, \
543 (kxdrproc_t) nfssvc_encode_##rest, \
544 (kxdrproc_t) nfssvc_release_##relt, \
545 sizeof(struct nfsd_##argt), \
546 sizeof(struct nfsd_##rest), \
547 0, \
548 cache, \
549 respsize, \
550 }
551
552#define ST 1 /* status */ 538#define ST 1 /* status */
553#define FH 8 /* filehandle */ 539#define FH 8 /* filehandle */
554#define AT 18 /* attributes */ 540#define AT 18 /* attributes */
555 541
556static struct svc_procedure nfsd_procedures2[18] = { 542static struct svc_procedure nfsd_procedures2[18] = {
557 PROC(null, void, void, none, RC_NOCACHE, ST), 543 [NFSPROC_NULL] = {
558 PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), 544 .pc_func = (svc_procfunc) nfsd_proc_null,
559 PROC(setattr, sattrargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), 545 .pc_decode = (kxdrproc_t) nfssvc_decode_void,
560 PROC(none, void, void, none, RC_NOCACHE, ST), 546 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
561 PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), 547 .pc_argsize = sizeof(struct nfsd_void),
562 PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), 548 .pc_ressize = sizeof(struct nfsd_void),
563 PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4), 549 .pc_cachetype = RC_NOCACHE,
564 PROC(none, void, void, none, RC_NOCACHE, ST), 550 .pc_xdrressize = ST,
565 PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), 551 },
566 PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), 552 [NFSPROC_GETATTR] = {
567 PROC(remove, diropargs, void, none, RC_REPLSTAT, ST), 553 .pc_func = (svc_procfunc) nfsd_proc_getattr,
568 PROC(rename, renameargs, void, none, RC_REPLSTAT, ST), 554 .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
569 PROC(link, linkargs, void, none, RC_REPLSTAT, ST), 555 .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
570 PROC(symlink, symlinkargs, void, none, RC_REPLSTAT, ST), 556 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
571 PROC(mkdir, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), 557 .pc_argsize = sizeof(struct nfsd_fhandle),
572 PROC(rmdir, diropargs, void, none, RC_REPLSTAT, ST), 558 .pc_ressize = sizeof(struct nfsd_attrstat),
573 PROC(readdir, readdirargs, readdirres, none, RC_NOCACHE, 0), 559 .pc_cachetype = RC_NOCACHE,
574 PROC(statfs, fhandle, statfsres, none, RC_NOCACHE, ST+5), 560 .pc_xdrressize = ST+AT,
561 },
562 [NFSPROC_SETATTR] = {
563 .pc_func = (svc_procfunc) nfsd_proc_setattr,
564 .pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs,
565 .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
566 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
567 .pc_argsize = sizeof(struct nfsd_sattrargs),
568 .pc_ressize = sizeof(struct nfsd_attrstat),
569 .pc_cachetype = RC_REPLBUFF,
570 .pc_xdrressize = ST+AT,
571 },
572 [NFSPROC_ROOT] = {
573 .pc_decode = (kxdrproc_t) nfssvc_decode_void,
574 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
575 .pc_argsize = sizeof(struct nfsd_void),
576 .pc_ressize = sizeof(struct nfsd_void),
577 .pc_cachetype = RC_NOCACHE,
578 .pc_xdrressize = ST,
579 },
580 [NFSPROC_LOOKUP] = {
581 .pc_func = (svc_procfunc) nfsd_proc_lookup,
582 .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
583 .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
584 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
585 .pc_argsize = sizeof(struct nfsd_diropargs),
586 .pc_ressize = sizeof(struct nfsd_diropres),
587 .pc_cachetype = RC_NOCACHE,
588 .pc_xdrressize = ST+FH+AT,
589 },
590 [NFSPROC_READLINK] = {
591 .pc_func = (svc_procfunc) nfsd_proc_readlink,
592 .pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs,
593 .pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres,
594 .pc_argsize = sizeof(struct nfsd_readlinkargs),
595 .pc_ressize = sizeof(struct nfsd_readlinkres),
596 .pc_cachetype = RC_NOCACHE,
597 .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
598 },
599 [NFSPROC_READ] = {
600 .pc_func = (svc_procfunc) nfsd_proc_read,
601 .pc_decode = (kxdrproc_t) nfssvc_decode_readargs,
602 .pc_encode = (kxdrproc_t) nfssvc_encode_readres,
603 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
604 .pc_argsize = sizeof(struct nfsd_readargs),
605 .pc_ressize = sizeof(struct nfsd_readres),
606 .pc_cachetype = RC_NOCACHE,
607 .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
608 },
609 [NFSPROC_WRITECACHE] = {
610 .pc_decode = (kxdrproc_t) nfssvc_decode_void,
611 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
612 .pc_argsize = sizeof(struct nfsd_void),
613 .pc_ressize = sizeof(struct nfsd_void),
614 .pc_cachetype = RC_NOCACHE,
615 .pc_xdrressize = ST,
616 },
617 [NFSPROC_WRITE] = {
618 .pc_func = (svc_procfunc) nfsd_proc_write,
619 .pc_decode = (kxdrproc_t) nfssvc_decode_writeargs,
620 .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
621 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
622 .pc_argsize = sizeof(struct nfsd_writeargs),
623 .pc_ressize = sizeof(struct nfsd_attrstat),
624 .pc_cachetype = RC_REPLBUFF,
625 .pc_xdrressize = ST+AT,
626 },
627 [NFSPROC_CREATE] = {
628 .pc_func = (svc_procfunc) nfsd_proc_create,
629 .pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
630 .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
631 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
632 .pc_argsize = sizeof(struct nfsd_createargs),
633 .pc_ressize = sizeof(struct nfsd_diropres),
634 .pc_cachetype = RC_REPLBUFF,
635 .pc_xdrressize = ST+FH+AT,
636 },
637 [NFSPROC_REMOVE] = {
638 .pc_func = (svc_procfunc) nfsd_proc_remove,
639 .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
640 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
641 .pc_argsize = sizeof(struct nfsd_diropargs),
642 .pc_ressize = sizeof(struct nfsd_void),
643 .pc_cachetype = RC_REPLSTAT,
644 .pc_xdrressize = ST,
645 },
646 [NFSPROC_RENAME] = {
647 .pc_func = (svc_procfunc) nfsd_proc_rename,
648 .pc_decode = (kxdrproc_t) nfssvc_decode_renameargs,
649 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
650 .pc_argsize = sizeof(struct nfsd_renameargs),
651 .pc_ressize = sizeof(struct nfsd_void),
652 .pc_cachetype = RC_REPLSTAT,
653 .pc_xdrressize = ST,
654 },
655 [NFSPROC_LINK] = {
656 .pc_func = (svc_procfunc) nfsd_proc_link,
657 .pc_decode = (kxdrproc_t) nfssvc_decode_linkargs,
658 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
659 .pc_argsize = sizeof(struct nfsd_linkargs),
660 .pc_ressize = sizeof(struct nfsd_void),
661 .pc_cachetype = RC_REPLSTAT,
662 .pc_xdrressize = ST,
663 },
664 [NFSPROC_SYMLINK] = {
665 .pc_func = (svc_procfunc) nfsd_proc_symlink,
666 .pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs,
667 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
668 .pc_argsize = sizeof(struct nfsd_symlinkargs),
669 .pc_ressize = sizeof(struct nfsd_void),
670 .pc_cachetype = RC_REPLSTAT,
671 .pc_xdrressize = ST,
672 },
673 [NFSPROC_MKDIR] = {
674 .pc_func = (svc_procfunc) nfsd_proc_mkdir,
675 .pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
676 .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
677 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
678 .pc_argsize = sizeof(struct nfsd_createargs),
679 .pc_ressize = sizeof(struct nfsd_diropres),
680 .pc_cachetype = RC_REPLBUFF,
681 .pc_xdrressize = ST+FH+AT,
682 },
683 [NFSPROC_RMDIR] = {
684 .pc_func = (svc_procfunc) nfsd_proc_rmdir,
685 .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
686 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
687 .pc_argsize = sizeof(struct nfsd_diropargs),
688 .pc_ressize = sizeof(struct nfsd_void),
689 .pc_cachetype = RC_REPLSTAT,
690 .pc_xdrressize = ST,
691 },
692 [NFSPROC_READDIR] = {
693 .pc_func = (svc_procfunc) nfsd_proc_readdir,
694 .pc_decode = (kxdrproc_t) nfssvc_decode_readdirargs,
695 .pc_encode = (kxdrproc_t) nfssvc_encode_readdirres,
696 .pc_argsize = sizeof(struct nfsd_readdirargs),
697 .pc_ressize = sizeof(struct nfsd_readdirres),
698 .pc_cachetype = RC_NOCACHE,
699 },
700 [NFSPROC_STATFS] = {
701 .pc_func = (svc_procfunc) nfsd_proc_statfs,
702 .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
703 .pc_encode = (kxdrproc_t) nfssvc_encode_statfsres,
704 .pc_argsize = sizeof(struct nfsd_fhandle),
705 .pc_ressize = sizeof(struct nfsd_statfsres),
706 .pc_cachetype = RC_NOCACHE,
707 .pc_xdrressize = ST+5,
708 },
575}; 709};
576 710
577 711
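
The nfsproc.c hunk converts the PROC() macro table to C99 designated initializers indexed by the NFSPROC_* constants, so unimplemented slots (NFSPROC_ROOT, NFSPROC_WRITECACHE) can simply leave .pc_func unset instead of pointing at an nfsd_proc_none placeholder. A toy table in the same style (made-up struct, not struct svc_procedure; for illustration only):

    #include <stdio.h>

    enum { PROC_NULL, PROC_GETATTR, PROC_ROOT, PROC_MAX };

    struct proc_entry {
            const char *name;
            int (*func)(void);    /* NULL for vacant slots */
    };

    static int do_null(void)    { return 0; }
    static int do_getattr(void) { return 0; }

    static const struct proc_entry procs[PROC_MAX] = {
            [PROC_NULL]    = { .name = "null",    .func = do_null },
            [PROC_GETATTR] = { .name = "getattr", .func = do_getattr },
            /* [PROC_ROOT] omitted: slot is zero-filled, .func stays NULL */
    };

    int main(void)
    {
            printf("%s has handler: %s\n", procs[PROC_GETATTR].name,
                   procs[PROC_GETATTR].func ? "yes" : "no");
            return 0;
    }
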
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cbba4a935786..492c79b7800b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -18,7 +18,6 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/smp_lock.h>
22#include <linux/freezer.h> 21#include <linux/freezer.h>
23#include <linux/fs_struct.h> 22#include <linux/fs_struct.h>
24#include <linux/kthread.h> 23#include <linux/kthread.h>
@@ -390,12 +389,14 @@ nfsd_svc(unsigned short port, int nrservs)
390 389
391 mutex_lock(&nfsd_mutex); 390 mutex_lock(&nfsd_mutex);
392 dprintk("nfsd: creating service\n"); 391 dprintk("nfsd: creating service\n");
393 error = -EINVAL;
394 if (nrservs <= 0) 392 if (nrservs <= 0)
395 nrservs = 0; 393 nrservs = 0;
396 if (nrservs > NFSD_MAXSERVS) 394 if (nrservs > NFSD_MAXSERVS)
397 nrservs = NFSD_MAXSERVS; 395 nrservs = NFSD_MAXSERVS;
398 396 error = 0;
397 if (nrservs == 0 && nfsd_serv == NULL)
398 goto out;
399
399 /* Readahead param cache - will no-op if it already exists */ 400 /* Readahead param cache - will no-op if it already exists */
400 error = nfsd_racache_init(2*nrservs); 401 error = nfsd_racache_init(2*nrservs);
401 if (error<0) 402 if (error<0)
@@ -413,6 +414,12 @@ nfsd_svc(unsigned short port, int nrservs)
413 goto failure; 414 goto failure;
414 415
415 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 416 error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
417 if (error == 0)
418 /* We are holding a reference to nfsd_serv which
419 * we don't want to count in the return value,
420 * so subtract 1
421 */
422 error = nfsd_serv->sv_nrthreads - 1;
416 failure: 423 failure:
417 svc_destroy(nfsd_serv); /* Release server */ 424 svc_destroy(nfsd_serv); /* Release server */
418 out: 425 out:
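
With this hunk nfsd_svc() stops returning 0 on success and instead returns the resulting thread count, minus the one reference the caller itself holds on nfsd_serv; that is why write_svc() and write_threads() above now treat only negative values as errors. A minimal model of the new caller convention (fake_nfsd_svc() is invented for the sketch):

    #include <stdio.h>

    static int fake_nfsd_svc(int port, int nrservs)
    {
            (void)port;
            return nrservs;    /* >= 0: running threads; < 0: -errno */
    }

    int main(void)
    {
            int err = fake_nfsd_svc(2049, 8);

            if (err < 0)
                    return 1;            /* write_svc() returns err here */
            printf("%d threads\n", err); /* write_svc() returns 0 here */
            return 0;
    }
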
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b660435978d2..23341c1063bc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
55#include <linux/security.h> 55#include <linux/security.h>
56#endif /* CONFIG_NFSD_V4 */ 56#endif /* CONFIG_NFSD_V4 */
57#include <linux/jhash.h> 57#include <linux/jhash.h>
58#include <linux/ima.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60 61
@@ -100,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
100{ 101{
101 struct svc_export *exp = *expp, *exp2 = NULL; 102 struct svc_export *exp = *expp, *exp2 = NULL;
102 struct dentry *dentry = *dpp; 103 struct dentry *dentry = *dpp;
103 struct vfsmount *mnt = mntget(exp->ex_path.mnt); 104 struct path path = {.mnt = mntget(exp->ex_path.mnt),
104 struct dentry *mounts = dget(dentry); 105 .dentry = dget(dentry)};
105 int err = 0; 106 int err = 0;
106 107
107 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 108 while (d_mountpoint(path.dentry) && follow_down(&path))
109 ;
108 110
109 exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); 111 exp2 = rqst_exp_get_by_name(rqstp, &path);
110 if (IS_ERR(exp2)) { 112 if (IS_ERR(exp2)) {
111 if (PTR_ERR(exp2) != -ENOENT) 113 if (PTR_ERR(exp2) != -ENOENT)
112 err = PTR_ERR(exp2); 114 err = PTR_ERR(exp2);
113 dput(mounts); 115 path_put(&path);
114 mntput(mnt);
115 goto out; 116 goto out;
116 } 117 }
117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 118 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
118 /* successfully crossed mount point */ 119 /* successfully crossed mount point */
119 /* 120 /*
120 * This is subtle: dentry is *not* under mnt at this point. 121 * This is subtle: path.dentry is *not* on path.mnt
121 * The only reason we are safe is that original mnt is pinned 122 * at this point. The only reason we are safe is that
122 * down by exp, so we should dput before putting exp. 123 * original mnt is pinned down by exp, so we should
124 * put path *before* putting exp
123 */ 125 */
124 dput(dentry); 126 *dpp = path.dentry;
125 *dpp = mounts; 127 path.dentry = dentry;
126 exp_put(exp);
127 *expp = exp2; 128 *expp = exp2;
128 } else { 129 exp2 = exp;
129 exp_put(exp2);
130 dput(mounts);
131 } 130 }
132 mntput(mnt); 131 path_put(&path);
132 exp_put(exp2);
133out: 133out:
134 return err; 134 return err;
135} 135}
@@ -168,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
168 /* checking mountpoint crossing is very different when stepping up */ 168 /* checking mountpoint crossing is very different when stepping up */
169 struct svc_export *exp2 = NULL; 169 struct svc_export *exp2 = NULL;
170 struct dentry *dp; 170 struct dentry *dp;
171 struct vfsmount *mnt = mntget(exp->ex_path.mnt); 171 struct path path = {.mnt = mntget(exp->ex_path.mnt),
172 dentry = dget(dparent); 172 .dentry = dget(dparent)};
173 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) 173
174 while (path.dentry == path.mnt->mnt_root &&
175 follow_up(&path))
174 ; 176 ;
175 dp = dget_parent(dentry); 177 dp = dget_parent(path.dentry);
176 dput(dentry); 178 dput(path.dentry);
177 dentry = dp; 179 path.dentry = dp;
178 180
179 exp2 = rqst_exp_parent(rqstp, mnt, dentry); 181 exp2 = rqst_exp_parent(rqstp, &path);
180 if (PTR_ERR(exp2) == -ENOENT) { 182 if (PTR_ERR(exp2) == -ENOENT) {
181 dput(dentry);
182 dentry = dget(dparent); 183 dentry = dget(dparent);
183 } else if (IS_ERR(exp2)) { 184 } else if (IS_ERR(exp2)) {
184 host_err = PTR_ERR(exp2); 185 host_err = PTR_ERR(exp2);
185 dput(dentry); 186 path_put(&path);
186 mntput(mnt);
187 goto out_nfserr; 187 goto out_nfserr;
188 } else { 188 } else {
189 dentry = dget(path.dentry);
189 exp_put(exp); 190 exp_put(exp);
190 exp = exp2; 191 exp = exp2;
191 } 192 }
192 mntput(mnt); 193 path_put(&path);
193 } 194 }
194 } else { 195 } else {
195 fh_lock(fhp); 196 fh_lock(fhp);
@@ -677,7 +678,6 @@ __be32
677nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 678nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
678 int access, struct file **filp) 679 int access, struct file **filp)
679{ 680{
680 const struct cred *cred = current_cred();
681 struct dentry *dentry; 681 struct dentry *dentry;
682 struct inode *inode; 682 struct inode *inode;
683 int flags = O_RDONLY|O_LARGEFILE; 683 int flags = O_RDONLY|O_LARGEFILE;
@@ -732,9 +732,11 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
732 vfs_dq_init(inode); 732 vfs_dq_init(inode);
733 } 733 }
734 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 734 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
735 flags, cred); 735 flags, current_cred());
736 if (IS_ERR(*filp)) 736 if (IS_ERR(*filp))
737 host_err = PTR_ERR(*filp); 737 host_err = PTR_ERR(*filp);
738 else
739 ima_counts_get(*filp);
738out_nfserr: 740out_nfserr:
739 err = nfserrno(host_err); 741 err = nfserrno(host_err);
740out: 742out:
@@ -963,6 +965,43 @@ static void kill_suid(struct dentry *dentry)
963 mutex_unlock(&dentry->d_inode->i_mutex); 965 mutex_unlock(&dentry->d_inode->i_mutex);
964} 966}
965 967
968/*
969 * Gathered writes: If another process is currently writing to the file,
970 * there's a high chance this is another nfsd (triggered by a bulk write
971 * from a client's biod). Rather than syncing the file with each write
972 * request, we sleep for 10 msec.
973 *
974 * I don't know if this roughly approximates C. Juszak's idea of
975 * gathered writes, but it's a nice and simple solution (IMHO), and it
976 * seems to work:-)
977 *
978 * Note: we do this only in the NFSv2 case, since v3 and higher have a
979 * better tool (separate unstable writes and commits) for solving this
980 * problem.
981 */
982static int wait_for_concurrent_writes(struct file *file)
983{
984 struct inode *inode = file->f_path.dentry->d_inode;
985 static ino_t last_ino;
986 static dev_t last_dev;
987 int err = 0;
988
989 if (atomic_read(&inode->i_writecount) > 1
990 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
991 dprintk("nfsd: write defer %d\n", task_pid_nr(current));
992 msleep(10);
993 dprintk("nfsd: write resume %d\n", task_pid_nr(current));
994 }
995
996 if (inode->i_state & I_DIRTY) {
997 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
998 err = nfsd_sync(file);
999 }
1000 last_ino = inode->i_ino;
1001 last_dev = inode->i_sb->s_dev;
1002 return err;
1003}
1004
966static __be32 1005static __be32
967nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1006nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
968 loff_t offset, struct kvec *vec, int vlen, 1007 loff_t offset, struct kvec *vec, int vlen,
@@ -975,6 +1014,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
975 __be32 err = 0; 1014 __be32 err = 0;
976 int host_err; 1015 int host_err;
977 int stable = *stablep; 1016 int stable = *stablep;
1017 int use_wgather;
978 1018
979#ifdef MSNFS 1019#ifdef MSNFS
980 err = nfserr_perm; 1020 err = nfserr_perm;
@@ -993,9 +1033,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
993 * - the sync export option has been set, or 1033 * - the sync export option has been set, or
994 * - the client requested O_SYNC behavior (NFSv3 feature). 1034 * - the client requested O_SYNC behavior (NFSv3 feature).
995 * - The file system doesn't support fsync(). 1035 * - The file system doesn't support fsync().
996 * When gathered writes have been configured for this volume, 1036 * When NFSv2 gathered writes have been configured for this volume,
997 * flushing the data to disk is handled separately below. 1037 * flushing the data to disk is handled separately below.
998 */ 1038 */
1039 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
999 1040
1000 if (!file->f_op->fsync) {/* COMMIT3 cannot work */ 1041 if (!file->f_op->fsync) {/* COMMIT3 cannot work */
1001 stable = 2; 1042 stable = 2;
@@ -1004,7 +1045,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1004 1045
1005 if (!EX_ISSYNC(exp)) 1046 if (!EX_ISSYNC(exp))
1006 stable = 0; 1047 stable = 0;
1007 if (stable && !EX_WGATHER(exp)) { 1048 if (stable && !use_wgather) {
1008 spin_lock(&file->f_lock); 1049 spin_lock(&file->f_lock);
1009 file->f_flags |= O_SYNC; 1050 file->f_flags |= O_SYNC;
1010 spin_unlock(&file->f_lock); 1051 spin_unlock(&file->f_lock);
@@ -1014,52 +1055,20 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1014 oldfs = get_fs(); set_fs(KERNEL_DS); 1055 oldfs = get_fs(); set_fs(KERNEL_DS);
1015 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1056 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1016 set_fs(oldfs); 1057 set_fs(oldfs);
1017 if (host_err >= 0) { 1058 if (host_err < 0)
1018 *cnt = host_err; 1059 goto out_nfserr;
1019 nfsdstats.io_write += host_err; 1060 *cnt = host_err;
1020 fsnotify_modify(file->f_path.dentry); 1061 nfsdstats.io_write += host_err;
1021 } 1062 fsnotify_modify(file->f_path.dentry);
1022 1063
1023 /* clear setuid/setgid flag after write */ 1064 /* clear setuid/setgid flag after write */
1024 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) 1065 if (inode->i_mode & (S_ISUID | S_ISGID))
1025 kill_suid(dentry); 1066 kill_suid(dentry);
1026 1067
1027 if (host_err >= 0 && stable) { 1068 if (stable && use_wgather)
1028 static ino_t last_ino; 1069 host_err = wait_for_concurrent_writes(file);
1029 static dev_t last_dev;
1030
1031 /*
1032 * Gathered writes: If another process is currently
1033 * writing to the file, there's a high chance
1034 * this is another nfsd (triggered by a bulk write
1035 * from a client's biod). Rather than syncing the
1036 * file with each write request, we sleep for 10 msec.
1037 *
1038 * I don't know if this roughly approximates
1039 * C. Juszak's idea of gathered writes, but it's a
1040 * nice and simple solution (IMHO), and it seems to
1041 * work:-)
1042 */
1043 if (EX_WGATHER(exp)) {
1044 if (atomic_read(&inode->i_writecount) > 1
1045 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
1046 dprintk("nfsd: write defer %d\n", task_pid_nr(current));
1047 msleep(10);
1048 dprintk("nfsd: write resume %d\n", task_pid_nr(current));
1049 }
1050
1051 if (inode->i_state & I_DIRTY) {
1052 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1053 host_err=nfsd_sync(file);
1054 }
1055#if 0
1056 wake_up(&inode->i_wait);
1057#endif
1058 }
1059 last_ino = inode->i_ino;
1060 last_dev = inode->i_sb->s_dev;
1061 }
1062 1070
1071out_nfserr:
1063 dprintk("nfsd: write complete host_err=%d\n", host_err); 1072 dprintk("nfsd: write complete host_err=%d\n", host_err);
1064 if (host_err >= 0) 1073 if (host_err >= 0)
1065 err = 0; 1074 err = 0;
@@ -2024,6 +2033,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2024 struct dentry *dentry, int acc) 2033 struct dentry *dentry, int acc)
2025{ 2034{
2026 struct inode *inode = dentry->d_inode; 2035 struct inode *inode = dentry->d_inode;
2036 struct path path;
2027 int err; 2037 int err;
2028 2038
2029 if (acc == NFSD_MAY_NOP) 2039 if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2106,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2096 if (err == -EACCES && S_ISREG(inode->i_mode) && 2106 if (err == -EACCES && S_ISREG(inode->i_mode) &&
2097 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) 2107 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
2098 err = inode_permission(inode, MAY_EXEC); 2108 err = inode_permission(inode, MAY_EXEC);
2109 if (err)
2110 goto nfsd_out;
2099 2111
2112 /* Do integrity (permission) checking now, but defer incrementing
2113 * IMA counts to the actual file open.
2114 */
2115 path.mnt = exp->ex_path.mnt;
2116 path.dentry = dentry;
2117 err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
2118 IMA_COUNT_LEAVE);
2119nfsd_out:
2100 return err? nfserrno(err) : 0; 2120 return err? nfserrno(err) : 0;
2101} 2121}
2102 2122
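
The vfs.c hunks bundle each vfsmount/dentry pair into a struct path so that a single path_put() releases both references in the correct order, and they hoist the NFSv2 write-gathering heuristic into wait_for_concurrent_writes(). The pairing idea, modelled with toy refcounts (struct pair and the get/put helpers are invented for the sketch):

    #include <stdio.h>

    struct ref { int count; };

    struct pair {                    /* stands in for struct path */
            struct ref *mnt;
            struct ref *dentry;
    };

    static void get(struct ref *r) { r->count++; }
    static void put(struct ref *r) { r->count--; }

    static void pair_put(struct pair *p)  /* like path_put() */
    {
            put(p->dentry);          /* dentry dropped before mount */
            put(p->mnt);
    }

    int main(void)
    {
            struct ref m = { 1 }, d = { 1 };
            struct pair p = { &m, &d };

            get(p.mnt);              /* mntget() at setup */
            get(p.dentry);           /* dget() at setup */
            pair_put(&p);            /* one call drops both references */
            printf("%d %d\n", m.count, d.count); /* back to 1 1 */
            return 0;
    }
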
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
new file mode 100644
index 000000000000..72da095d4009
--- /dev/null
+++ b/fs/nilfs2/Kconfig
@@ -0,0 +1,25 @@
1config NILFS2_FS
2 tristate "NILFS2 file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select CRC32
5 help
6 NILFS2 is a log-structured file system (LFS) supporting continuous
7 snapshotting. In addition to versioning capability of the entire
8 file system, users can even restore files mistakenly overwritten or
9 destroyed just a few seconds ago. Since this file system can keep
10 consistency like conventional LFS, it achieves quick recovery after
11 system crashes.
12
13 NILFS2 creates a number of checkpoints every few seconds or on a
14 per-synchronous-write basis (unless there is no change). Users can
15 select significant versions among continuously created checkpoints,
16 and can change them into snapshots which will be preserved for long
17 periods until they are changed back to checkpoints. Each
18 snapshot is mountable as a read-only file system concurrently with
19 its writable mount, and this feature is convenient for online backup.
20
21 Some features including atime, extended attributes, and POSIX ACLs,
22 are not supported yet.
23
24 To compile this file system support as a module, choose M here: the
25 module will be called nilfs2. If unsure, say N.
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 064279e33bbb..99d58a028b94 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -31,21 +31,26 @@
31#include "dat.h" 31#include "dat.h"
32#include "alloc.h" 32#include "alloc.h"
33 33
34struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
35{
36 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
37}
38
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, 39int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp) 40 __u64 *ptrp)
36{ 41{
37 __u64 ptr; 42 sector_t blocknr;
38 int ret; 43 int ret;
39 44
40 down_read(&bmap->b_sem); 45 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 46 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0) 47 if (ret < 0)
43 goto out; 48 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) { 49 if (NILFS_BMAP_USE_VBN(bmap)) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); 50 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
46 if (ret < 0) 51 &blocknr);
47 goto out; 52 if (!ret)
48 *ptrp = ptr; 53 *ptrp = blocknr;
49 } 54 }
50 55
51 out: 56 out:
@@ -53,6 +58,16 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
53 return ret; 58 return ret;
54} 59}
55 60
61int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
62 unsigned maxblocks)
63{
64 int ret;
65
66 down_read(&bmap->b_sem);
67 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
68 up_read(&bmap->b_sem);
69 return ret;
70}
56 71
57/** 72/**
58 * nilfs_bmap_lookup - find a record 73 * nilfs_bmap_lookup - find a record
@@ -101,8 +116,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
101 if (n < 0) 116 if (n < 0)
102 return n; 117 return n;
103 ret = nilfs_btree_convert_and_insert( 118 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n, 119 bmap, key, ptr, keys, ptrs, n);
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0) 120 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE; 121 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108 122
@@ -158,8 +172,7 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
158 if (n < 0) 172 if (n < 0)
159 return n; 173 return n;
160 ret = nilfs_direct_delete_and_convert( 174 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n, 175 bmap, key, keys, ptrs, n);
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0) 176 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; 177 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165 178
@@ -417,38 +430,6 @@ void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
417 mark_inode_dirty(bmap->b_inode); 430 mark_inode_dirty(bmap->b_inode);
418} 431}
419 432
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, 433__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh) 434 const struct buffer_head *bh)
454{ 435{
@@ -476,11 +457,6 @@ __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
476 return NILFS_BMAP_INVALID_PTR; 457 return NILFS_BMAP_INVALID_PTR;
477} 458}
478 459
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
484#define NILFS_BMAP_GROUP_DIV 8 460#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) 461__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{ 462{
@@ -493,64 +469,51 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
493 (entries_per_group / NILFS_BMAP_GROUP_DIV); 469 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494} 470}
495 471
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, 472int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req) 473 union nilfs_bmap_ptr_req *req)
498{ 474{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); 475 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500} 476}
501 477
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, 478void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req) 479 union nilfs_bmap_ptr_req *req)
504{ 480{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); 481 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506} 482}
507 483
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, 484void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req) 485 union nilfs_bmap_ptr_req *req)
510{ 486{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); 487 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512} 488}
513 489
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, 490int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
515 union nilfs_bmap_ptr_req *req) 491 sector_t blocknr)
516{ 492{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); 493 struct inode *dat = nilfs_bmap_get_dat(bmap);
518} 494 int ret;
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527 495
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, 496 ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
529 union nilfs_bmap_ptr_req *req) 497 if (likely(!ret))
530{ 498 nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); 499 return ret;
532} 500}
533 501
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, 502int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req) 503 union nilfs_bmap_ptr_req *req)
536{ 504{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); 505 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538} 506}
539 507
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, 508void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req) 509 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{ 510{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); 511 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
512 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
550} 513}
551 514
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, 515void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req) 516 union nilfs_bmap_ptr_req *req)
554{ 517{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); 518 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556} 519}
@@ -566,129 +529,46 @@ int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); 529 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567} 530}
568 531
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, 532int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq, 533 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq) 534 union nilfs_bmap_ptr_req *newreq)
572{ 535{
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 	int ret;
 
-	ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
+	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
 	if (ret < 0)
 		return ret;
-	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
+	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
 	if (ret < 0)
-		bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
 
 	return ret;
 }
 
-void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
 			      union nilfs_bmap_ptr_req *oldreq,
 			      union nilfs_bmap_ptr_req *newreq)
 {
-	bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
-	bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 
-void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *oldreq,
-			     union nilfs_bmap_ptr_req *newreq)
-{
-	bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
+	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
+			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
+	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
 }
 
-static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
-				  __u64 *ptrp)
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
+			       union nilfs_bmap_ptr_req *oldreq,
+			       union nilfs_bmap_ptr_req *newreq)
 {
-	sector_t blocknr;
-	int ret;
-
-	ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
-	if (ret < 0)
-		return ret;
-	if (ptrp != NULL)
-		*ptrp = blocknr;
-	return 0;
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 
-static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
-{
-	/* ignore target ptr */
-	req->bpr_ptr = bmap->b_last_allocated_ptr++;
-	return 0;
+	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
+	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
 }
 
-static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
-{
-	/* do nothing */
-}
-
-static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
-				     union nilfs_bmap_ptr_req *req)
-{
-	bmap->b_last_allocated_ptr--;
-}
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
-	.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
-	.bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
-	.bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
-	.bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
-	.bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
-	.bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
-	.bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
-	.bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
-	.bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
-
-	.bpop_translate = nilfs_bmap_translate_v,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
-	.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
-	.bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
-	.bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
-	.bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
-	.bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
-	.bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
-	.bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
-	.bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
-	.bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
-
-	.bpop_translate = nilfs_bmap_translate_v,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
-	.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
-	.bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
-	.bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
-	.bpop_prepare_start_ptr = NULL,
-	.bpop_commit_start_ptr = NULL,
-	.bpop_abort_start_ptr = NULL,
-	.bpop_prepare_end_ptr = NULL,
-	.bpop_commit_end_ptr = NULL,
-	.bpop_abort_end_ptr = NULL,
-
-	.bpop_translate = NULL,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
-	.bpop_prepare_alloc_ptr = NULL,
-	.bpop_commit_alloc_ptr = NULL,
-	.bpop_abort_alloc_ptr = NULL,
-	.bpop_prepare_start_ptr = NULL,
-	.bpop_commit_start_ptr = NULL,
-	.bpop_abort_start_ptr = NULL,
-	.bpop_prepare_end_ptr = NULL,
-	.bpop_commit_end_ptr = NULL,
-	.bpop_abort_end_ptr = NULL,
-
-	.bpop_translate = NULL,
-};
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
+static struct lock_class_key nilfs_bmap_mdt_lock_key;
 
 /**
  * nilfs_bmap_read - read a bmap from an inode
@@ -714,31 +594,30 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
 	switch (bmap->b_inode->i_ino) {
 	case NILFS_DAT_INO:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_p;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_P;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
 		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 		break;
 	case NILFS_CPFILE_INO:
 	case NILFS_SUFILE_INO:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VS;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
 		break;
+	case NILFS_IFILE_INO:
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
+		/* Fall through */
 	default:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_v;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 		break;
 	}
 
 	return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
-		nilfs_btree_init(bmap,
-				 NILFS_BMAP_LARGE_LOW,
-				 NILFS_BMAP_LARGE_HIGH) :
-		nilfs_direct_init(bmap,
-				  NILFS_BMAP_SMALL_LOW,
-				  NILFS_BMAP_SMALL_HIGH);
+		nilfs_btree_init(bmap) : nilfs_direct_init(bmap);
 }
 
 /**
@@ -764,7 +643,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 	memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
 	init_rwsem(&bmap->b_sem);
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
-	bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+	bmap->b_ptr_type = NILFS_BMAP_PTR_U;
 	bmap->b_last_allocated_key = 0;
 	bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 	bmap->b_state = 0;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 4f2708abb1ba..b2890cdcef12 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -64,6 +64,8 @@ struct nilfs_bmap_stats {
  */
 struct nilfs_bmap_operations {
 	int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+	int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
+				 unsigned);
 	int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
 	int (*bop_delete)(struct nilfs_bmap *, __u64);
 	void (*bop_clear)(struct nilfs_bmap *);
@@ -86,34 +88,6 @@ struct nilfs_bmap_operations {
 };
 
 
-/**
- * struct nilfs_bmap_ptr_operations - bmap ptr operation table
- */
-struct nilfs_bmap_ptr_operations {
-	int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
-				     union nilfs_bmap_ptr_req *);
-	int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *,
-				      sector_t);
-	void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
-				     union nilfs_bmap_ptr_req *);
-	int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
-				    union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
-				    union nilfs_bmap_ptr_req *);
-	void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
-				   union nilfs_bmap_ptr_req *);
-
-	int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
-};
-
-
 #define NILFS_BMAP_SIZE		(NILFS_INODE_BMAP_SIZE * sizeof(__le64))
 #define NILFS_BMAP_KEY_BIT	(sizeof(unsigned long) * 8 /* CHAR_BIT */)
 #define NILFS_BMAP_NEW_PTR_INIT	\
@@ -131,11 +105,9 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
  * @b_sem: semaphore
  * @b_inode: owner of bmap
  * @b_ops: bmap operation table
- * @b_pops: bmap ptr operation table
- * @b_low: low watermark of conversion
- * @b_high: high watermark of conversion
  * @b_last_allocated_key: last allocated key for data block
 * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_ptr_type: pointer type
  * @b_state: state
  */
 struct nilfs_bmap {
@@ -146,14 +118,22 @@ struct nilfs_bmap {
 	struct rw_semaphore b_sem;
 	struct inode *b_inode;
 	const struct nilfs_bmap_operations *b_ops;
-	const struct nilfs_bmap_ptr_operations *b_pops;
-	__u64 b_low;
-	__u64 b_high;
 	__u64 b_last_allocated_key;
 	__u64 b_last_allocated_ptr;
+	int b_ptr_type;
 	int b_state;
 };
 
+/* pointer type */
+#define NILFS_BMAP_PTR_P	0	/* physical block number (i.e. LBN) */
+#define NILFS_BMAP_PTR_VS	1	/* virtual block number (single
+					   version) */
+#define NILFS_BMAP_PTR_VM	2	/* virtual block number (has multiple
+					   versions) */
+#define NILFS_BMAP_PTR_U	(-1)	/* never perform pointer operations */
+
+#define NILFS_BMAP_USE_VBN(bmap)	((bmap)->b_ptr_type > 0)
+
 /* state */
 #define NILFS_BMAP_DIRTY	0x00000001
 
@@ -162,6 +142,7 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
 int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
@@ -182,7 +163,67 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 /*
  * Internal use only
  */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
+			      union nilfs_bmap_ptr_req *);
 
+static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	/* ignore target ptr */
+	req->bpr_ptr = bmap->b_last_allocated_ptr++;
+	return 0;
+}
+
+static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_commit_alloc_v(bmap, req);
+}
+
+static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
+					      union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_abort_alloc_v(bmap, req);
+	else
+		bmap->b_last_allocated_ptr--;
+}
+
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+
+static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req)
+{
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+}
+
+static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_commit_end_v(bmap, req);
+}
+
+static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
+					    union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_abort_end_v(bmap, req);
+}
+
+int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
+		       sector_t);
 int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
 int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
 
@@ -193,28 +234,20 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update(struct nilfs_bmap *,
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
 			      union nilfs_bmap_ptr_req *,
 			      union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update(struct nilfs_bmap *,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
 			      union nilfs_bmap_ptr_req *,
 			      union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update(struct nilfs_bmap *,
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
 			     union nilfs_bmap_ptr_req *,
 			     union nilfs_bmap_ptr_req *);
 
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
 
-int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
-			 struct buffer_head **);
-void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
-			     struct buffer_head **);
-void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
-
-
 /* Assume that bmap semaphore is locked. */
 static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
 {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cc07b2c30e0..7e0b61be212e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,15 +46,18 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
 	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
 
-static struct address_space_operations def_btnode_aops;
+static struct address_space_operations def_btnode_aops = {
+	.sync_page = block_sync_page,
+};
 
-void nilfs_btnode_cache_init(struct address_space *btnc)
+void nilfs_btnode_cache_init(struct address_space *btnc,
+			     struct backing_dev_info *bdi)
 {
 	btnc->host = NULL;  /* can safely set to host inode ? */
 	btnc->flags = 0;
 	mapping_set_gfp_mask(btnc, GFP_NOFS);
 	btnc->assoc_mapping = NULL;
-	btnc->backing_dev_info = &default_backing_dev_info;
+	btnc->backing_dev_info = bdi;
 	btnc->a_ops = &def_btnode_aops;
 }
 
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 35faa86444a7..3e2275172ed6 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -38,7 +38,7 @@ struct nilfs_btnode_chkey_ctxt {
 };
 
 void nilfs_btnode_cache_init_once(struct address_space *);
-void nilfs_btnode_cache_init(struct address_space *);
+void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
 			      struct buffer_head **, int);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 6b37a2767293..aa412724b64e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -29,6 +29,7 @@
 #include "btnode.h"
 #include "btree.h"
 #include "alloc.h"
+#include "dat.h"
 
 /**
  * struct nilfs_btree_path - A path on which B-tree operations are executed
@@ -109,8 +110,7 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
 	     level < NILFS_BTREE_LEVEL_MAX;
 	     level++) {
 		if (path[level].bp_bh != NULL) {
-			nilfs_bmap_put_block(&btree->bt_bmap,
-					     path[level].bp_bh);
+			brelse(path[level].bp_bh);
 			path[level].bp_bh = NULL;
 		}
 		/* sib_bh is released or deleted by prepare or commit
@@ -123,10 +123,29 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
 	}
 }
 
-
 /*
  * B-tree node operations
  */
+static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
+				 struct buffer_head **bhp)
+{
+	struct address_space *btnc =
+		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+}
+
+static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
+				     __u64 ptr, struct buffer_head **bhp)
+{
+	struct address_space *btnc =
+		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	int ret;
+
+	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
+	if (!ret)
+		set_buffer_nilfs_volatile(*bhp);
+	return ret;
+}
 
 static inline int
 nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
@@ -488,8 +507,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	path[level].bp_index = index;
 
 	for (level--; level >= minlevel; level--) {
-		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
-					   &path[level].bp_bh);
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -535,8 +553,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	path[level].bp_index = index;
 
 	for (level--; level > 0; level--) {
-		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
-					   &path[level].bp_bh);
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -579,6 +596,87 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	return ret;
 }
 
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+				     __u64 key, __u64 *ptrp, unsigned maxblocks)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int level = NILFS_BTREE_LEVEL_NODE_MIN;
+	int ret, cnt, index, maxlevel;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+	if (ret < 0)
+		goto out;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		dat = nilfs_bmap_get_dat(bmap);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			goto out;
+		ptr = blocknr;
+	}
+	cnt = 1;
+	if (cnt == maxblocks)
+		goto end;
+
+	maxlevel = nilfs_btree_height(btree) - 1;
+	node = nilfs_btree_get_node(btree, path, level);
+	index = path[level].bp_index + 1;
+	for (;;) {
+		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
+			if (nilfs_btree_node_get_key(btree, node, index) !=
+			    key + cnt)
+				goto end;
+			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+			if (dat) {
+				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+				if (ret < 0)
+					goto out;
+				ptr2 = blocknr;
+			}
+			if (ptr2 != ptr + cnt || ++cnt == maxblocks)
+				goto end;
+			index++;
+			continue;
+		}
+		if (level == maxlevel)
+			break;
+
+		/* look-up right sibling node */
+		node = nilfs_btree_get_node(btree, path, level + 1);
+		index = path[level + 1].bp_index + 1;
+		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
+		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+			break;
+		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+		path[level + 1].bp_index = index;
+
+		brelse(path[level].bp_bh);
+		path[level].bp_bh = NULL;
+		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+		if (ret < 0)
+			goto out;
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		index = 0;
+		path[level].bp_index = index;
+	}
+ end:
+	*ptrp = ptr;
+	ret = cnt;
+ out:
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+	return ret;
+}
+
 static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 key)
@@ -669,13 +767,13 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 			nilfs_btree_node_get_key(btree, node, 0));
 
 	if (move) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index += lnchildren;
 		path[level + 1].bp_index--;
 	} else {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index -= n;
 	}
@@ -722,14 +820,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	path[level + 1].bp_index--;
 
 	if (move) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index -=
 			nilfs_btree_node_get_nchildren(btree, node);
 		path[level + 1].bp_index++;
 	} else {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 	}
 
@@ -781,7 +879,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		*keyp = nilfs_btree_node_get_key(btree, right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 	} else {
@@ -790,7 +888,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		*keyp = nilfs_btree_node_get_key(btree, right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 	}
 
@@ -897,12 +995,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (btree->bt_ops->btop_find_target != NULL)
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
 		path[level].bp_newreq.bpr_ptr =
-			btree->bt_ops->btop_find_target(btree, path, key);
+			nilfs_btree_find_target_v(btree, path, key);
 
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_data;
 
@@ -924,8 +1022,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		if (pindex > 0) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex - 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -936,7 +1033,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 				stats->bs_nblocks++;
 				goto out;
 			} else
-				nilfs_bmap_put_block(&btree->bt_bmap, bh);
+				brelse(bh);
 		}
 
 		/* right sibling */
@@ -944,8 +1041,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -956,19 +1052,19 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 			stats->bs_nblocks++;
 			goto out;
 		} else
-			nilfs_bmap_put_block(&btree->bt_bmap, bh);
+			brelse(bh);
 	}
 
 	/* split */
 	path[level].bp_newreq.bpr_ptr =
 		path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_child_node;
-	ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+	ret = nilfs_btree_get_new_block(btree,
 				       path[level].bp_newreq.bpr_ptr,
 				       &bh);
 	if (ret < 0)
 		goto err_out_curr_node;
 
@@ -994,12 +1090,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_child_node;
-	ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
-				       path[level].bp_newreq.bpr_ptr, &bh);
+	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
+					&bh);
 	if (ret < 0)
 		goto err_out_curr_node;
 
@@ -1023,18 +1119,16 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
-						    &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
 err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
-		nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
-		btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
-			&btree->bt_bmap, &path[level].bp_newreq);
+		nilfs_btnode_delete(path[level].bp_sib_bh);
+		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 
 	}
 
-	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
-						    &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
 err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1049,14 +1143,12 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (btree->bt_ops->btop_set_target != NULL)
-		btree->bt_ops->btop_set_target(btree, key, ptr);
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+		nilfs_btree_set_target_v(btree, key, ptr);
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
-			btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
-				&btree->bt_bmap, &path[level - 1].bp_newreq);
-		}
+		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
+					    &path[level - 1].bp_newreq);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1153,7 +1245,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(btree, node, 0));
 
-	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level].bp_index += n;
 }
@@ -1192,7 +1284,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 				nilfs_btree_node_get_key(btree, right, 0));
 	path[level + 1].bp_index--;
 
-	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 }
 
@@ -1221,7 +1313,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
 	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
@@ -1252,7 +1344,7 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	nilfs_btnode_delete(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level + 1].bp_index++;
 }
@@ -1276,7 +1368,7 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 	nilfs_btree_node_move_left(btree, root, child, n);
 	unlock_buffer(path[level].bp_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
 }
 
@@ -1300,12 +1392,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
-		if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-			ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
-			if (ret < 0)
-				goto err_out_child_node;
-		}
+		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+						 &path[level].bp_oldreq);
+		if (ret < 0)
+			goto err_out_child_node;
 
 		if (nilfs_btree_node_get_nchildren(btree, node) >
 		    nilfs_btree_node_nchildren_min(btree, node)) {
@@ -1321,8 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* left sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex - 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1343,8 +1432,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* right sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1381,12 +1469,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		node = nilfs_btree_get_root(btree);
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
-		if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-			ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+
+		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+						 &path[level].bp_oldreq);
 		if (ret < 0)
 			goto err_out_child_node;
-		}
+
 		/* child of the root node is deleted */
 		path[level].bp_op = nilfs_btree_do_delete;
 		stats->bs_nblocks++;
@@ -1398,15 +1486,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
-		btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-			&btree->bt_bmap, &path[level].bp_oldreq);
+	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
 err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
-		if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
-			btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+		brelse(path[level].bp_sib_bh);
+		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
+					 &path[level].bp_oldreq);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1420,9 +1505,8 @@ static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
-			btree->bt_bmap.b_pops->bpop_commit_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
+					  &path[level].bp_oldreq);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
@@ -1501,7 +1585,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 	if (nchildren > 1)
 		return 0;
 	ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-	ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0)
 		return ret;
 	node = (struct nilfs_btree_node *)bh->b_data;
@@ -1515,9 +1599,9 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 	nextmaxkey = (nchildren > 1) ?
 		nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
 	if (bh != NULL)
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
-	return (maxkey == key) && (nextmaxkey < bmap->b_low);
+	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
 
 static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
@@ -1542,7 +1626,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 	nchildren = nilfs_btree_node_get_nchildren(btree, root);
 	WARN_ON(nchildren > 1);
 	ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-	ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0)
 		return ret;
 	node = (struct nilfs_btree_node *)bh->b_data;
@@ -1563,7 +1647,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 	}
 
 	if (bh != NULL)
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
 	return nitems;
 }
@@ -1584,10 +1668,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* for data */
 	/* cannot find near ptr */
-	if (btree->bt_ops->btop_find_target != NULL)
-		dreq->bpr_ptr
-			= btree->bt_ops->btop_find_target(btree, NULL, key);
-	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
+	if (NILFS_BMAP_USE_VBN(bmap))
+		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
 	if (ret < 0)
 		return ret;
 
@@ -1595,11 +1679,11 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
 		if (ret < 0)
 			goto err_out_dreq;
 
-		ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
+		ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
 		if (ret < 0)
 			goto err_out_nreq;
 
@@ -1612,9 +1696,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, nreq);
 err_out_dreq:
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, dreq);
 	stats->bs_nblocks = 0;
 	return ret;
 
@@ -1624,7 +1708,7 @@ static void
 nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      __u64 key, __u64 ptr,
 				      const __u64 *keys, const __u64 *ptrs,
-				      int n, __u64 low, __u64 high,
+				      int n,
 				      union nilfs_bmap_ptr_req *dreq,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
@@ -1642,12 +1726,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 
 	/* convert and insert */
 	btree = (struct nilfs_btree *)bmap;
-	nilfs_btree_init(bmap, low, high);
+	nilfs_btree_init(bmap);
 	if (nreq != NULL) {
-		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
-		}
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, nreq);
 
 		/* create child node at level 1 */
 		lock_buffer(bh);
@@ -1661,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_bmap_set_dirty(bmap);
 
 		unlock_buffer(bh);
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
 		/* create root node at level 2 */
 		node = nilfs_btree_get_root(btree);
@@ -1669,8 +1751,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
 				      2, 1, &keys[0], &tmpptr);
 	} else {
-		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
@@ -1682,8 +1763,8 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_bmap_set_dirty(bmap);
 	}
 
-	if (btree->bt_ops->btop_set_target != NULL)
-		btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
 }
 
 /**
@@ -1694,13 +1775,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
  * @keys:
  * @ptrs:
  * @n:
- * @low:
- * @high:
  */
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 				   __u64 key, __u64 ptr,
-				   const __u64 *keys, const __u64 *ptrs,
-				   int n, __u64 low, __u64 high)
+				   const __u64 *keys, const __u64 *ptrs, int n)
 {
 	struct buffer_head *bh;
 	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
@@ -1725,7 +1803,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 	if (ret < 0)
 		return ret;
 	nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
-					      low, high, di, ni, bh);
+					      di, ni, bh);
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 	return 0;
 }
@@ -1754,9 +1832,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, parent,
 					 path[level + 1].bp_index);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
+	ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
 					&path[level].bp_oldreq,
 					&path[level].bp_newreq);
 	if (ret < 0)
 		return ret;
 
@@ -1768,9 +1846,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
-			nilfs_bmap_abort_update(&btree->bt_bmap,
+			nilfs_bmap_abort_update_v(&btree->bt_bmap,
 						&path[level].bp_oldreq,
 						&path[level].bp_newreq);
 			return ret;
 		}
 	}
@@ -1784,9 +1862,9 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 {
 	struct nilfs_btree_node *parent;
 
-	nilfs_bmap_commit_update(&btree->bt_bmap,
+	nilfs_bmap_commit_update_v(&btree->bt_bmap,
 				 &path[level].bp_oldreq,
 				 &path[level].bp_newreq);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
@@ -1805,9 +1883,9 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 				       struct nilfs_btree_path *path,
 				       int level)
 {
-	nilfs_bmap_abort_update(&btree->bt_bmap,
+	nilfs_bmap_abort_update_v(&btree->bt_bmap,
 				&path[level].bp_oldreq,
 				&path[level].bp_newreq);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1930,7 +2008,9 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
+	ret = NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_btree_propagate_v(btree, path, level, bh) :
+		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
 	nilfs_btree_clear_path(btree, path);
@@ -2066,12 +2146,9 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	ptr = nilfs_btree_node_get_ptr(btree, parent,
 				       path[level + 1].bp_index);
 	req.bpr_ptr = ptr;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
-							    &req);
-	if (ret < 0)
+	ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
+	if (unlikely(ret < 0))
 		return ret;
-	btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
-						     &req, blocknr);
 
 	key = nilfs_btree_node_get_key(btree, parent,
 				       path[level + 1].bp_index);
@@ -2114,8 +2191,9 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = btree->bt_ops->btop_assign(btree, path, level, bh,
-					 blocknr, binfo);
+	ret = NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
+		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
 	nilfs_btree_clear_path(btree, path);
@@ -2171,7 +2249,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		WARN_ON(ret == -ENOENT);
 		goto out;
 	}
-	ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0) {
 		WARN_ON(ret == -ENOENT);
 		goto out;
@@ -2179,7 +2257,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 
 	if (!buffer_dirty(bh))
 		nilfs_btnode_mark_dirty(bh);
-	nilfs_bmap_put_block(&btree->bt_bmap, bh);
+	brelse(bh);
 	if (!nilfs_bmap_dirty(&btree->bt_bmap))
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
@@ -2191,6 +2269,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 
 static const struct nilfs_bmap_operations nilfs_btree_ops = {
 	.bop_lookup = nilfs_btree_lookup,
+	.bop_lookup_contig = nilfs_btree_lookup_contig,
 	.bop_insert = nilfs_btree_insert,
 	.bop_delete = nilfs_btree_delete,
 	.bop_clear = NULL,
@@ -2210,6 +2289,7 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
 
 static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_lookup = NULL,
+	.bop_lookup_contig = NULL,
 	.bop_insert = NULL,
 	.bop_delete = NULL,
 	.bop_clear = NULL,
@@ -2227,43 +2307,13 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_gather_data = NULL,
 };
 
-static const struct nilfs_btree_operations nilfs_btree_ops_v = {
-	.btop_find_target = nilfs_btree_find_target_v,
-	.btop_set_target = nilfs_btree_set_target_v,
-	.btop_propagate = nilfs_btree_propagate_v,
-	.btop_assign = nilfs_btree_assign_v,
-};
-
-static const struct nilfs_btree_operations nilfs_btree_ops_p = {
-	.btop_find_target = NULL,
-	.btop_set_target = NULL,
-	.btop_propagate = nilfs_btree_propagate_p,
-	.btop_assign = nilfs_btree_assign_p,
-};
-
-int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+int nilfs_btree_init(struct nilfs_bmap *bmap)
 {
-	struct nilfs_btree *btree;
-
-	btree = (struct nilfs_btree *)bmap;
 	bmap->b_ops = &nilfs_btree_ops;
-	bmap->b_low = low;
-	bmap->b_high = high;
-	switch (bmap->b_inode->i_ino) {
-	case NILFS_DAT_INO:
-		btree->bt_ops = &nilfs_btree_ops_p;
-		break;
-	default:
-		btree->bt_ops = &nilfs_btree_ops_v;
-		break;
-	}
-
 	return 0;
 }
 
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
-	bmap->b_low = NILFS_BMAP_LARGE_LOW;
-	bmap->b_high = NILFS_BMAP_LARGE_HIGH;
 	bmap->b_ops = &nilfs_btree_ops_gc;
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4766deb52fb1..0e72bbbc6b64 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 
 /**
- * struct nilfs_btree_operations - B-tree operation table
- */
-struct nilfs_btree_operations {
-	__u64 (*btop_find_target)(const struct nilfs_btree *,
-				  const struct nilfs_btree_path *, __u64);
-	void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
-
-	struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
-
-	int (*btop_propagate)(struct nilfs_btree *,
-			      struct nilfs_btree_path *,
-			      int,
-			      struct buffer_head *);
-	int (*btop_assign)(struct nilfs_btree *,
-			   struct nilfs_btree_path *,
-			   int,
-			   struct buffer_head **,
-			   sector_t,
-			   union nilfs_binfo *);
-};
-
-/**
  * struct nilfs_btree_node - B-tree node
  * @bn_flags: flags
  * @bn_level: level
@@ -80,13 +58,9 @@ struct nilfs_btree_node {
 /**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
- * @bt_ops: B-tree operation table
  */
 struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
-
-	/* B-tree-specific members */
-	const struct nilfs_btree_operations *bt_ops;
 };
 
 
@@ -108,10 +82,9 @@ struct nilfs_btree {
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
-int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
-				   const __u64 *, const __u64 *,
-				   int, __u64, __u64);
+				   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 
 #endif	/* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 300f1cdfa862..aec942cf79e3 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -295,10 +295,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
295 return -EINVAL; 295 return -EINVAL;
296 } 296 }
297 297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem); 298 down_write(&NILFS_MDT(cpfile)->mi_sem);
303 299
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); 300 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
@@ -311,7 +307,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 		if (ret < 0) {
 			if (ret != -ENOENT)
-				goto out_header;
+				break;
 			/* skip hole */
 			ret = 0;
 			continue;
@@ -344,7 +340,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 					continue;
 				printk(KERN_ERR "%s: cannot delete block\n",
 				       __func__);
-				goto out_header;
+				break;
 			}
 		}
 
@@ -362,7 +358,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		kunmap_atomic(kaddr, KM_USER0);
 	}
 
- out_header:
 	brelse(header_bh);
 
  out_sem:
@@ -384,9 +379,10 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
 }
 
 static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
-					  struct nilfs_cpinfo *ci, size_t nci)
+					  void *buf, unsigned cisz, size_t nci)
 {
 	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
 	struct buffer_head *bh;
 	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
 	__u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
@@ -410,17 +406,22 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 		kaddr = kmap_atomic(bh->b_page, KM_USER0);
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
-			if (!nilfs_checkpoint_invalid(cp))
-				nilfs_cpfile_checkpoint_to_cpinfo(
-					cpfile, cp, &ci[n++]);
+			if (!nilfs_checkpoint_invalid(cp)) {
+				nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp,
+								  ci);
+				ci = (void *)ci + cisz;
+				n++;
+			}
 		}
 		kunmap_atomic(kaddr, KM_USER0);
 		brelse(bh);
 	}
 
 	ret = n;
-	if (n > 0)
-		*cnop = ci[n - 1].ci_cno + 1;
+	if (n > 0) {
+		ci = (void *)ci - cisz;
+		*cnop = ci->ci_cno + 1;
+	}
 
  out:
 	up_read(&NILFS_MDT(cpfile)->mi_sem);
@@ -428,11 +429,12 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 }
 
 static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
-					  struct nilfs_cpinfo *ci, size_t nci)
+					  void *buf, unsigned cisz, size_t nci)
 {
 	struct buffer_head *bh;
 	struct nilfs_cpfile_header *header;
 	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
 	__u64 curr = *cnop, next;
 	unsigned long curr_blkoff, next_blkoff;
 	void *kaddr;
@@ -472,7 +474,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 			if (unlikely(nilfs_checkpoint_invalid(cp) ||
 				     !nilfs_checkpoint_snapshot(cp)))
 				break;
-			nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
+			nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
+			ci = (void *)ci + cisz;
+			n++;
 			next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
 			if (next == 0)
 				break; /* reach end of the snapshot list */
@@ -511,13 +515,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
  */
 
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
-				struct nilfs_cpinfo *ci, size_t nci)
+				void *buf, unsigned cisz, size_t nci)
 {
 	switch (mode) {
 	case NILFS_CHECKPOINT:
-		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
+		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci);
 	case NILFS_SNAPSHOT:
-		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
+		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci);
 	default:
 		return -EINVAL;
 	}
@@ -533,20 +537,14 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
 	struct nilfs_cpinfo ci;
 	__u64 tcno = cno;
 	ssize_t nci;
-	int ret;
 
-	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
+	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1);
 	if (nci < 0)
 		return nci;
 	else if (nci == 0 || ci.ci_cno != cno)
 		return -ENOENT;
-
-	/* cannot delete the latest checkpoint nor snapshots */
-	ret = nilfs_cpinfo_snapshot(&ci);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
-		return -EPERM;
+	else if (nilfs_cpinfo_snapshot(&ci))
+		return -EBUSY;
 
 	return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
 }
@@ -864,11 +862,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 	case NILFS_CHECKPOINT:
 		/*
 		 * Check for protecting existing snapshot mounts:
-		 * bd_mount_sem is used to make this operation atomic and
+		 * ns_mount_mutex is used to make this operation atomic and
 		 * exclusive with a new mount job.  Though it doesn't cover
 		 * umount, it's enough for the purpose.
 		 */
-		down(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_lock(&nilfs->ns_mount_mutex);
 		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
 			/* Current implementation does not have to protect
 			   plain read-only mounts since they are exclusive
@@ -877,7 +875,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 			ret = -EBUSY;
 		} else
 			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
-		up(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_unlock(&nilfs->ns_mount_mutex);
 		return ret;
 	case NILFS_SNAPSHOT:
 		return nilfs_cpfile_set_snapshot(cpfile, cno);
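
The bd_mount_sem to ns_mount_mutex switch above keeps the same protocol: take the mount lock, test whether the checkpoint is mounted as a snapshot, and only then clear the snapshot flag. A minimal userspace sketch of that check-then-act pattern under a mutex; is_mounted() and clear_snapshot() are illustrative stand-ins, not kernel APIs:

#include <pthread.h>
#include <errno.h>

static pthread_mutex_t mount_mutex = PTHREAD_MUTEX_INITIALIZER;

static int is_mounted(unsigned long long cno) { (void)cno; return 0; }
static int clear_snapshot(unsigned long long cno) { (void)cno; return 0; }

/* The lock makes the "is it mounted?" test and the flag change one
 * atomic step with respect to a concurrent snapshot mount.
 */
static int change_to_plain_checkpoint(unsigned long long cno)
{
        int ret;

        pthread_mutex_lock(&mount_mutex);
        ret = is_mounted(cno) ? -EBUSY : clear_snapshot(cno);
        pthread_mutex_unlock(&mount_mutex);
        return ret;
}
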
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 1a8a1008c342..788a45950197 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -39,7 +39,7 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
 int nilfs_cpfile_is_snapshot(struct inode *, __u64);
 int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
-				struct nilfs_cpinfo *, size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
+				size_t);
 
 #endif	/* _NILFS_CPFILE_H */
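
The cpfile changes above all serve one mechanism: the caller now passes an opaque buffer plus its own record size (cisz), and the kernel advances through the buffer by that stride instead of indexing a fixed struct array. A compilable sketch of the strided fill, assuming the source record is no larger than the caller's stride (names are illustrative):

#include <stddef.h>
#include <string.h>

/* Fill nrec slots of recsz bytes each with a membsz-byte record.
 * recsz >= membsz lets userland grow its structs without breaking
 * the fill loop; trailing bytes of each slot are left untouched.
 */
static size_t fill_strided(void *buf, size_t recsz, size_t nrec,
                           const void *rec, size_t membsz)
{
        char *p = buf;
        size_t n;

        for (n = 0; n < nrec; n++, p += recsz)
                memcpy(p, rec, membsz);
        return n;
}
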
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index bb8a5818e7f1..8927ca27e6f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -92,21 +92,6 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_abort_alloc_entry(dat, req);
 }
 
-int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	int ret;
-
-	ret = nilfs_palloc_prepare_free_entry(dat, req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_entry(dat, req, 0);
-	if (ret < 0) {
-		nilfs_palloc_abort_free_entry(dat, req);
-		return ret;
-	}
-	return 0;
-}
-
 void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
@@ -149,15 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
-	if (entry->de_blocknr != cpu_to_le64(0) ||
-	    entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
-		printk(KERN_CRIT
-		       "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
-		       __func__, (unsigned long long)req->pr_entry_nr,
-		       (unsigned long long)le64_to_cpu(entry->de_start),
-		       (unsigned long long)le64_to_cpu(entry->de_end),
-		       (unsigned long long)le64_to_cpu(entry->de_blocknr));
-	}
 	entry->de_blocknr = cpu_to_le64(blocknr);
 	kunmap_atomic(kaddr, KM_USER0);
 
@@ -391,36 +367,37 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	return ret;
 }
 
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 			    size_t nvi)
 {
 	struct buffer_head *entry_bh;
 	struct nilfs_dat_entry *entry;
+	struct nilfs_vinfo *vinfo = buf;
 	__u64 first, last;
 	void *kaddr;
 	unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
 	int i, j, n, ret;
 
 	for (i = 0; i < nvi; i += n) {
-		ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
+		ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr,
 						   0, &entry_bh);
 		if (ret < 0)
 			return ret;
 		kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 		/* last virtual block number in this block */
-		first = vinfo[i].vi_vblocknr;
+		first = vinfo->vi_vblocknr;
 		do_div(first, entries_per_block);
 		first *= entries_per_block;
 		last = first + entries_per_block - 1;
 		for (j = i, n = 0;
-		     j < nvi && vinfo[j].vi_vblocknr >= first &&
-			     vinfo[j].vi_vblocknr <= last;
-		     j++, n++) {
+		     j < nvi && vinfo->vi_vblocknr >= first &&
+			     vinfo->vi_vblocknr <= last;
+		     j++, n++, vinfo = (void *)vinfo + visz) {
 			entry = nilfs_palloc_block_get_entry(
-				dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
-			vinfo[j].vi_start = le64_to_cpu(entry->de_start);
-			vinfo[j].vi_end = le64_to_cpu(entry->de_end);
-			vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
+				dat, vinfo->vi_vblocknr, entry_bh, kaddr);
+			vinfo->vi_start = le64_to_cpu(entry->de_start);
+			vinfo->vi_end = le64_to_cpu(entry->de_end);
+			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
 		}
 		kunmap_atomic(kaddr, KM_USER0);
 		brelse(entry_bh);
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d9560654a4b7..d328b81eead4 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -47,6 +47,6 @@ void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
 #endif	/* _NILFS_DAT_H */
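
nilfs_dat_get_vinfo() above batches requests per allocator block: it rounds the current virtual block number down to the first entry of its block, derives the last, and serves every request in [first, last] from one mapped page before moving on. The arithmetic, as a compilable sketch (entries_per_block stands in for NILFS_MDT(dat)->mi_entries_per_block):

#include <stdint.h>
#include <stdio.h>

static void entry_block_range(uint64_t vblocknr, uint64_t entries_per_block,
                              uint64_t *first, uint64_t *last)
{
        *first = vblocknr / entries_per_block * entries_per_block;
        *last = *first + entries_per_block - 1;
}

int main(void)
{
        uint64_t first, last;

        entry_block_range(130, 64, &first, &last);
        printf("entries %llu..%llu share one block\n",
               (unsigned long long)first, (unsigned long long)last);
        return 0;
}
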
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 54100acc1102..1a4fa04cf071 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -43,7 +43,6 @@
  */
 
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include "nilfs.h"
 #include "page.h"
 
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index c6379e482781..342d9765df8d 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -25,6 +25,7 @@
25#include "page.h" 25#include "page.h"
26#include "direct.h" 26#include "direct.h"
27#include "alloc.h" 27#include "alloc.h"
28#include "dat.h"
28 29
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) 30static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{ 31{
@@ -62,6 +63,47 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
 	return 0;
 }
 
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+				      __u64 key, __u64 *ptrp,
+				      unsigned maxblocks)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int ret, cnt;
+
+	if (key > NILFS_DIRECT_KEY_MAX ||
+	    (ptr = nilfs_direct_get_ptr(direct, key)) ==
+	    NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		dat = nilfs_bmap_get_dat(bmap);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			return ret;
+		ptr = blocknr;
+	}
+
+	maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+	for (cnt = 1; cnt < maxblocks &&
+	     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
+	     NILFS_BMAP_INVALID_PTR;
+	     cnt++) {
+		if (dat) {
+			ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+			if (ret < 0)
+				return ret;
+			ptr2 = blocknr;
+		}
+		if (ptr2 != ptr + cnt)
+			break;
+	}
+	*ptrp = ptr;
+	return cnt;
+}
+
 static __u64
 nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
 {
@@ -90,10 +132,9 @@ static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
 {
 	int ret;
 
-	if (direct->d_ops->dop_find_target != NULL)
-		req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
-	ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
-							    req);
+	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
+		req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
+	ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
 	if (ret < 0)
 		return ret;
 
@@ -111,16 +152,14 @@ static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
 	bh = (struct buffer_head *)((unsigned long)ptr);
 	set_buffer_nilfs_volatile(bh);
 
-	if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
-		direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
-			&direct->d_bmap, req);
+	nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
 	nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
 
 	if (!nilfs_bmap_dirty(&direct->d_bmap))
 		nilfs_bmap_set_dirty(&direct->d_bmap);
 
-	if (direct->d_ops->dop_set_target != NULL)
-		direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
+		nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -152,25 +191,18 @@ static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
 {
 	int ret;
 
-	if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-		req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-		ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
-			&direct->d_bmap, req);
-		if (ret < 0)
-			return ret;
-	}
-
-	stats->bs_nblocks = 1;
-	return 0;
+	req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
+	if (!ret)
+		stats->bs_nblocks = 1;
+	return ret;
 }
 
 static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
 				       union nilfs_bmap_ptr_req *req,
 				       __u64 key)
 {
-	if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
-		direct->d_bmap.b_pops->bpop_commit_end_ptr(
-			&direct->d_bmap, req);
+	nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
 	nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
 }
 
@@ -244,8 +276,7 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 }
 
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
-				    __u64 key, __u64 *keys, __u64 *ptrs,
-				    int n, __u64 low, __u64 high)
+				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
 	struct nilfs_direct *direct;
 	__le64 *dptrs;
@@ -275,8 +306,7 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 		dptrs[i] = NILFS_BMAP_INVALID_PTR;
 	}
 
-	nilfs_direct_init(bmap, low, high);
-
+	nilfs_direct_init(bmap);
 	return 0;
 }
 
@@ -293,11 +323,11 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.bpr_ptr = ptr;
 		newreq.bpr_ptr = ptr;
-		ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
-						&newreq);
+		ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
+						  &newreq);
 		if (ret < 0)
 			return ret;
-		nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
+		nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
 		set_buffer_nilfs_volatile(bh);
 		nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
 	} else
@@ -309,12 +339,10 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 
-	direct = (struct nilfs_direct *)bmap;
-	return (direct->d_ops->dop_propagate != NULL) ?
-		direct->d_ops->dop_propagate(direct, bh) :
-		0;
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_propagate_v(direct, bh) : 0;
 }
 
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
320static int nilfs_direct_assign_v(struct nilfs_direct *direct, 348static int nilfs_direct_assign_v(struct nilfs_direct *direct,
@@ -327,12 +355,9 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
327 int ret; 355 int ret;
328 356
329 req.bpr_ptr = ptr; 357 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( 358 ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
331 &direct->d_bmap, &req); 359 if (unlikely(ret < 0))
332 if (ret < 0)
333 return ret; 360 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336 361
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 362 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 363 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -377,12 +402,14 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 		return -EINVAL;
 	}
 
-	return direct->d_ops->dop_assign(direct, key, ptr, bh,
-					 blocknr, binfo);
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
 	.bop_lookup = nilfs_direct_lookup,
+	.bop_lookup_contig = nilfs_direct_lookup_contig,
 	.bop_insert = nilfs_direct_insert,
 	.bop_delete = nilfs_direct_delete,
 	.bop_clear = NULL,
@@ -401,36 +428,8 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
 };
 
 
-static const struct nilfs_direct_operations nilfs_direct_ops_v = {
-	.dop_find_target = nilfs_direct_find_target_v,
-	.dop_set_target = nilfs_direct_set_target_v,
-	.dop_propagate = nilfs_direct_propagate_v,
-	.dop_assign = nilfs_direct_assign_v,
-};
-
-static const struct nilfs_direct_operations nilfs_direct_ops_p = {
-	.dop_find_target = NULL,
-	.dop_set_target = NULL,
-	.dop_propagate = NULL,
-	.dop_assign = nilfs_direct_assign_p,
-};
-
-int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+int nilfs_direct_init(struct nilfs_bmap *bmap)
 {
-	struct nilfs_direct *direct;
-
-	direct = (struct nilfs_direct *)bmap;
 	bmap->b_ops = &nilfs_direct_ops;
-	bmap->b_low = low;
-	bmap->b_high = high;
-	switch (bmap->b_inode->i_ino) {
-	case NILFS_DAT_INO:
-		direct->d_ops = &nilfs_direct_ops_p;
-		break;
-	default:
-		direct->d_ops = &nilfs_direct_ops_v;
-		break;
-	}
-
 	return 0;
 }
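
The deleted nilfs_direct_ops_v/nilfs_direct_ops_p tables distinguished files addressed by virtual block number from the DAT itself, and every slot reduced to "do the v-variant or do nothing/the p-variant". A two-way predicate is enough, which is what NILFS_BMAP_USE_VBN() expresses. A compilable sketch of the shape of that simplification (all names illustrative):

#include <stdio.h>

struct bmap { int use_vbn; };

static int propagate_v(struct bmap *b) { (void)b; return 0; }

/* Before: b->ops->propagate ? b->ops->propagate(b) : 0, with two
 * nearly empty op tables selected at init time by inode number.
 * After: one branch on a per-bmap property, no tables at all.
 */
static int propagate(struct bmap *b)
{
        return b->use_vbn ? propagate_v(b) : 0;
}

int main(void)
{
        struct bmap b = { 1 };

        printf("%d\n", propagate(&b));
        return 0;
}
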
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 45d2c5cda812..a5ffd66e25d0 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -31,18 +31,6 @@
 struct nilfs_direct;
 
 /**
- * struct nilfs_direct_operations - direct mapping operation table
- */
-struct nilfs_direct_operations {
-	__u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
-	void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
-	int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
-	int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
-			  struct buffer_head **, sector_t,
-			  union nilfs_binfo *);
-};
-
-/**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
  * @dn_pad: padding
@@ -55,13 +43,9 @@ struct nilfs_direct_node {
 /**
  * struct nilfs_direct - direct mapping
  * @d_bmap: bmap structure
- * @d_ops: direct mapping operation table
  */
 struct nilfs_direct {
 	struct nilfs_bmap d_bmap;
-
-	/* direct-mapping-specific members */
-	const struct nilfs_direct_operations *d_ops;
 };
 
 
@@ -70,9 +54,9 @@ struct nilfs_direct {
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
 
 
-int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_direct_init(struct nilfs_bmap *);
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
-				    __u64 *, int, __u64, __u64);
+				    __u64 *, int);
 
 
 #endif	/* _NILFS_DIRECT_H */
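
nilfs_direct_lookup_contig(), added in the direct.c hunk above, turns the flat pointer array into extents: translate the first pointer, then count forward while each following key maps to exactly the next physical block. A self-contained sketch over a plain array, with UINT64_MAX standing in for NILFS_BMAP_INVALID_PTR and the DAT translation step omitted:

#include <stdint.h>
#include <stdio.h>

static int count_contig(const uint64_t *ptrs, unsigned nkeys,
                        unsigned key, unsigned maxblocks)
{
        uint64_t base;
        unsigned cnt;

        if (key >= nkeys || ptrs[key] == UINT64_MAX)
                return -1;                      /* -ENOENT analogue */
        base = ptrs[key];
        if (maxblocks > nkeys - key)
                maxblocks = nkeys - key;
        for (cnt = 1; cnt < maxblocks; cnt++)
                if (ptrs[key + cnt] != base + cnt)
                        break;                  /* run of blocks ends */
        return (int)cnt;                        /* extent length */
}

int main(void)
{
        uint64_t ptrs[] = { 100, 101, 102, 200, UINT64_MAX };

        printf("%d\n", count_contig(ptrs, 5, 0, 4));    /* prints 3 */
        return 0;
}
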
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 19d2102b6a69..1b3c2bb20da9 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,8 +52,9 @@
52#include "dat.h" 52#include "dat.h"
53#include "ifile.h" 53#include "ifile.h"
54 54
55static struct address_space_operations def_gcinode_aops = {}; 55static struct address_space_operations def_gcinode_aops = {
56/* XXX need def_gcinode_iops/fops? */ 56 .sync_page = block_sync_page,
57};
57 58
58/* 59/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request 60 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 49ab4a49bb4f..fe9d8f2a13f8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -43,22 +43,23 @@
  *
  * This function does not issue actual read request of the specified data
  * block. It is done by VFS.
- * Bulk read for direct-io is not supported yet. (should be supported)
  */
 int nilfs_get_block(struct inode *inode, sector_t blkoff,
 		    struct buffer_head *bh_result, int create)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	unsigned long blknum = 0;
+	__u64 blknum = 0;
 	int err = 0, ret;
 	struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
 
-	/* This exclusion control is a workaround; should be revised */
-	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
-	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	if (ret == 0) {	/* found */
+	down_read(&NILFS_MDT(dat)->mi_sem);
+	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
+	up_read(&NILFS_MDT(dat)->mi_sem);
+	if (ret >= 0) {	/* found */
 		map_bh(bh_result, inode->i_sb, blknum);
+		if (ret > 0)
+			bh_result->b_size = (ret << inode->i_blkbits);
 		goto out;
 	}
 	/* data block was not found */
@@ -240,7 +241,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 struct address_space_operations nilfs_aops = {
 	.writepage		= nilfs_writepage,
 	.readpage		= nilfs_readpage,
-	/* .sync_page		= nilfs_sync_page, */
+	.sync_page		= block_sync_page,
 	.writepages		= nilfs_writepages,
 	.set_page_dirty		= nilfs_set_page_dirty,
 	.readpages		= nilfs_readpages,
@@ -249,6 +250,7 @@ struct address_space_operations nilfs_aops = {
 	/* .releasepage		= nilfs_releasepage, */
 	.invalidatepage		= block_invalidatepage,
 	.direct_IO		= nilfs_direct_IO,
+	.is_partially_uptodate	= block_is_partially_uptodate,
 };
 
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
@@ -307,10 +309,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* ii->i_file_acl = 0; */
 	/* ii->i_dir_acl = 0; */
 	ii->i_dir_start_lookup = 0;
-#ifdef CONFIG_NILFS_FS_POSIX_ACL
-	ii->i_acl = NULL;
-	ii->i_default_acl = NULL;
-#endif
 	ii->i_cno = 0;
 	nilfs_set_inode_flags(inode);
 	spin_lock(&sbi->s_next_gen_lock);
@@ -432,10 +430,6 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 
 	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
 
-#ifdef CONFIG_NILFS_FS_POSIX_ACL
-	ii->i_acl = NILFS_ACL_NOT_CACHED;
-	ii->i_default_acl = NILFS_ACL_NOT_CACHED;
-#endif
 	if (nilfs_read_inode_common(inode, raw_inode))
 		goto failed_unmap;
 
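
With lookup_contig in place, nilfs_get_block() above can answer a multi-block request: the VFS passes the wanted byte count in bh_result->b_size, the lookup returns the contiguous block count, and the callee maps the first block and scales b_size down to what it actually found. A sketch of that contract over a stand-in buffer head (struct and helper names are illustrative):

#include <stddef.h>
#include <stdio.h>

struct bh_sketch {
        unsigned long long b_blocknr;
        size_t b_size;          /* in: bytes wanted, out: bytes mapped */
        int mapped;
};

static void report_extent(struct bh_sketch *bh, unsigned long long blknum,
                          int nblocks, unsigned blkbits)
{
        bh->b_blocknr = blknum;         /* map_bh() analogue */
        bh->mapped = 1;
        if (nblocks > 0)
                bh->b_size = (size_t)nblocks << blkbits;
}

int main(void)
{
        struct bh_sketch bh = { 0, 16384, 0 };  /* asked for 4 x 4k blocks */

        report_extent(&bh, 1234, 2, 12);        /* only 2 were contiguous */
        printf("mapped %zu bytes at block %llu\n", bh.b_size, bh.b_blocknr);
        return 0;
}
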
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6759b92006f..6ea5f872e2de 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -152,7 +152,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 
 	down_read(&nilfs->ns_segctor_sem);
 	ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
-				      nmembs);
+				      size, nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -182,7 +182,8 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 	int ret;
 
 	down_read(&nilfs->ns_segctor_sem);
-	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
+	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size,
+				      nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -212,7 +213,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 	int ret;
 
 	down_read(&nilfs->ns_segctor_sem);
-	ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
+	ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -435,24 +436,6 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
 	return nmembs;
 }
 
-static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
-				     struct nilfs_argv *argv, void *buf)
-{
-	size_t nmembs = argv->v_nmembs;
-	struct nilfs_sb_info *sbi = nilfs->ns_writer;
-	int ret;
-
-	if (unlikely(!sbi)) {
-		/* never happens because called for a writable mount */
-		WARN_ON(1);
-		return -EROFS;
-	}
-	ret = nilfs_segctor_add_segments_to_be_freed(
-		NILFS_SC(sbi), buf, nmembs);
-
-	return (ret < 0) ? ret : nmembs;
-}
-
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 				       struct nilfs_argv *argv, void **kbufs)
 {
@@ -491,14 +474,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 		msg = "cannot mark copying blocks dirty";
 		goto failed;
 	}
-	ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
-	if (ret < 0) {
-		/*
-		 * can safely abort because this operation is atomic.
-		 */
-		msg = "cannot set segments to be freed";
-		goto failed;
-	}
 	return 0;
 
  failed:
@@ -615,7 +590,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 	if (copy_from_user(&argv, argp, sizeof(argv)))
 		return -EFAULT;
 
-	if (argv.v_size != membsz)
+	if (argv.v_size < membsz)
 		return -EINVAL;
 
 	ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
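
The last ioctl hunk relaxes the argv check from an exact match to a lower bound: with the record stride now caller-supplied, userland built against a larger (newer) record layout still works, and only records too small to hold the kernel's struct are refused. The check, as a hedged sketch (membsz is the kernel-side record size, v_size the caller's claimed slot size that becomes the copy stride):

#include <errno.h>
#include <stddef.h>

static int check_record_size(size_t v_size, size_t membsz)
{
        if (v_size < membsz)
                return -EINVAL; /* caller's slots can't hold our record */
        return 0;               /* equal or larger strides are fine */
}
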
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index bb78745a0e30..2dfd47714ae5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
 		writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
-		if (!writer)
+		if (!writer) {
+			nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
 			return -EROFS;
+		}
 		sb = writer->s_super;
 	}
 
@@ -430,6 +432,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 
 static struct address_space_operations def_mdt_aops = {
 	.writepage		= nilfs_mdt_write_page,
+	.sync_page		= block_sync_page,
 };
 
 static struct inode_operations def_mdt_iops;
@@ -449,7 +452,7 @@ struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 		     ino_t ino, gfp_t gfp_mask)
 {
-	struct inode *inode = nilfs_alloc_inode(sb);
+	struct inode *inode = nilfs_alloc_inode_common(nilfs);
 
 	if (!inode)
 		return NULL;
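
The mdt.c fix above looks like a reference-count pairing bug: nilfs_get_writer() apparently takes a reference even when it has no writer to return, so the early -EROFS exit must drop it. A hedged sketch of the acquire/test/release shape, with a toy counter standing in for the writer reference (names illustrative):

#include <errno.h>

struct owner { int refs; void *writer; };

static void *get_writer(struct owner *o) { o->refs++; return o->writer; }
static void put_writer(struct owner *o) { o->refs--; }

static int use_writer(struct owner *o)
{
        void *w = get_writer(o);

        if (!w) {
                put_writer(o);  /* the fix: balance the failed acquire */
                return -EROFS;
        }
        /* ... use w ... */
        put_writer(o);
        return 0;
}
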
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index da6fc0bba2e5..724c63766e82 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -58,10 +58,6 @@ struct nilfs_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_NILFS_POSIX_ACL
-	struct posix_acl *i_acl;
-	struct posix_acl *i_default_acl;
-#endif
 	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
 					   disk inode */
 	struct inode vfs_inode;
@@ -263,6 +259,7 @@ extern void nilfs_dirty_inode(struct inode *);
 extern struct dentry *nilfs_get_parent(struct dentry *);
 
 /* super.c */
+extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 57afa9d24061..d80cc71be749 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -28,7 +28,6 @@
28#include "segment.h" 28#include "segment.h"
29#include "sufile.h" 29#include "sufile.h"
30#include "page.h" 30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h" 31#include "segbuf.h"
33 32
34/* 33/*
@@ -395,6 +394,24 @@ static void dispose_recovery_list(struct list_head *head)
 	}
 }
 
+struct nilfs_segment_entry {
+	struct list_head	list;
+	__u64			segnum;
+};
+
+static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
+{
+	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+
+	if (unlikely(!ent))
+		return -ENOMEM;
+
+	ent->segnum = segnum;
+	INIT_LIST_HEAD(&ent->list);
+	list_add_tail(&ent->list, head);
+	return 0;
+}
+
 void nilfs_dispose_segment_list(struct list_head *head)
 {
 	while (!list_empty(head)) {
@@ -402,7 +419,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
 			= list_entry(head->next,
 				     struct nilfs_segment_entry, list);
 		list_del(&ent->list);
-		nilfs_free_segment_entry(ent);
+		kfree(ent);
 	}
 }
 
@@ -431,12 +448,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	if (unlikely(err))
 		goto failed;
 
-	err = -ENOMEM;
 	for (i = 1; i < 4; i++) {
-		ent = nilfs_alloc_segment_entry(segnum[i]);
-		if (unlikely(!ent))
+		err = nilfs_segment_list_add(head, segnum[i]);
+		if (unlikely(err))
 			goto failed;
-		list_add_tail(&ent->list, head);
 	}
 
 	/*
@@ -450,7 +465,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 			goto failed;
 		}
 		list_del(&ent->list);
-		nilfs_free_segment_entry(ent);
+		kfree(ent);
 	}
 
 	/* Allocate new segments for recovery */
@@ -791,7 +806,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
-	struct nilfs_segment_entry *ent;
 	LIST_HEAD(segments);
 	int empty_seg = 0, scan_newer = 0;
 	int ret;
@@ -892,12 +906,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		if (empty_seg++)
 			goto super_root_found; /* found a valid super root */
 
-		ent = nilfs_alloc_segment_entry(segnum);
-		if (unlikely(!ent)) {
-			ret = -ENOMEM;
+		ret = nilfs_segment_list_add(&segments, segnum);
+		if (unlikely(ret))
 			goto failed;
-		}
-		list_add_tail(&ent->list, &segments);
 
 		seg_seq++;
 		segnum = nextnum;
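
recovery.c now carries its own minimal segment-list entry and a single helper that allocates, initializes, and queues it, returning -ENOMEM on failure; disposal is a plain list_del plus kfree. The same shape in freestanding C with a toy singly-linked list (push-front instead of the kernel's tail-add, purely for brevity):

#include <stdlib.h>
#include <errno.h>
#include <stdint.h>

struct seg_entry {
        struct seg_entry *next;
        uint64_t segnum;
};

static int segment_list_add(struct seg_entry **head, uint64_t segnum)
{
        struct seg_entry *ent = malloc(sizeof(*ent));

        if (!ent)
                return -ENOMEM;         /* caller propagates the error */
        ent->segnum = segnum;
        ent->next = *head;
        *head = ent;
        return 0;
}

static void dispose_segment_list(struct seg_entry **head)
{
        while (*head) {
                struct seg_entry *ent = *head;

                *head = ent->next;
                free(ent);
        }
}
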
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index adccd4fc654e..0776ccc2504a 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -60,6 +60,7 @@ struct nilfs_sb_info {
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
 	struct list_head s_list;	/* list head for nilfs->ns_supers */
+	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e68821b4a9b..9e3fe17bb96b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -26,7 +26,6 @@
 #include <linux/crc32.h>
 #include "page.h"
 #include "segbuf.h"
-#include "seglist.h"
 
 
 static struct kmem_cache *nilfs_segbuf_cachep;
@@ -394,7 +393,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			 * Last BIO is always sent through the following
 			 * submission.
 			 */
-			rw |= (1 << BIO_RW_SYNCIO);
+			rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 			res = nilfs_submit_seg_bio(wi, rw);
 			if (unlikely(res))
 				goto failed_bio;
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
deleted file mode 100644
index d39df9144e99..000000000000
--- a/fs/nilfs2/seglist.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * seglist.h - expediential structure and routines to handle list of segments
- *             (would be removed in a future release)
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
- *
- */
-#ifndef _NILFS_SEGLIST_H
-#define _NILFS_SEGLIST_H
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
-#include "sufile.h"
-
-struct nilfs_segment_entry {
-	__u64			segnum;
-
-#define NILFS_SLH_FREED		0x0001	/* The segment was freed provisonally.
-					   It must be cancelled if
-					   construction aborted */
-
-	unsigned		flags;
-	struct list_head	list;
-	struct buffer_head     *bh_su;
-	struct nilfs_segment_usage *raw_su;
-};
-
-
-void nilfs_dispose_segment_list(struct list_head *);
-
-static inline struct nilfs_segment_entry *
-nilfs_alloc_segment_entry(__u64 segnum)
-{
-	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
-
-	if (likely(ent)) {
-		ent->segnum = segnum;
-		ent->flags = 0;
-		ent->bh_su = NULL;
-		ent->raw_su = NULL;
-		INIT_LIST_HEAD(&ent->list);
-	}
-	return ent;
-}
-
-static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
-					   struct inode *sufile)
-{
-	return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
-					      &ent->raw_su, &ent->bh_su);
-}
-
-static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
-					     struct inode *sufile)
-{
-	if (!ent->bh_su)
-		return;
-	nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
-	ent->bh_su = NULL;
-	ent->raw_su = NULL;
-}
-
-static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
-{
-	kfree(ent);
-}
-
-#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 22c7f65c2403..51ff3d0a4ee2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -39,7 +39,6 @@
39#include "sufile.h" 39#include "sufile.h"
40#include "cpfile.h" 40#include "cpfile.h"
41#include "ifile.h" 41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h" 42#include "segbuf.h"
44 43
45 44
@@ -79,7 +78,8 @@ enum {
 /* State flags of collection */
 #define NILFS_CF_NODE		0x0001	/* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED	0x0002	/* IFILE stage has started */
-#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED)
+#define NILFS_CF_SUFREED	0x0004	/* segment usages has been freed */
+#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED)
 
 /* Operations depending on the construction mode and file type */
 struct nilfs_sc_operations {
@@ -810,7 +810,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 {
 	return list_empty(&sci->sc_dirty_files) &&
 		!test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
-		list_empty(&sci->sc_cleaning_segments) &&
+		sci->sc_nfreesegs == 0 &&
 		(!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
 }
 
@@ -1005,44 +1005,6 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
 	}
 }
 
-static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
-					       struct inode *sufile)
-
-{
-	struct list_head *head = &sci->sc_cleaning_segments;
-	struct nilfs_segment_entry *ent;
-	int err;
-
-	list_for_each_entry(ent, head, list) {
-		if (!(ent->flags & NILFS_SLH_FREED))
-			break;
-		err = nilfs_sufile_cancel_free(sufile, ent->segnum);
-		WARN_ON(err); /* do not happen */
-		ent->flags &= ~NILFS_SLH_FREED;
-	}
-}
-
-static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
-					       struct inode *sufile)
-{
-	struct list_head *head = &sci->sc_cleaning_segments;
-	struct nilfs_segment_entry *ent;
-	int err;
-
-	list_for_each_entry(ent, head, list) {
-		err = nilfs_sufile_free(sufile, ent->segnum);
-		if (unlikely(err))
-			return err;
-		ent->flags |= NILFS_SLH_FREED;
-	}
-	return 0;
-}
-
-static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
-{
-	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
-
 static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
 					struct inode *inode,
 					struct list_head *listp,
@@ -1161,6 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct list_head *head;
 	struct nilfs_inode_info *ii;
+	size_t ndone;
 	int err = 0;
 
 	switch (sci->sc_stage.scnt) {
@@ -1250,10 +1213,16 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			break;
 		sci->sc_stage.scnt++;  /* Fall through */
 	case NILFS_ST_SUFILE:
-		err = nilfs_segctor_prepare_free_segments(sci,
-							  nilfs->ns_sufile);
-		if (unlikely(err))
+		err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
+					 sci->sc_nfreesegs, &ndone);
+		if (unlikely(err)) {
+			nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						  sci->sc_freesegs, ndone,
+						  NULL);
 			break;
+		}
+		sci->sc_stage.flags |= NILFS_CF_SUFREED;
+
 		err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
@@ -1486,7 +1455,15 @@ static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
 {
 	if (unlikely(err)) {
 		nilfs_segctor_free_incomplete_segments(sci, nilfs);
-		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			int ret;
+
+			ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(ret); /* do not happen */
+		}
 	}
 	nilfs_segctor_clear_segment_buffers(sci);
 }
@@ -1585,7 +1562,13 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
-		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(err); /* do not happen */
+		}
 		nilfs_segctor_clear_segment_buffers(sci);
 
 		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
@@ -1846,26 +1829,13 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
 		err = nilfs_segbuf_write(segbuf, &wi);
 
 		res = nilfs_segbuf_wait(segbuf, &wi);
-		err = unlikely(err) ? : res;
-		if (unlikely(err))
+		err = err ? : res;
+		if (err)
 			return err;
 	}
 	return 0;
 }
 
-static int nilfs_page_has_uncleared_buffer(struct page *page)
-{
-	struct buffer_head *head, *bh;
-
-	head = bh = page_buffers(page);
-	do {
-		if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
-			return 1;
-		bh = bh->b_this_page;
-	} while (bh != head);
-	return 0;
-}
-
 static void __nilfs_end_page_io(struct page *page, int err)
 {
 	if (!err) {
@@ -1889,13 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
 	if (!page)
 		return;
 
-	if (buffer_nilfs_node(page_buffers(page)) &&
-	    nilfs_page_has_uncleared_buffer(page))
-		/* For b-tree node pages, this function may be called twice
-		   or more because they might be split in a segment.
-		   This check assures that cleanup has been done for all
-		   buffers in a split btnode page. */
+	if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
+		/*
+		 * For b-tree node pages, this function may be called twice
+		 * or more because they might be split in a segment.
+		 */
+		if (PageDirty(page)) {
+			/*
+			 * For pages holding split b-tree node buffers, dirty
+			 * flag on the buffers may be cleared discretely.
+			 * In that case, the page is once redirtied for
+			 * remaining buffers, and it must be cancelled if
+			 * all the buffers get cleaned later.
+			 */
+			lock_page(page);
+			if (nilfs_page_buffers_clean(page))
+				__nilfs_clear_page_dirty(page);
+			unlock_page(page);
+		}
 		return;
+	}
 
 	__nilfs_end_page_io(page, err);
 }
@@ -1957,7 +1940,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
 		}
 		if (bh->b_page != fs_page) {
 			nilfs_end_page_io(fs_page, err);
-			if (unlikely(fs_page == failed_page))
+			if (fs_page && fs_page == failed_page)
 				goto done;
 			fs_page = bh->b_page;
 		}
@@ -2224,10 +2207,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	nilfs_segctor_complete_write(sci);
 
 	/* Commit segments */
-	if (has_sr) {
-		nilfs_segctor_commit_free_segments(sci);
+	if (has_sr)
 		nilfs_segctor_clear_metadata_dirty(sci);
-	}
 
 	nilfs_segctor_end_construction(sci, nilfs, 0);
 
@@ -2301,48 +2282,6 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
 	/* assign bit 0 to data files */
 }
 
-int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
-					   __u64 *segnum, size_t nsegs)
-{
-	struct nilfs_segment_entry *ent;
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	struct inode *sufile = nilfs->ns_sufile;
-	LIST_HEAD(list);
-	__u64 *pnum;
-	size_t i;
-	int err;
-
-	for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
-		ent = nilfs_alloc_segment_entry(*pnum);
-		if (unlikely(!ent)) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		list_add_tail(&ent->list, &list);
-
-		err = nilfs_open_segment_entry(ent, sufile);
-		if (unlikely(err))
-			goto failed;
-
-		if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
-			printk(KERN_WARNING "NILFS: unused segment is "
-			       "requested to be cleaned (segnum=%llu)\n",
-			       (unsigned long long)ent->segnum);
-		nilfs_close_segment_entry(ent, sufile);
-	}
-	list_splice(&list, sci->sc_cleaning_segments.prev);
-	return 0;
-
- failed:
-	nilfs_dispose_segment_list(&list);
-	return err;
-}
-
-void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
-{
-	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
-
 struct nilfs_segctor_wait_request {
 	wait_queue_t wq;
 	__u32 seq;
@@ -2607,10 +2546,13 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	err = nilfs_init_gcdat_inode(nilfs);
 	if (unlikely(err))
 		goto out_unlock;
+
 	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
 	if (unlikely(err))
 		goto out_unlock;
 
+	sci->sc_freesegs = kbufs[4];
+	sci->sc_nfreesegs = argv[4].v_nmembs;
 	list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
 
 	for (;;) {
@@ -2629,6 +2571,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	}
 
  out_unlock:
+	sci->sc_freesegs = NULL;
+	sci->sc_nfreesegs = 0;
 	nilfs_clear_gcdat_inode(nilfs);
 	nilfs_transaction_unlock(sbi);
 	return err;
@@ -2835,7 +2779,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	INIT_LIST_HEAD(&sci->sc_dirty_files);
 	INIT_LIST_HEAD(&sci->sc_segbufs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
-	INIT_LIST_HEAD(&sci->sc_cleaning_segments);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2901,9 +2844,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 		nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
 	}
 
-	if (!list_empty(&sci->sc_cleaning_segments))
-		nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
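
The segment-constructor rework above replaces the NILFS_SLH_FREED flagging dance with an array: nilfs_sufile_freev() frees sc_nfreesegs segments and reports how many it completed (ndone), and on any later failure nilfs_sufile_cancel_freev() re-frees exactly that prefix, with NILFS_CF_SUFREED recording whether there is anything to undo. The prepare/cancel prefix pattern in miniature (stubs stand in for the sufile operations):

#include <stddef.h>

static int free_one(unsigned long long seg) { (void)seg; return 0; }
static void cancel_one(unsigned long long seg) { (void)seg; }

/* Free segs[0..nsegs); on failure report how many succeeded so the
 * caller can cancel exactly that prefix and nothing more.
 */
static int freev(const unsigned long long *segs, size_t nsegs, size_t *ndone)
{
        size_t i;
        int err;

        for (i = 0; i < nsegs; i++) {
                err = free_one(segs[i]);
                if (err) {
                        *ndone = i;     /* partial progress */
                        return err;
                }
        }
        *ndone = nsegs;
        return 0;
}

static void cancel_freev(const unsigned long long *segs, size_t ndone)
{
        while (ndone--)
                cancel_one(segs[ndone]);
}
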
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 476bdd5df5be..0d2a475a741b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -90,8 +90,9 @@ struct nilfs_segsum_pointer {
90 * @sc_nblk_inc: Block count of current generation 90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written 91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written 92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data 93 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
94 * @sc_freesegs: array of segment numbers to be freed
95 * @sc_nfreesegs: number of segments in @sc_freesegs
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation 96 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages 97 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive) 98 * @sc_dsync_end: end byte offset of data pages (inclusive)
@@ -131,9 +132,11 @@ struct nilfs_sc_info {
131 132
132 struct list_head sc_dirty_files; 133 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes; 134 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers; 135 struct list_head sc_copied_buffers;
136 136
137 __u64 *sc_freesegs;
138 size_t sc_nfreesegs;
139
137 struct nilfs_inode_info *sc_dsync_inode; 140 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start; 141 loff_t sc_dsync_start;
139 loff_t sc_dsync_end; 142 loff_t sc_dsync_end;
@@ -225,10 +228,6 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 228extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
226 void **); 229 void **);
227 230
228extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
229 __u64 *, size_t);
230extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
231
232extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); 231extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
233extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 232extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
234 233
@@ -240,5 +239,6 @@ extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
240extern int nilfs_recover_logical_segments(struct the_nilfs *, 239extern int nilfs_recover_logical_segments(struct the_nilfs *,
241 struct nilfs_sb_info *, 240 struct nilfs_sb_info *,
242 struct nilfs_recovery_info *); 241 struct nilfs_recovery_info *);
242extern void nilfs_dispose_segment_list(struct list_head *);
243 243
244#endif /* _NILFS_SEGMENT_H */ 244#endif /* _NILFS_SEGMENT_H */
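
The cleaning-segment bookkeeping moves from a kmalloc'd list of nilfs_segment_entry objects to a caller-supplied __u64 array (@sc_freesegs) plus a count (@sc_nfreesegs), so consumers can test membership with a simple scan. A minimal sketch of such a check, assuming a hypothetical helper name (the segment-constructor code that consumes the array is outside these hunks):

	/* Hypothetical helper: is @segnum scheduled to be freed? */
	static int nilfs_segnum_is_marked_free(const struct nilfs_sc_info *sci,
					       __u64 segnum)
	{
		size_t i;

		for (i = 0; i < sci->sc_nfreesegs; i++)
			if (sci->sc_freesegs[i] == segnum)
				return 1;
		return 0;
	}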
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 98e68677f045..37994d4a59cc 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,6 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
21 */ 22 */
22 23
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -108,6 +109,102 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
108 nilfs_mdt_mark_buffer_dirty(header_bh); 109 nilfs_mdt_mark_buffer_dirty(header_bh);
109} 110}
110 111
112/**
113 * nilfs_sufile_updatev - modify multiple segment usages at a time
114 * @sufile: inode of segment usage file
115 * @segnumv: array of segment numbers
116 * @nsegs: size of @segnumv array
117 * @create: creation flag
118 * @ndone: place to store number of modified segments on @segnumv
119 * @dofunc: primitive operation for the update
120 *
121 * Description: nilfs_sufile_updatev() repeatedly calls @dofunc
122 * against the given array of segments. The @dofunc is called with
123 * buffers of a header block and the sufile block in which the target
124 * segment usage entry is contained. If @ndone is given, the number
125 * of successfully modified segments from the head is stored in the
126 * place @ndone points to.
127 *
128 * Return Value: On success, zero is returned. On error, one of the
129 * following negative error codes is returned.
130 *
131 * %-EIO - I/O error.
132 *
133 * %-ENOMEM - Insufficient amount of memory available.
134 *
135 * %-ENOENT - Given segment usage is in hole block (may be returned if
136 * @create is zero)
137 *
138 * %-EINVAL - Invalid segment usage number
139 */
140int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
141 int create, size_t *ndone,
142 void (*dofunc)(struct inode *, __u64,
143 struct buffer_head *,
144 struct buffer_head *))
145{
146 struct buffer_head *header_bh, *bh;
147 unsigned long blkoff, prev_blkoff;
148 __u64 *seg;
149 size_t nerr = 0, n = 0;
150 int ret = 0;
151
152 if (unlikely(nsegs == 0))
153 goto out;
154
155 down_write(&NILFS_MDT(sufile)->mi_sem);
156 for (seg = segnumv; seg < segnumv + nsegs; seg++) {
157 if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
158 printk(KERN_WARNING
159 "%s: invalid segment number: %llu\n", __func__,
160 (unsigned long long)*seg);
161 nerr++;
162 }
163 }
164 if (nerr > 0) {
165 ret = -EINVAL;
166 goto out_sem;
167 }
168
169 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
170 if (ret < 0)
171 goto out_sem;
172
173 seg = segnumv;
174 blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
175 ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
176 if (ret < 0)
177 goto out_header;
178
179 for (;;) {
180 dofunc(sufile, *seg, header_bh, bh);
181
182 if (++seg >= segnumv + nsegs)
183 break;
184 prev_blkoff = blkoff;
185 blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
186 if (blkoff == prev_blkoff)
187 continue;
188
189 /* get different block */
190 brelse(bh);
191 ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
192 if (unlikely(ret < 0))
193 goto out_header;
194 }
195 brelse(bh);
196
197 out_header:
198 n = seg - segnumv;
199 brelse(header_bh);
200 out_sem:
201 up_write(&NILFS_MDT(sufile)->mi_sem);
202 out:
203 if (ndone)
204 *ndone = n;
205 return ret;
206}
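
Note that nilfs_sufile_updatev() above only re-reads a sufile block when the next segment number maps to a different block offset, so sorted, clustered input is cheap. A worked illustration under assumed geometry (the block and entry sizes here are hypothetical, not taken from this patch):

	/*
	 * Assume 4096-byte blocks and 16-byte segment usage entries,
	 * i.e. 4096 / 16 = 256 entries per block.  Then:
	 *   segnumv = { 0, 1, ..., 255 }  ->  1 nilfs_mdt_get_block() call
	 *   segnumv = { 0, 256, 512 }     ->  3 nilfs_mdt_get_block() calls
	 */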
207
111int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, 208int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
112 void (*dofunc)(struct inode *, __u64, 209 void (*dofunc)(struct inode *, __u64,
113 struct buffer_head *, 210 struct buffer_head *,
@@ -490,7 +587,8 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
490 * nilfs_sufile_get_suinfo - get segment usage information 587 * nilfs_sufile_get_suinfo - get segment usage information
491 * @sufile: inode of segment usage file 588 * @sufile: inode of segment usage file
492 * @segnum: segment number to start looking 589 * @segnum: segment number to start looking
493 * @si: array of suinfo 590 * @buf: array of suinfo
591 * @sisz: byte size of suinfo
494 * @nsi: size of suinfo array 592 * @nsi: size of suinfo array
495 * 593 *
496 * Description: 594 * Description:
@@ -502,11 +600,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
502 * 600 *
503 * %-ENOMEM - Insufficient amount of memory available. 601 * %-ENOMEM - Insufficient amount of memory available.
504 */ 602 */
505ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, 603ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
506 struct nilfs_suinfo *si, size_t nsi) 604 unsigned sisz, size_t nsi)
507{ 605{
508 struct buffer_head *su_bh; 606 struct buffer_head *su_bh;
509 struct nilfs_segment_usage *su; 607 struct nilfs_segment_usage *su;
608 struct nilfs_suinfo *si = buf;
510 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 609 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
511 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 610 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
512 void *kaddr; 611 void *kaddr;
@@ -531,20 +630,22 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
531 if (ret != -ENOENT) 630 if (ret != -ENOENT)
532 goto out; 631 goto out;
533 /* hole */ 632 /* hole */
534 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); 633 memset(si, 0, sisz * n);
634 si = (void *)si + sisz * n;
535 continue; 635 continue;
536 } 636 }
537 637
538 kaddr = kmap_atomic(su_bh->b_page, KM_USER0); 638 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
539 su = nilfs_sufile_block_get_segment_usage( 639 su = nilfs_sufile_block_get_segment_usage(
540 sufile, segnum, su_bh, kaddr); 640 sufile, segnum, su_bh, kaddr);
541 for (j = 0; j < n; j++, su = (void *)su + susz) { 641 for (j = 0; j < n;
542 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); 642 j++, su = (void *)su + susz, si = (void *)si + sisz) {
543 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); 643 si->sui_lastmod = le64_to_cpu(su->su_lastmod);
544 si[i + j].sui_flags = le32_to_cpu(su->su_flags) & 644 si->sui_nblocks = le32_to_cpu(su->su_nblocks);
645 si->sui_flags = le32_to_cpu(su->su_flags) &
545 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); 646 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
546 if (nilfs_segment_is_active(nilfs, segnum + j)) 647 if (nilfs_segment_is_active(nilfs, segnum + j))
547 si[i + j].sui_flags |= 648 si->sui_flags |=
548 (1UL << NILFS_SEGMENT_USAGE_ACTIVE); 649 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
549 } 650 }
550 kunmap_atomic(kaddr, KM_USER0); 651 kunmap_atomic(kaddr, KM_USER0);
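
Threading the record size @sisz through nilfs_sufile_get_suinfo() decouples the kernel's stride over the output buffer from sizeof(struct nilfs_suinfo), so the record can grow without breaking older callers. A sketch of a call site under that reading (the batch size and loop framing are illustrative):

	enum { NCHUNK = 16 };			/* illustrative batch size */
	struct nilfs_suinfo info[NCHUNK];
	ssize_t n;

	n = nilfs_sufile_get_suinfo(sufile, segnum, info,
				    sizeof(struct nilfs_suinfo), NCHUNK);
	if (n > 0)
		segnum += n;	/* resume the scan after the last segment */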
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2e2efd4ade1..a2c4d76c3366 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -43,43 +43,27 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
43 struct buffer_head *); 43 struct buffer_head *);
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); 45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, 46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
47 size_t); 47 size_t);
48 48
49int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
50 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *,
52 struct buffer_head *));
49int nilfs_sufile_update(struct inode *, __u64, int, 53int nilfs_sufile_update(struct inode *, __u64, int,
50 void (*dofunc)(struct inode *, __u64, 54 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *, 55 struct buffer_head *,
52 struct buffer_head *)); 56 struct buffer_head *));
53void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
54 struct buffer_head *);
55void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, 57void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
56 struct buffer_head *); 58 struct buffer_head *);
57void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, 59void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
58 struct buffer_head *); 60 struct buffer_head *);
61void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *);
59void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 63void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
60 struct buffer_head *); 64 struct buffer_head *);
61 65
62/** 66/**
63 * nilfs_sufile_cancel_free -
64 * @sufile: inode of segment usage file
65 * @segnum: segment number
66 *
67 * Description:
68 *
69 * Return Value: On success, 0 is returned. On error, one of the following
70 * negative error codes is returned.
71 *
72 * %-EIO - I/O error.
73 *
74 * %-ENOMEM - Insufficient amount of memory available.
75 */
76static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
77{
78 return nilfs_sufile_update(sufile, segnum, 0,
79 nilfs_sufile_do_cancel_free);
80}
81
82/**
83 * nilfs_sufile_scrap - make a segment garbage 67 * nilfs_sufile_scrap - make a segment garbage
84 * @sufile: inode of segment usage file 68 * @sufile: inode of segment usage file
85 * @segnum: segment number to be freed 69 * @segnum: segment number to be freed
@@ -100,6 +84,38 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
100} 84}
101 85
102/** 86/**
87 * nilfs_sufile_freev - free segments
88 * @sufile: inode of segment usage file
89 * @segnumv: array of segment numbers
90 * @nsegs: size of @segnumv array
91 * @ndone: place to store the number of freed segments
92 */
93static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
94 size_t nsegs, size_t *ndone)
95{
96 return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
97 nilfs_sufile_do_free);
98}
99
100/**
101 * nilfs_sufile_cancel_freev - cancel freeing of segments
102 * @sufile: inode of segment usage file
103 * @segnumv: array of segment numbers
104 * @nsegs: size of @segnumv array
105 * @ndone: place to store the number of cancelled segments
106 *
107 * Return Value: On success, 0 is returned. On error, a negative error
108 * code is returned.
109 */
110static inline int nilfs_sufile_cancel_freev(struct inode *sufile,
111 __u64 *segnumv, size_t nsegs,
112 size_t *ndone)
113{
114 return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
115 nilfs_sufile_do_cancel_free);
116}
117
118/**
103 * nilfs_sufile_set_error - mark a segment as erroneous 119 * nilfs_sufile_set_error - mark a segment as erroneous
104 * @sufile: inode of segment usage file 120 * @sufile: inode of segment usage file
105 * @segnum: segment number 121 * @segnum: segment number
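
nilfs_sufile_freev() and nilfs_sufile_cancel_freev() are designed as a pair: since nilfs_sufile_updatev() reports through @ndone how many entries were processed before a failure, a caller can undo a partial free. A minimal sketch of that rollback pattern (this mirrors what a cleaner-side caller would plausibly do; it is not code from this patch):

	size_t ndone, ncancelled;
	int ret;

	ret = nilfs_sufile_freev(sufile, segnumv, nsegs, &ndone);
	if (ret < 0 && ndone > 0)
		/* re-mark the first @ndone segments as still in use */
		nilfs_sufile_cancel_freev(sufile, segnumv, ndone,
					  &ncancelled);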
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6989b03e97ab..151964f0de4c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -65,9 +65,8 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 65 "(NILFS)");
66MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
67 67
68static void nilfs_write_super(struct super_block *sb);
68static int nilfs_remount(struct super_block *sb, int *flags, char *data); 69static int nilfs_remount(struct super_block *sb, int *flags, char *data);
69static int test_exclusive_mount(struct file_system_type *fs_type,
70 struct block_device *bdev, int flags);
71 70
72/** 71/**
73 * nilfs_error() - report failure condition on a filesystem 72 * nilfs_error() - report failure condition on a filesystem
@@ -134,7 +133,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
134 133
135static struct kmem_cache *nilfs_inode_cachep; 134static struct kmem_cache *nilfs_inode_cachep;
136 135
137struct inode *nilfs_alloc_inode(struct super_block *sb) 136struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
138{ 137{
139 struct nilfs_inode_info *ii; 138 struct nilfs_inode_info *ii;
140 139
@@ -144,10 +143,15 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
144 ii->i_bh = NULL; 143 ii->i_bh = NULL;
145 ii->i_state = 0; 144 ii->i_state = 0;
146 ii->vfs_inode.i_version = 1; 145 ii->vfs_inode.i_version = 1;
147 nilfs_btnode_cache_init(&ii->i_btnode_cache); 146 nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
148 return &ii->vfs_inode; 147 return &ii->vfs_inode;
149} 148}
150 149
150struct inode *nilfs_alloc_inode(struct super_block *sb)
151{
152 return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
153}
154
151void nilfs_destroy_inode(struct inode *inode) 155void nilfs_destroy_inode(struct inode *inode)
152{ 156{
153 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 157 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
@@ -185,16 +189,6 @@ static void nilfs_clear_inode(struct inode *inode)
185{ 189{
186 struct nilfs_inode_info *ii = NILFS_I(inode); 190 struct nilfs_inode_info *ii = NILFS_I(inode);
187 191
188#ifdef CONFIG_NILFS_POSIX_ACL
189 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
190 posix_acl_release(ii->i_acl);
191 ii->i_acl = NILFS_ACL_NOT_CACHED;
192 }
193 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
194 posix_acl_release(ii->i_default_acl);
195 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
196 }
197#endif
198 /* 192 /*
199 * Free resources allocated in nilfs_read_inode(), here. 193 * Free resources allocated in nilfs_read_inode(), here.
200 */ 194 */
@@ -315,6 +309,11 @@ static void nilfs_put_super(struct super_block *sb)
315 struct nilfs_sb_info *sbi = NILFS_SB(sb); 309 struct nilfs_sb_info *sbi = NILFS_SB(sb);
316 struct the_nilfs *nilfs = sbi->s_nilfs; 310 struct the_nilfs *nilfs = sbi->s_nilfs;
317 311
312 lock_kernel();
313
314 if (sb->s_dirt)
315 nilfs_write_super(sb);
316
318 nilfs_detach_segment_constructor(sbi); 317 nilfs_detach_segment_constructor(sbi);
319 318
320 if (!(sb->s_flags & MS_RDONLY)) { 319 if (!(sb->s_flags & MS_RDONLY)) {
@@ -323,12 +322,18 @@ static void nilfs_put_super(struct super_block *sb)
323 nilfs_commit_super(sbi, 1); 322 nilfs_commit_super(sbi, 1);
324 up_write(&nilfs->ns_sem); 323 up_write(&nilfs->ns_sem);
325 } 324 }
325 down_write(&nilfs->ns_super_sem);
326 if (nilfs->ns_current == sbi)
327 nilfs->ns_current = NULL;
328 up_write(&nilfs->ns_super_sem);
326 329
327 nilfs_detach_checkpoint(sbi); 330 nilfs_detach_checkpoint(sbi);
328 put_nilfs(sbi->s_nilfs); 331 put_nilfs(sbi->s_nilfs);
329 sbi->s_super = NULL; 332 sbi->s_super = NULL;
330 sb->s_fs_info = NULL; 333 sb->s_fs_info = NULL;
331 kfree(sbi); 334 nilfs_put_sbinfo(sbi);
335
336 unlock_kernel();
332} 337}
333 338
334/** 339/**
@@ -383,6 +388,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383{ 388{
384 int err = 0; 389 int err = 0;
385 390
391 nilfs_write_super(sb);
392
386 /* This function is called when super block should be written back */ 393 /* This function is called when super block should be written back */
387 if (wait) 394 if (wait)
388 err = nilfs_construct_segment(sb); 395 err = nilfs_construct_segment(sb);
@@ -396,9 +403,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
396 struct buffer_head *bh_cp; 403 struct buffer_head *bh_cp;
397 int err; 404 int err;
398 405
399 down_write(&nilfs->ns_sem); 406 down_write(&nilfs->ns_super_sem);
400 list_add(&sbi->s_list, &nilfs->ns_supers); 407 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_sem); 408 up_write(&nilfs->ns_super_sem);
402 409
403 sbi->s_ifile = nilfs_mdt_new( 410 sbi->s_ifile = nilfs_mdt_new(
404 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); 411 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
@@ -409,8 +416,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
409 if (unlikely(err)) 416 if (unlikely(err))
410 goto failed; 417 goto failed;
411 418
419 down_read(&nilfs->ns_segctor_sem);
412 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 420 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
413 &bh_cp); 421 &bh_cp);
422 up_read(&nilfs->ns_segctor_sem);
414 if (unlikely(err)) { 423 if (unlikely(err)) {
415 if (err == -ENOENT || err == -EINVAL) { 424 if (err == -ENOENT || err == -EINVAL) {
416 printk(KERN_ERR 425 printk(KERN_ERR
@@ -436,9 +445,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
436 nilfs_mdt_destroy(sbi->s_ifile); 445 nilfs_mdt_destroy(sbi->s_ifile);
437 sbi->s_ifile = NULL; 446 sbi->s_ifile = NULL;
438 447
439 down_write(&nilfs->ns_sem); 448 down_write(&nilfs->ns_super_sem);
440 list_del_init(&sbi->s_list); 449 list_del_init(&sbi->s_list);
441 up_write(&nilfs->ns_sem); 450 up_write(&nilfs->ns_super_sem);
442 451
443 return err; 452 return err;
444} 453}
@@ -450,9 +459,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
450 nilfs_mdt_clear(sbi->s_ifile); 459 nilfs_mdt_clear(sbi->s_ifile);
451 nilfs_mdt_destroy(sbi->s_ifile); 460 nilfs_mdt_destroy(sbi->s_ifile);
452 sbi->s_ifile = NULL; 461 sbi->s_ifile = NULL;
453 down_write(&nilfs->ns_sem); 462 down_write(&nilfs->ns_super_sem);
454 list_del_init(&sbi->s_list); 463 list_del_init(&sbi->s_list);
455 up_write(&nilfs->ns_sem); 464 up_write(&nilfs->ns_super_sem);
456} 465}
457 466
458static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) 467static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
@@ -752,7 +761,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
752 * @silent: silent mode flag 761 * @silent: silent mode flag
753 * @nilfs: the_nilfs struct 762 * @nilfs: the_nilfs struct
754 * 763 *
755 * This function is called exclusively by bd_mount_mutex. 764 * This function is called exclusively under nilfs->ns_mount_mutex.
756 * So, the recovery process is protected from other simultaneous mounts. 765 * So, the recovery process is protected from other simultaneous mounts.
757 */ 766 */
758static int 767static int
@@ -773,6 +782,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
773 get_nilfs(nilfs); 782 get_nilfs(nilfs);
774 sbi->s_nilfs = nilfs; 783 sbi->s_nilfs = nilfs;
775 sbi->s_super = sb; 784 sbi->s_super = sb;
785 atomic_set(&sbi->s_count, 1);
776 786
777 err = init_nilfs(nilfs, sbi, (char *)data); 787 err = init_nilfs(nilfs, sbi, (char *)data);
778 if (err) 788 if (err)
@@ -870,6 +880,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
870 goto failed_root; 880 goto failed_root;
871 } 881 }
872 882
883 down_write(&nilfs->ns_super_sem);
884 if (!nilfs_test_opt(sbi, SNAPSHOT))
885 nilfs->ns_current = sbi;
886 up_write(&nilfs->ns_super_sem);
887
873 return 0; 888 return 0;
874 889
875 failed_root: 890 failed_root:
@@ -885,7 +900,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
885 failed_sbi: 900 failed_sbi:
886 put_nilfs(nilfs); 901 put_nilfs(nilfs);
887 sb->s_fs_info = NULL; 902 sb->s_fs_info = NULL;
888 kfree(sbi); 903 nilfs_put_sbinfo(sbi);
889 return err; 904 return err;
890} 905}
891 906
@@ -898,6 +913,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
898 struct nilfs_mount_options old_opts; 913 struct nilfs_mount_options old_opts;
899 int err; 914 int err;
900 915
916 lock_kernel();
917
918 down_write(&nilfs->ns_super_sem);
901 old_sb_flags = sb->s_flags; 919 old_sb_flags = sb->s_flags;
902 old_opts.mount_opt = sbi->s_mount_opt; 920 old_opts.mount_opt = sbi->s_mount_opt;
903 old_opts.snapshot_cno = sbi->s_snapshot_cno; 921 old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -945,14 +963,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
945 * store the current valid flag. (It may have been changed 963 * store the current valid flag. (It may have been changed
946 * by fsck since we originally mounted the partition.) 964 * by fsck since we originally mounted the partition.)
947 */ 965 */
948 down(&sb->s_bdev->bd_mount_sem); 966 if (nilfs->ns_current && nilfs->ns_current != sbi) {
949 /* Check existing RW-mount */
950 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
951 printk(KERN_WARNING "NILFS (device %s): couldn't " 967 printk(KERN_WARNING "NILFS (device %s): couldn't "
952 "remount because a RW-mount exists.\n", 968 "remount because an RW-mount exists.\n",
953 sb->s_id); 969 sb->s_id);
954 err = -EBUSY; 970 err = -EBUSY;
955 goto rw_remount_failed; 971 goto restore_opts;
956 } 972 }
957 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { 973 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
958 printk(KERN_WARNING "NILFS (device %s): couldn't " 974 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -960,7 +976,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
960 "the latest one.\n", 976 "the latest one.\n",
961 sb->s_id); 977 sb->s_id);
962 err = -EINVAL; 978 err = -EINVAL;
963 goto rw_remount_failed; 979 goto restore_opts;
964 } 980 }
965 sb->s_flags &= ~MS_RDONLY; 981 sb->s_flags &= ~MS_RDONLY;
966 nilfs_clear_opt(sbi, SNAPSHOT); 982 nilfs_clear_opt(sbi, SNAPSHOT);
@@ -968,28 +984,31 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
968 984
969 err = nilfs_attach_segment_constructor(sbi); 985 err = nilfs_attach_segment_constructor(sbi);
970 if (err) 986 if (err)
971 goto rw_remount_failed; 987 goto restore_opts;
972 988
973 down_write(&nilfs->ns_sem); 989 down_write(&nilfs->ns_sem);
974 nilfs_setup_super(sbi); 990 nilfs_setup_super(sbi);
975 up_write(&nilfs->ns_sem); 991 up_write(&nilfs->ns_sem);
976 992
977 up(&sb->s_bdev->bd_mount_sem); 993 nilfs->ns_current = sbi;
978 } 994 }
979 out: 995 out:
996 up_write(&nilfs->ns_super_sem);
997 unlock_kernel();
980 return 0; 998 return 0;
981 999
982 rw_remount_failed:
983 up(&sb->s_bdev->bd_mount_sem);
984 restore_opts: 1000 restore_opts:
985 sb->s_flags = old_sb_flags; 1001 sb->s_flags = old_sb_flags;
986 sbi->s_mount_opt = old_opts.mount_opt; 1002 sbi->s_mount_opt = old_opts.mount_opt;
987 sbi->s_snapshot_cno = old_opts.snapshot_cno; 1003 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1004 up_write(&nilfs->ns_super_sem);
1005 unlock_kernel();
988 return err; 1006 return err;
989} 1007}
990 1008
991struct nilfs_super_data { 1009struct nilfs_super_data {
992 struct block_device *bdev; 1010 struct block_device *bdev;
1011 struct nilfs_sb_info *sbi;
993 __u64 cno; 1012 __u64 cno;
994 int flags; 1013 int flags;
995}; 1014};
@@ -1048,33 +1067,7 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
1048{ 1067{
1049 struct nilfs_super_data *sd = data; 1068 struct nilfs_super_data *sd = data;
1050 1069
1051 return s->s_bdev == sd->bdev; 1070 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1052}
1053
1054static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1055{
1056 struct nilfs_super_data *sd = data;
1057 int ret;
1058
1059 if (s->s_bdev != sd->bdev)
1060 return 0;
1061
1062 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1063 return 1; /* Reuse an old R/W-mode super_block */
1064
1065 if (s->s_flags & sd->flags & MS_RDONLY) {
1066 if (down_read_trylock(&s->s_umount)) {
1067 ret = s->s_root &&
1068 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1069 up_read(&s->s_umount);
1070 /*
1071 * This path is locked with sb_lock by sget().
1072 * So, drop_super() causes deadlock.
1073 */
1074 return ret;
1075 }
1076 }
1077 return 0;
1078} 1071}
1079 1072
1080static int 1073static int
@@ -1082,8 +1075,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1082 const char *dev_name, void *data, struct vfsmount *mnt) 1075 const char *dev_name, void *data, struct vfsmount *mnt)
1083{ 1076{
1084 struct nilfs_super_data sd; 1077 struct nilfs_super_data sd;
1085 struct super_block *s, *s2; 1078 struct super_block *s;
1086 struct the_nilfs *nilfs = NULL; 1079 struct the_nilfs *nilfs;
1087 int err, need_to_close = 1; 1080 int err, need_to_close = 1;
1088 1081
1089 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 1082 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
@@ -1095,7 +1088,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1095 * much more information than normal filesystems to identify mount 1088 * much more information than normal filesystems to identify mount
1096 * instance. For snapshot mounts, not only a mount type (ro-mount 1089 * instance. For snapshot mounts, not only a mount type (ro-mount
1097 * or rw-mount) but also a checkpoint number is required. 1090 * or rw-mount) but also a checkpoint number is required.
1098 * The results are passed in sget() using nilfs_super_data.
1099 */ 1091 */
1100 sd.cno = 0; 1092 sd.cno = 0;
1101 sd.flags = flags; 1093 sd.flags = flags;
@@ -1104,64 +1096,59 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1104 goto failed; 1096 goto failed;
1105 } 1097 }
1106 1098
1107 /* 1099 nilfs = find_or_create_nilfs(sd.bdev);
1108 * once the super is inserted into the list by sget, s_umount 1100 if (!nilfs) {
1109 * will protect the lockfs code from trying to start a snapshot 1101 err = -ENOMEM;
1110 * while we are mounting 1102 goto failed;
1111 */
1112 down(&sd.bdev->bd_mount_sem);
1113 if (!sd.cno &&
1114 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1115 err = (err < 0) ? : -EBUSY;
1116 goto failed_unlock;
1117 } 1103 }
1118 1104
1119 /* 1105 mutex_lock(&nilfs->ns_mount_mutex);
1120 * Phase-1: search any existent instance and get the_nilfs
1121 */
1122 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1123 if (IS_ERR(s))
1124 goto error_s;
1125
1126 if (!s->s_root) {
1127 err = -ENOMEM;
1128 nilfs = alloc_nilfs(sd.bdev);
1129 if (!nilfs)
1130 goto cancel_new;
1131 } else {
1132 struct nilfs_sb_info *sbi = NILFS_SB(s);
1133 1106
1107 if (!sd.cno) {
1134 /* 1108 /*
1135 * s_umount protects super_block from unmount process; 1109 * Check if an exclusive mount exists or not.
1136 * It covers pointers of nilfs_sb_info and the_nilfs. 1110 * Snapshot mounts coexist with a current mount
1111 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1112 * ro-mount are mutually exclusive.
1137 */ 1113 */
1138 nilfs = sbi->s_nilfs; 1114 down_read(&nilfs->ns_super_sem);
1139 get_nilfs(nilfs); 1115 if (nilfs->ns_current &&
1140 up_write(&s->s_umount); 1116 ((nilfs->ns_current->s_super->s_flags ^ flags)
1117 & MS_RDONLY)) {
1118 up_read(&nilfs->ns_super_sem);
1119 err = -EBUSY;
1120 goto failed_unlock;
1121 }
1122 up_read(&nilfs->ns_super_sem);
1123 }
1141 1124
1142 /* 1125 /*
1143 * Phase-2: search specified snapshot or R/W mode super_block 1126 * Find existing nilfs_sb_info struct
1144 */ 1127 */
1145 if (!sd.cno) 1128 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1146 /* trying to get the latest checkpoint. */
1147 sd.cno = nilfs_last_cno(nilfs);
1148 1129
1149 s2 = sget(fs_type, nilfs_test_bdev_super2, 1130 if (!sd.cno)
1150 nilfs_set_bdev_super, &sd); 1131 /* trying to get the latest checkpoint. */
1151 deactivate_super(s); 1132 sd.cno = nilfs_last_cno(nilfs);
1152 /* 1133
1153 * Although deactivate_super() invokes close_bdev_exclusive() at 1134 /*
1154 * kill_block_super(). Here, s is an existent mount; we need 1135 * Get super block instance holding the nilfs_sb_info struct.
1155 * one more close_bdev_exclusive() call. 1136 * A new instance is allocated if no existing mount is present or
1156 */ 1137 * existing instance has been unmounted.
1157 s = s2; 1138 */
1158 if (IS_ERR(s)) 1139 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1159 goto error_s; 1140 if (sd.sbi)
1141 nilfs_put_sbinfo(sd.sbi);
1142
1143 if (IS_ERR(s)) {
1144 err = PTR_ERR(s);
1145 goto failed_unlock;
1160 } 1146 }
1161 1147
1162 if (!s->s_root) { 1148 if (!s->s_root) {
1163 char b[BDEVNAME_SIZE]; 1149 char b[BDEVNAME_SIZE];
1164 1150
1151 /* New superblock instance created */
1165 s->s_flags = flags; 1152 s->s_flags = flags;
1166 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1153 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1167 sb_set_blocksize(s, block_size(sd.bdev)); 1154 sb_set_blocksize(s, block_size(sd.bdev));
@@ -1172,26 +1159,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1172 1159
1173 s->s_flags |= MS_ACTIVE; 1160 s->s_flags |= MS_ACTIVE;
1174 need_to_close = 0; 1161 need_to_close = 0;
1175 } else if (!(s->s_flags & MS_RDONLY)) {
1176 err = -EBUSY;
1177 } 1162 }
1178 1163
1179 up(&sd.bdev->bd_mount_sem); 1164 mutex_unlock(&nilfs->ns_mount_mutex);
1180 put_nilfs(nilfs); 1165 put_nilfs(nilfs);
1181 if (need_to_close) 1166 if (need_to_close)
1182 close_bdev_exclusive(sd.bdev, flags); 1167 close_bdev_exclusive(sd.bdev, flags);
1183 simple_set_mnt(mnt, s); 1168 simple_set_mnt(mnt, s);
1184 return 0; 1169 return 0;
1185 1170
1186 error_s:
1187 up(&sd.bdev->bd_mount_sem);
1188 if (nilfs)
1189 put_nilfs(nilfs);
1190 close_bdev_exclusive(sd.bdev, flags);
1191 return PTR_ERR(s);
1192
1193 failed_unlock: 1171 failed_unlock:
1194 up(&sd.bdev->bd_mount_sem); 1172 mutex_unlock(&nilfs->ns_mount_mutex);
1173 put_nilfs(nilfs);
1195 failed: 1174 failed:
1196 close_bdev_exclusive(sd.bdev, flags); 1175 close_bdev_exclusive(sd.bdev, flags);
1197 1176
@@ -1199,70 +1178,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1199 1178
1200 cancel_new: 1179 cancel_new:
1201 /* Abandoning the newly allocated superblock */ 1180 /* Abandoning the newly allocated superblock */
1202 up(&sd.bdev->bd_mount_sem); 1181 mutex_unlock(&nilfs->ns_mount_mutex);
1203 if (nilfs) 1182 put_nilfs(nilfs);
1204 put_nilfs(nilfs);
1205 up_write(&s->s_umount); 1183 up_write(&s->s_umount);
1206 deactivate_super(s); 1184 deactivate_super(s);
1207 /* 1185 /*
1208 * deactivate_super() invokes close_bdev_exclusive(). 1186 * deactivate_super() invokes close_bdev_exclusive().
1209 * We must finish all post-cleaning before this call; 1187 * We must finish all post-cleaning before this call;
1210 * put_nilfs() and unlocking bd_mount_sem need the block device. 1188 * put_nilfs() needs the block device.
1211 */ 1189 */
1212 return err; 1190 return err;
1213} 1191}
1214 1192
1215static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1216{
1217 struct nilfs_super_data *sd = data;
1218 int ret;
1219
1220 if (s->s_bdev != sd->bdev)
1221 return 0;
1222 if (down_read_trylock(&s->s_umount)) {
1223 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1224 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1225 up_read(&s->s_umount);
1226 if (ret)
1227 return 0; /* ignore snapshot mounts */
1228 }
1229 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1230}
1231
1232static int __false_bdev_super(struct super_block *s, void *data)
1233{
1234#if 0 /* XXX: workaround for lock debug. This is not good idea */
1235 up_write(&s->s_umount);
1236#endif
1237 return -EFAULT;
1238}
1239
1240/**
1241 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1242 * fs_type: filesystem type
1243 * bdev: block device
1244 * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1245 * res: pointer to an integer to store result
1246 *
1247 * This function must be called within a section protected by bd_mount_mutex.
1248 */
1249static int test_exclusive_mount(struct file_system_type *fs_type,
1250 struct block_device *bdev, int flags)
1251{
1252 struct super_block *s;
1253 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1254
1255 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1256 if (IS_ERR(s)) {
1257 if (PTR_ERR(s) != -EFAULT)
1258 return PTR_ERR(s);
1259 return 0; /* Not found */
1260 }
1261 up_write(&s->s_umount);
1262 deactivate_super(s);
1263 return 1; /* Found */
1264}
1265
1266struct file_system_type nilfs_fs_type = { 1193struct file_system_type nilfs_fs_type = {
1267 .owner = THIS_MODULE, 1194 .owner = THIS_MODULE,
1268 .name = "nilfs2", 1195 .name = "nilfs2",
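
The net effect of the super.c rewrite is that sget() now matches super blocks by nilfs_sb_info identity rather than by block device, which is what makes the bespoke test callbacks deleted in these hunks unnecessary. Condensed from the new mount path above (error handling omitted; the comment states the assumed ownership rule):

	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
	if (sd.sbi)
		/* a matched super block holds its own reference */
		nilfs_put_sbinfo(sd.sbi);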
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3be4aa9..8b8889825716 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -32,9 +32,12 @@
32#include "cpfile.h" 32#include "cpfile.h"
33#include "sufile.h" 33#include "sufile.h"
34#include "dat.h" 34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h" 35#include "segbuf.h"
37 36
37
38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock);
40
38void nilfs_set_last_segment(struct the_nilfs *nilfs, 41void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno) 42 sector_t start_blocknr, u64 seq, __u64 cno)
40{ 43{
@@ -55,7 +58,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
55 * Return Value: On success, pointer to the_nilfs is returned. 58 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned. 59 * On error, NULL is returned.
57 */ 60 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev) 61static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{ 62{
60 struct the_nilfs *nilfs; 63 struct the_nilfs *nilfs;
61 64
@@ -68,7 +71,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 atomic_set(&nilfs->ns_writer_refcount, -1); 71 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0); 72 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem); 73 init_rwsem(&nilfs->ns_sem);
74 init_rwsem(&nilfs->ns_super_sem);
75 mutex_init(&nilfs->ns_mount_mutex);
71 mutex_init(&nilfs->ns_writer_mutex); 76 mutex_init(&nilfs->ns_writer_mutex);
77 INIT_LIST_HEAD(&nilfs->ns_list);
72 INIT_LIST_HEAD(&nilfs->ns_supers); 78 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_gc_inodes_h = NULL;
@@ -78,6 +84,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
78} 84}
79 85
80/** 86/**
87 * find_or_create_nilfs - find or create nilfs object
88 * @bdev: block device to which the_nilfs is related
89 *
90 * find_or_create_nilfs() looks up an existing nilfs object created on the
91 * device and takes a reference to the object. If no nilfs object
92 * is found on the device, a new nilfs object is allocated.
93 *
94 * Return Value: On success, pointer to the nilfs object is returned.
95 * On error, NULL is returned.
96 */
97struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
98{
99 struct the_nilfs *nilfs, *new = NULL;
100
101 retry:
102 spin_lock(&nilfs_lock);
103 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
104 if (nilfs->ns_bdev == bdev) {
105 get_nilfs(nilfs);
106 spin_unlock(&nilfs_lock);
107 if (new)
108 put_nilfs(new);
109 return nilfs; /* existing object */
110 }
111 }
112 if (new) {
113 list_add_tail(&new->ns_list, &nilfs_objects);
114 spin_unlock(&nilfs_lock);
115 return new; /* new object */
116 }
117 spin_unlock(&nilfs_lock);
118
119 new = alloc_nilfs(bdev);
120 if (new)
121 goto retry;
122 return NULL; /* insufficient memory */
123}
124
125/**
81 * put_nilfs - release a reference to the_nilfs 126 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released 127 * @nilfs: the_nilfs structure to be released
83 * 128 *
@@ -86,13 +131,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
86 */ 131 */
87void put_nilfs(struct the_nilfs *nilfs) 132void put_nilfs(struct the_nilfs *nilfs)
88{ 133{
89 if (!atomic_dec_and_test(&nilfs->ns_count)) 134 spin_lock(&nilfs_lock);
135 if (!atomic_dec_and_test(&nilfs->ns_count)) {
136 spin_unlock(&nilfs_lock);
90 return; 137 return;
138 }
139 list_del_init(&nilfs->ns_list);
140 spin_unlock(&nilfs_lock);
141
91 /* 142 /*
92 * Increment of ns_count never occur below because the caller 143 * Increment of ns_count never occurs below because the caller
93 * of get_nilfs() holds at least one reference to the_nilfs. 144 * of get_nilfs() holds at least one reference to the_nilfs.
94 * Thus its exclusion control is not required here. 145 * Thus its exclusion control is not required here.
95 */ 146 */
147
96 might_sleep(); 148 might_sleep();
97 if (nilfs_loaded(nilfs)) { 149 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile); 150 nilfs_mdt_clear(nilfs->ns_sufile);
@@ -515,7 +567,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
515 567
516 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); 568 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
517 if (sb->s_blocksize != blocksize) { 569 if (sb->s_blocksize != blocksize) {
518 int hw_blocksize = bdev_hardsect_size(sb->s_bdev); 570 int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
519 571
520 if (blocksize < hw_blocksize) { 572 if (blocksize < hw_blocksize) {
521 printk(KERN_ERR 573 printk(KERN_ERR
@@ -613,13 +665,63 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
613 return ret; 665 return ret;
614} 666}
615 667
668/**
669 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
670 * @nilfs: nilfs object
671 * @rw_mount: mount type (non-zero value for read/write mount)
672 * @cno: checkpoint number (zero for read-only mount)
673 *
674 * nilfs_find_sbinfo() returns the nilfs_sb_info structure matching
675 * @rw_mount and, in the case of snapshots, @cno. If no instance is
676 * found, NULL is returned. Although the super block instance can
677 * be unmounted after this function returns, the nilfs_sb_info struct
678 * is kept in memory until nilfs_put_sbinfo() is called.
679 */
680struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
681 int rw_mount, __u64 cno)
682{
683 struct nilfs_sb_info *sbi;
684
685 down_read(&nilfs->ns_super_sem);
686 /*
687 * The SNAPSHOT flag and sb->s_flags are supposed to be
688 * protected with nilfs->ns_super_sem.
689 */
690 sbi = nilfs->ns_current;
691 if (rw_mount) {
692 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
693 goto found; /* read/write mount */
694 else
695 goto out;
696 } else if (cno == 0) {
697 if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
698 goto found; /* read-only mount */
699 else
700 goto out;
701 }
702
703 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
704 if (nilfs_test_opt(sbi, SNAPSHOT) &&
705 sbi->s_snapshot_cno == cno)
706 goto found; /* snapshot mount */
707 }
708 out:
709 up_read(&nilfs->ns_super_sem);
710 return NULL;
711
712 found:
713 atomic_inc(&sbi->s_count);
714 up_read(&nilfs->ns_super_sem);
715 return sbi;
716}
717
616int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 718int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
617 int snapshot_mount) 719 int snapshot_mount)
618{ 720{
619 struct nilfs_sb_info *sbi; 721 struct nilfs_sb_info *sbi;
620 int ret = 0; 722 int ret = 0;
621 723
622 down_read(&nilfs->ns_sem); 724 down_read(&nilfs->ns_super_sem);
623 if (cno == 0 || cno > nilfs->ns_cno) 725 if (cno == 0 || cno > nilfs->ns_cno)
624 goto out_unlock; 726 goto out_unlock;
625 727
@@ -636,6 +738,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
636 ret++; 738 ret++;
637 739
638 out_unlock: 740 out_unlock:
639 up_read(&nilfs->ns_sem); 741 up_read(&nilfs->ns_super_sem);
640 return ret; 742 return ret;
641} 743}
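
find_or_create_nilfs() above follows the standard optimistic-allocation idiom: look up under the spinlock, and on a miss drop the lock, allocate (which may sleep), then retry so that a racing creator wins and the loser frees its copy. The same shape reduced to a skeleton (all names here are generic placeholders, not kernel API):

	struct obj *lookup_or_create(struct key *key)
	{
		struct obj *found, *new = NULL;
	retry:
		spin_lock(&table_lock);
		found = __lookup_locked(key);	/* placeholder lookup */
		if (!found && new) {
			__insert_locked(new);	/* publish under the lock */
			found = new;
			new = NULL;
		}
		spin_unlock(&table_lock);
		if (found) {
			if (new)
				free_obj(new);	/* lost the race */
			return found;
		}
		new = alloc_obj(key);		/* may sleep, hence unlocked */
		if (new)
			goto retry;
		return NULL;			/* out of memory */
	}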
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 30fe58778d05..1b9caafb8662 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,12 +43,16 @@ enum {
43 * struct the_nilfs - struct to supervise multiple nilfs mount points 43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags 44 * @ns_flags: flags
45 * @ns_count: reference count 45 * @ns_count: reference count
46 * @ns_list: list head for nilfs_list
46 * @ns_bdev: block device 47 * @ns_bdev: block device
47 * @ns_bdi: backing dev info 48 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info 49 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states 50 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer 54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount
52 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks 58 * @ns_sbwtime: previous write time of super blocks
@@ -88,15 +92,24 @@ enum {
88struct the_nilfs { 92struct the_nilfs {
89 unsigned long ns_flags; 93 unsigned long ns_flags;
90 atomic_t ns_count; 94 atomic_t ns_count;
95 struct list_head ns_list;
91 96
92 struct block_device *ns_bdev; 97 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi; 98 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer; 99 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex;
96 struct mutex ns_writer_mutex; 103 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount; 104 atomic_t ns_writer_refcount;
98 105
99 /* 106 /*
107 * components protected by ns_super_sem
108 */
109 struct nilfs_sb_info *ns_current;
110 struct list_head ns_supers;
111
112 /*
100 * used for 113 * used for
101 * - loading the latest checkpoint exclusively. 114 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment. 115 * - allocating a new full segment.
@@ -108,7 +121,6 @@ struct the_nilfs {
108 time_t ns_sbwtime[2]; 121 time_t ns_sbwtime[2];
109 unsigned ns_sbsize; 122 unsigned ns_sbsize;
110 unsigned ns_mount_state; 123 unsigned ns_mount_state;
111 struct list_head ns_supers;
112 124
113 /* 125 /*
114 * Following fields are dedicated to a writable FS-instance. 126 * Following fields are dedicated to a writable FS-instance.
@@ -191,11 +203,12 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192 204
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *); 206struct the_nilfs *find_or_create_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *); 207void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 208int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 209int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 210int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
211struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 212int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *); 213int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *); 214void nilfs_fall_back_super_block(struct the_nilfs *);
@@ -238,6 +251,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
238 mutex_unlock(&nilfs->ns_writer_mutex); 251 mutex_unlock(&nilfs->ns_writer_mutex);
239} 252}
240 253
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
255{
256 if (atomic_dec_and_test(&sbi->s_count))
257 kfree(sbi);
258}
259
241static inline void 260static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, 261nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end) 262 sector_t *seg_start, sector_t *seg_end)
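
nilfs_put_sbinfo() pairs with the atomic_set(&sbi->s_count, 1) in nilfs_fill_super() and the atomic_inc() in nilfs_find_sbinfo(), allowing a nilfs_sb_info to outlive its super_block. The intended discipline, sketched (variable setup is illustrative):

	struct nilfs_sb_info *sbi;

	sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), cno);
	if (sbi) {
		/* safe to use sbi here even if the mount goes away */
		nilfs_put_sbinfo(sbi);	/* kfree()d once s_count drops to 0 */
	}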
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 9b0efdad8910..477d37d83b31 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/kmod.h> 16#include <linux/kmod.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <asm/byteorder.h>
18 19
19static struct nls_table default_table; 20static struct nls_table default_table;
20static struct nls_table *tables = &default_table; 21static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
43 {0, /* end of table */} 44 {0, /* end of table */}
44}; 45};
45 46
46int 47#define UNICODE_MAX 0x0010ffff
47utf8_mbtowc(wchar_t *p, const __u8 *s, int n) 48#define PLANE_SIZE 0x00010000
49
50#define SURROGATE_MASK 0xfffff800
51#define SURROGATE_PAIR 0x0000d800
52#define SURROGATE_LOW 0x00000400
53#define SURROGATE_BITS 0x000003ff
54
55int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
48{ 56{
49 long l; 57 unsigned long l;
50 int c0, c, nc; 58 int c0, c, nc;
51 const struct utf8_table *t; 59 const struct utf8_table *t;
52 60
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
57 nc++; 65 nc++;
58 if ((c0 & t->cmask) == t->cval) { 66 if ((c0 & t->cmask) == t->cval) {
59 l &= t->lmask; 67 l &= t->lmask;
60 if (l < t->lval) 68 if (l < t->lval || l > UNICODE_MAX ||
69 (l & SURROGATE_MASK) == SURROGATE_PAIR)
61 return -1; 70 return -1;
62 *p = l; 71 *pu = (unicode_t) l;
63 return nc; 72 return nc;
64 } 73 }
65 if (n <= nc) 74 if (len <= nc)
66 return -1; 75 return -1;
67 s++; 76 s++;
68 c = (*s ^ 0x80) & 0xFF; 77 c = (*s ^ 0x80) & 0xFF;
@@ -72,90 +81,133 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
72 } 81 }
73 return -1; 82 return -1;
74} 83}
84EXPORT_SYMBOL(utf8_to_utf32);
75 85
76int 86int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
77utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
78{ 87{
79 __u16 *op; 88 unsigned long l;
80 const __u8 *ip;
81 int size;
82
83 op = pwcs;
84 ip = s;
85 while (*ip && n > 0) {
86 if (*ip & 0x80) {
87 size = utf8_mbtowc(op, ip, n);
88 if (size == -1) {
89 /* Ignore character and move on */
90 ip++;
91 n--;
92 } else {
93 op++;
94 ip += size;
95 n -= size;
96 }
97 } else {
98 *op++ = *ip++;
99 n--;
100 }
101 }
102 return (op - pwcs);
103}
104
105int
106utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
107{
108 long l;
109 int c, nc; 89 int c, nc;
110 const struct utf8_table *t; 90 const struct utf8_table *t;
111 91
112 if (!s) 92 if (!s)
113 return 0; 93 return 0;
114 94
115 l = wc; 95 l = u;
96 if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
97 return -1;
98
116 nc = 0; 99 nc = 0;
117 for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { 100 for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
118 nc++; 101 nc++;
119 if (l <= t->lmask) { 102 if (l <= t->lmask) {
120 c = t->shift; 103 c = t->shift;
121 *s = t->cval | (l >> c); 104 *s = (u8) (t->cval | (l >> c));
122 while (c > 0) { 105 while (c > 0) {
123 c -= 6; 106 c -= 6;
124 s++; 107 s++;
125 *s = 0x80 | ((l >> c) & 0x3F); 108 *s = (u8) (0x80 | ((l >> c) & 0x3F));
126 } 109 }
127 return nc; 110 return nc;
128 } 111 }
129 } 112 }
130 return -1; 113 return -1;
131} 114}
115EXPORT_SYMBOL(utf32_to_utf8);
132 116
133int 117int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
134utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
135{ 118{
136 const __u16 *ip; 119 u16 *op;
137 __u8 *op;
138 int size; 120 int size;
121 unicode_t u;
122
123 op = pwcs;
124 while (*s && len > 0) {
125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) {
128 /* Ignore character and move on */
129 size = 1;
130 } else if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS));
134 *op++ = (wchar_t) (SURROGATE_PAIR |
135 SURROGATE_LOW |
136 (u & SURROGATE_BITS));
137 } else {
138 *op++ = (wchar_t) u;
139 }
140 s += size;
141 len -= size;
142 } else {
143 *op++ = *s++;
144 len--;
145 }
146 }
147 return op - pwcs;
148}
149EXPORT_SYMBOL(utf8s_to_utf16s);
150
151static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
152{
153 switch (endian) {
154 default:
155 return c;
156 case UTF16_LITTLE_ENDIAN:
157 return __le16_to_cpu(c);
158 case UTF16_BIG_ENDIAN:
159 return __be16_to_cpu(c);
160 }
161}
162
163int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
164 u8 *s, int maxlen)
165{
166 u8 *op;
167 int size;
168 unsigned long u, v;
139 169
140 op = s; 170 op = s;
141 ip = pwcs; 171 while (len > 0 && maxlen > 0) {
142 while (*ip && maxlen > 0) { 172 u = get_utf16(*pwcs, endian);
143 if (*ip > 0x7f) { 173 if (!u)
144 size = utf8_wctomb(op, *ip, maxlen); 174 break;
175 pwcs++;
176 len--;
177 if (u > 0x7f) {
178 if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
179 if (u & SURROGATE_LOW) {
180 /* Ignore character and move on */
181 continue;
182 }
183 if (len <= 0)
184 break;
185 v = get_utf16(*pwcs, endian);
186 if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
187 !(v & SURROGATE_LOW)) {
188 /* Ignore character and move on */
189 continue;
190 }
191 u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
192 + (v & SURROGATE_BITS);
193 pwcs++;
194 len--;
195 }
196 size = utf32_to_utf8(u, op, maxlen);
145 if (size == -1) { 197 if (size == -1) {
146 /* Ignore character and move on */ 198 /* Ignore character and move on */
147 maxlen--;
148 } else { 199 } else {
149 op += size; 200 op += size;
150 maxlen -= size; 201 maxlen -= size;
151 } 202 }
152 } else { 203 } else {
153 *op++ = (__u8) *ip; 204 *op++ = (u8) u;
205 maxlen--;
154 } 206 }
155 ip++;
156 } 207 }
157 return (op - s); 208 return op - s;
158} 209}
210EXPORT_SYMBOL(utf16s_to_utf8s);
159 211
160int register_nls(struct nls_table * nls) 212int register_nls(struct nls_table * nls)
161{ 213{
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
467EXPORT_SYMBOL(unload_nls); 519EXPORT_SYMBOL(unload_nls);
468EXPORT_SYMBOL(load_nls); 520EXPORT_SYMBOL(load_nls);
469EXPORT_SYMBOL(load_nls_default); 521EXPORT_SYMBOL(load_nls_default);
470EXPORT_SYMBOL(utf8_mbtowc);
471EXPORT_SYMBOL(utf8_mbstowcs);
472EXPORT_SYMBOL(utf8_wctomb);
473EXPORT_SYMBOL(utf8_wcstombs);
474 522
475MODULE_LICENSE("Dual BSD/GPL"); 523MODULE_LICENSE("Dual BSD/GPL");
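
As a concrete check of the surrogate arithmetic above: U+1F600 is F0 9F 98 80 in UTF-8; subtracting PLANE_SIZE leaves 0xF600, whose high ten bits (0x3D) give the lead surrogate 0xD800 | 0x3D = 0xD83D and whose low ten bits (0x200) give the trail surrogate 0xDC00 | 0x200 = 0xDE00. A small sketch exercising utf8s_to_utf16s() on that input (note this version takes no output-buffer bound, so the caller must size @pwcs for the worst case):

	static const u8 smiley[] = { 0xf0, 0x9f, 0x98, 0x80, 0x00 };
	wchar_t out[2];
	int n;

	n = utf8s_to_utf16s(smiley, 4, out);
	/* expect n == 2, out[0] == 0xd83d, out[1] == 0xde00 */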
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index aa2c42fdd977..0d60a44acacd 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
15{ 15{
16 int n; 16 int n;
17 17
18 if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { 18 if (boundlen <= 0)
19 return -ENAMETOOLONG;
20
21 n = utf32_to_utf8(uni, out, boundlen);
22 if (n < 0) {
19 *out = '?'; 23 *out = '?';
20 return -EINVAL; 24 return -EINVAL;
21 } 25 }
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
25static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) 29static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
26{ 30{
27 int n; 31 int n;
32 unicode_t u;
28 33
29 if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { 34 n = utf8_to_utf32(rawstring, boundlen, &u);
35 if (n < 0 || u > MAX_WCHAR_T) {
30 *uni = 0x003f; /* ? */ 36 *uni = 0x003f; /* ? */
31 n = -EINVAL; 37 return -EINVAL;
32 } 38 }
39 *uni = (wchar_t) u;
33 return n; 40 return n;
34} 41}
35 42
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 50914d7303c6..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,2 +1,5 @@
1config FSNOTIFY
2 def_bool n
3
1source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
2source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 5a95b6010ce7..0922cc826c46 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,2 +1,4 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o
2
1obj-y += dnotify/ 3obj-y += dnotify/
2obj-y += inotify/ 4obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 26adf5dfa646..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,5 +1,6 @@
1config DNOTIFY 1config DNOTIFY
2 bool "Dnotify support" 2 bool "Dnotify support"
3 select FSNOTIFY
3 default y 4 default y
4 help 5 help
5 Dnotify is a directory-based per-fd file change notification system 6 Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index b0aa2cde80bd..828a889be909 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Copyright (C) 2000,2001,2002 Stephen Rothwell 4 * Copyright (C) 2000,2001,2002 Stephen Rothwell
5 * 5 *
6 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
7 * dnotify was largely rewritten to use the new fsnotify infrastructure
8 *
6 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the 10 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any 11 * Free Software Foundation; either version 2, or (at your option) any
@@ -21,24 +24,173 @@
21#include <linux/spinlock.h> 24#include <linux/spinlock.h>
22#include <linux/slab.h> 25#include <linux/slab.h>
23#include <linux/fdtable.h> 26#include <linux/fdtable.h>
27#include <linux/fsnotify_backend.h>
24 28
25int dir_notify_enable __read_mostly = 1; 29int dir_notify_enable __read_mostly = 1;
26 30
27static struct kmem_cache *dn_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex);
35
36/*
37 * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
38 * is being watched by dnotify. If multiple userspace applications are watching
39 * the same directory with dnotify their information is chained in dn
40 */
41struct dnotify_mark_entry {
42 struct fsnotify_mark_entry fsn_entry;
43 struct dnotify_struct *dn;
44};
28 45
29static void redo_inode_mask(struct inode *inode) 46/*
47 * When a process starts or stops watching an inode the set of events which
48 * dnotify cares about for that inode may change. This function runs the
49 * list of everything receiving dnotify events about this directory and calculates
50 * the set of all those events. After it updates what dnotify is interested in
51 * it calls the fsnotify function so it can update the set of all events relevant
52 * to this inode.
53 */
54static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
30{ 55{
31 unsigned long new_mask; 56 __u32 new_mask, old_mask;
32 struct dnotify_struct *dn; 57 struct dnotify_struct *dn;
58 struct dnotify_mark_entry *dnentry = container_of(entry,
59 struct dnotify_mark_entry,
60 fsn_entry);
61
62 assert_spin_locked(&entry->lock);
33 63
64 old_mask = entry->mask;
34 new_mask = 0; 65 new_mask = 0;
35 for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) 66 for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
36 new_mask |= dn->dn_mask & ~DN_MULTISHOT; 67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
37 inode->i_dnotify_mask = new_mask; 68 entry->mask = new_mask;
69
70 if (old_mask == new_mask)
71 return;
72
73 if (entry->inode)
74 fsnotify_recalc_inode_mask(entry->inode);
75}
76
77/*
78 * Main fsnotify call where events are delivered to dnotify.
79 * Find the dnotify mark on the relevant inode, run the list of dnotify structs
80 * on that mark and determine which of them has expressed interest in receiving
81 * events of this type. When found, send the signal to the correct process and
82 * destroy the dnotify struct if it was not registered to receive multiple
83 * events.
84 */
85static int dnotify_handle_event(struct fsnotify_group *group,
86 struct fsnotify_event *event)
87{
88 struct fsnotify_mark_entry *entry = NULL;
89 struct dnotify_mark_entry *dnentry;
90 struct inode *to_tell;
91 struct dnotify_struct *dn;
92 struct dnotify_struct **prev;
93 struct fown_struct *fown;
94
95 to_tell = event->to_tell;
96
97 spin_lock(&to_tell->i_lock);
98 entry = fsnotify_find_mark_entry(group, to_tell);
99 spin_unlock(&to_tell->i_lock);
100
101 /* unlikely since we already passed dnotify_should_send_event() */
102 if (unlikely(!entry))
103 return 0;
104 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
105
106 spin_lock(&entry->lock);
107 prev = &dnentry->dn;
108 while ((dn = *prev) != NULL) {
109 if ((dn->dn_mask & event->mask) == 0) {
110 prev = &dn->dn_next;
111 continue;
112 }
113 fown = &dn->dn_filp->f_owner;
114 send_sigio(fown, dn->dn_fd, POLL_MSG);
115 if (dn->dn_mask & FS_DN_MULTISHOT)
116 prev = &dn->dn_next;
117 else {
118 *prev = dn->dn_next;
119 kmem_cache_free(dnotify_struct_cache, dn);
120 dnotify_recalc_inode_mask(entry);
121 }
122 }
123
124 spin_unlock(&entry->lock);
125 fsnotify_put_mark(entry);
126
127 return 0;
128}
129
130/*
131 * Given an inode and mask determine if dnotify would be interested in sending
132 * userspace notification for that pair.
133 */
134static bool dnotify_should_send_event(struct fsnotify_group *group,
135 struct inode *inode, __u32 mask)
136{
137 struct fsnotify_mark_entry *entry;
138 bool send;
139
140 /* !dir_notify_enable should never get here, don't waste time checking
141 if (!dir_notify_enable)
142 return 0; */
143
144 /* not a dir, dnotify doesn't care */
145 if (!S_ISDIR(inode->i_mode))
146 return false;
147
148 spin_lock(&inode->i_lock);
149 entry = fsnotify_find_mark_entry(group, inode);
150 spin_unlock(&inode->i_lock);
151
152 /* no mark means no dnotify watch */
153 if (!entry)
154 return false;
155
156 mask = (mask & ~FS_EVENT_ON_CHILD);
157 send = (mask & entry->mask);
158
159 fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
160
161 return send;
162}
163
164static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
165{
166 struct dnotify_mark_entry *dnentry = container_of(entry,
167 struct dnotify_mark_entry,
168 fsn_entry);
169
170 BUG_ON(dnentry->dn);
171
172 kmem_cache_free(dnotify_mark_entry_cache, dnentry);
38} 173}
39 174
175static struct fsnotify_ops dnotify_fsnotify_ops = {
176 .handle_event = dnotify_handle_event,
177 .should_send_event = dnotify_should_send_event,
178 .free_group_priv = NULL,
179 .freeing_mark = NULL,
180 .free_event_priv = NULL,
181};
182
183/*
184 * Called every time a file is closed. Looks first for a dnotify mark on the
185 * inode. If one is found run all of the ->dn entries attached to that
186 * mark for one relevant to this process closing the file and remove that
187 * dnotify_struct. If that was the last dnotify_struct also remove the
188 * fsnotify_mark_entry.
189 */
40void dnotify_flush(struct file *filp, fl_owner_t id) 190void dnotify_flush(struct file *filp, fl_owner_t id)
41{ 191{
192 struct fsnotify_mark_entry *entry;
193 struct dnotify_mark_entry *dnentry;
42 struct dnotify_struct *dn; 194 struct dnotify_struct *dn;
43 struct dnotify_struct **prev; 195 struct dnotify_struct **prev;
44 struct inode *inode; 196 struct inode *inode;
@@ -46,145 +198,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
46 inode = filp->f_path.dentry->d_inode; 198 inode = filp->f_path.dentry->d_inode;
47 if (!S_ISDIR(inode->i_mode)) 199 if (!S_ISDIR(inode->i_mode))
48 return; 200 return;
201
49 spin_lock(&inode->i_lock); 202 spin_lock(&inode->i_lock);
50 prev = &inode->i_dnotify; 203 entry = fsnotify_find_mark_entry(dnotify_group, inode);
204 spin_unlock(&inode->i_lock);
205 if (!entry)
206 return;
207 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
208
209 mutex_lock(&dnotify_mark_mutex);
210
211 spin_lock(&entry->lock);
212 prev = &dnentry->dn;
51 while ((dn = *prev) != NULL) { 213 while ((dn = *prev) != NULL) {
52 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { 214 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
53 *prev = dn->dn_next; 215 *prev = dn->dn_next;
54 redo_inode_mask(inode); 216 kmem_cache_free(dnotify_struct_cache, dn);
55 kmem_cache_free(dn_cache, dn); 217 dnotify_recalc_inode_mask(entry);
56 break; 218 break;
57 } 219 }
58 prev = &dn->dn_next; 220 prev = &dn->dn_next;
59 } 221 }
60 spin_unlock(&inode->i_lock); 222
223 spin_unlock(&entry->lock);
224
225 /* nothing else could have found us thanks to the dnotify_mark_mutex */
226 if (dnentry->dn == NULL)
227 fsnotify_destroy_mark_by_entry(entry);
228
229 fsnotify_recalc_group_mask(dnotify_group);
230
231 mutex_unlock(&dnotify_mark_mutex);
232
233 fsnotify_put_mark(entry);
234}
235
236/* this conversion is done only at watch creation */
237static __u32 convert_arg(unsigned long arg)
238{
239 __u32 new_mask = FS_EVENT_ON_CHILD;
240
241 if (arg & DN_MULTISHOT)
242 new_mask |= FS_DN_MULTISHOT;
243 if (arg & DN_DELETE)
244 new_mask |= (FS_DELETE | FS_MOVED_FROM);
245 if (arg & DN_MODIFY)
246 new_mask |= FS_MODIFY;
247 if (arg & DN_ACCESS)
248 new_mask |= FS_ACCESS;
249 if (arg & DN_ATTRIB)
250 new_mask |= FS_ATTRIB;
251 if (arg & DN_RENAME)
252 new_mask |= FS_DN_RENAME;
253 if (arg & DN_CREATE)
254 new_mask |= (FS_CREATE | FS_MOVED_TO);
255
256 return new_mask;
61} 257}
62 258
259/*
260 * If multiple processes watch the same inode with dnotify there is only one
261 * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
262 * onto that mark. This function either attaches the new dnotify_struct onto
263 * that list, or it |= the mask onto an existing dnotify_struct.
264 */
265static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
266 fl_owner_t id, int fd, struct file *filp, __u32 mask)
267{
268 struct dnotify_struct *odn;
269
270 odn = dnentry->dn;
271 while (odn != NULL) {
272 /* adding more events to an existing dnotify_struct? */
273 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
274 odn->dn_fd = fd;
275 odn->dn_mask |= mask;
276 return -EEXIST;
277 }
278 odn = odn->dn_next;
279 }
280
281 dn->dn_mask = mask;
282 dn->dn_fd = fd;
283 dn->dn_filp = filp;
284 dn->dn_owner = id;
285 dn->dn_next = dnentry->dn;
286 dnentry->dn = dn;
287
288 return 0;
289}
290
291/*
292 * When a process calls fcntl to attach a dnotify watch to a directory it ends
293 * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
294 * attached to the fsnotify_mark.
295 */
63int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) 296int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
64{ 297{
298 struct dnotify_mark_entry *new_dnentry, *dnentry;
299 struct fsnotify_mark_entry *new_entry, *entry;
65 struct dnotify_struct *dn; 300 struct dnotify_struct *dn;
66 struct dnotify_struct *odn;
67 struct dnotify_struct **prev;
68 struct inode *inode; 301 struct inode *inode;
69 fl_owner_t id = current->files; 302 fl_owner_t id = current->files;
70 struct file *f; 303 struct file *f;
71 int error = 0; 304 int destroy = 0, error = 0;
305 __u32 mask;
306
307 /* we use these to tell if we need to kfree */
308 new_entry = NULL;
309 dn = NULL;
310
311 if (!dir_notify_enable) {
312 error = -EINVAL;
313 goto out_err;
314 }
72 315
316 /* a 0 mask means we are explicitly removing the watch */
73 if ((arg & ~DN_MULTISHOT) == 0) { 317 if ((arg & ~DN_MULTISHOT) == 0) {
74 dnotify_flush(filp, id); 318 dnotify_flush(filp, id);
75 return 0; 319 error = 0;
320 goto out_err;
76 } 321 }
77 if (!dir_notify_enable) 322
78 return -EINVAL; 323 /* dnotify only works on directories */
79 inode = filp->f_path.dentry->d_inode; 324 inode = filp->f_path.dentry->d_inode;
80 if (!S_ISDIR(inode->i_mode)) 325 if (!S_ISDIR(inode->i_mode)) {
81 return -ENOTDIR; 326 error = -ENOTDIR;
82 dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); 327 goto out_err;
83 if (dn == NULL)
84 return -ENOMEM;
85 spin_lock(&inode->i_lock);
86 prev = &inode->i_dnotify;
87 while ((odn = *prev) != NULL) {
88 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
89 odn->dn_fd = fd;
90 odn->dn_mask |= arg;
91 inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
92 goto out_free;
93 }
94 prev = &odn->dn_next;
95 } 328 }
96 329
97 rcu_read_lock(); 330 /* expect most fcntl to add new rather than augment old */
98 f = fcheck(fd); 331 dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
99 rcu_read_unlock(); 332 if (!dn) {
100 /* we'd lost the race with close(), sod off silently */ 333 error = -ENOMEM;
101 /* note that inode->i_lock prevents reordering problems 334 goto out_err;
102 * between accesses to descriptor table and ->i_dnotify */ 335 }
103 if (f != filp)
104 goto out_free;
105 336
106 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 337 /* new fsnotify mark, we expect most fcntl calls to add a new mark */
107 if (error) 338 new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
108 goto out_free; 339 if (!new_dnentry) {
340 error = -ENOMEM;
341 goto out_err;
342 }
109 343
110 dn->dn_mask = arg; 344 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
111 dn->dn_fd = fd; 345 mask = convert_arg(arg);
112 dn->dn_filp = filp;
113 dn->dn_owner = id;
114 inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
115 dn->dn_next = inode->i_dnotify;
116 inode->i_dnotify = dn;
117 spin_unlock(&inode->i_lock);
118 return 0;
119 346
120out_free: 347 /* set up the new_entry and new_dnentry */
121 spin_unlock(&inode->i_lock); 348 new_entry = &new_dnentry->fsn_entry;
122 kmem_cache_free(dn_cache, dn); 349 fsnotify_init_mark(new_entry, dnotify_free_mark);
123 return error; 350 new_entry->mask = mask;
124} 351 new_dnentry->dn = NULL;
125 352
126void __inode_dir_notify(struct inode *inode, unsigned long event) 353 /* this is needed to prevent the fcntl/close race described below */
127{ 354 mutex_lock(&dnotify_mark_mutex);
128 struct dnotify_struct * dn;
129 struct dnotify_struct **prev;
130 struct fown_struct * fown;
131 int changed = 0;
132 355
356 /* add the new_entry or find an old one. */
133 spin_lock(&inode->i_lock); 357 spin_lock(&inode->i_lock);
134 prev = &inode->i_dnotify; 358 entry = fsnotify_find_mark_entry(dnotify_group, inode);
135 while ((dn = *prev) != NULL) {
136 if ((dn->dn_mask & event) == 0) {
137 prev = &dn->dn_next;
138 continue;
139 }
140 fown = &dn->dn_filp->f_owner;
141 send_sigio(fown, dn->dn_fd, POLL_MSG);
142 if (dn->dn_mask & DN_MULTISHOT)
143 prev = &dn->dn_next;
144 else {
145 *prev = dn->dn_next;
146 changed = 1;
147 kmem_cache_free(dn_cache, dn);
148 }
149 }
150 if (changed)
151 redo_inode_mask(inode);
152 spin_unlock(&inode->i_lock); 359 spin_unlock(&inode->i_lock);
153} 360 if (entry) {
154 361 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
155EXPORT_SYMBOL(__inode_dir_notify); 362 spin_lock(&entry->lock);
363 } else {
364 fsnotify_add_mark(new_entry, dnotify_group, inode);
365 spin_lock(&new_entry->lock);
366 entry = new_entry;
367 dnentry = new_dnentry;
368 /* we used new_entry, so don't free it */
369 new_entry = NULL;
370 }
156 371
157/* 372 rcu_read_lock();
158 * This is hopelessly wrong, but unfixable without API changes. At 373 f = fcheck(fd);
159 * least it doesn't oops the kernel... 374 rcu_read_unlock();
160 *
161 * To safely access ->d_parent we need to keep d_move away from it. Use the
162 * dentry's d_lock for this.
163 */
164void dnotify_parent(struct dentry *dentry, unsigned long event)
165{
166 struct dentry *parent;
167 375
168 if (!dir_notify_enable) 376 /* if (f != filp) means that we lost a race and another task/thread
169 return; 377 * actually closed the fd we are still playing with before we grabbed
378 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the
379 * only time we clean up the mark entries, we need to get our mark off
380 * the list. */
381 if (f != filp) {
382 /* if we added ourselves, shoot ourselves, it's possible that
383 * the flush actually did shoot this entry. That's fine too
384 * since multiple calls to destroy_mark are perfectly safe, if
385 * we found a dnentry already attached to the inode, just sod
386 * off silently as the flush at close time dealt with it.
387 */
388 if (dnentry == new_dnentry)
389 destroy = 1;
390 goto out;
391 }
170 392
171 spin_lock(&dentry->d_lock); 393 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
172 parent = dentry->d_parent; 394 if (error) {
173 if (parent->d_inode->i_dnotify_mask & event) { 395 /* if we added, we must shoot */
174 dget(parent); 396 if (dnentry == new_dnentry)
175 spin_unlock(&dentry->d_lock); 397 destroy = 1;
176 __inode_dir_notify(parent->d_inode, event); 398 goto out;
177 dput(parent);
178 } else {
179 spin_unlock(&dentry->d_lock);
180 } 399 }
400
401 error = attach_dn(dn, dnentry, id, fd, filp, mask);
402 /* !error means that we attached the dn to the dnentry, so don't free it */
403 if (!error)
404 dn = NULL;
405 /* -EEXIST means that we didn't add this new dn and used an old one.
406 * that isn't an error (and the unused dn should be freed) */
407 else if (error == -EEXIST)
408 error = 0;
409
410 dnotify_recalc_inode_mask(entry);
411out:
412 spin_unlock(&entry->lock);
413
414 if (destroy)
415 fsnotify_destroy_mark_by_entry(entry);
416
417 fsnotify_recalc_group_mask(dnotify_group);
418
419 mutex_unlock(&dnotify_mark_mutex);
420 fsnotify_put_mark(entry);
421out_err:
422 if (new_entry)
423 fsnotify_put_mark(new_entry);
424 if (dn)
425 kmem_cache_free(dnotify_struct_cache, dn);
426 return error;
181} 427}
182EXPORT_SYMBOL_GPL(dnotify_parent);
183 428
184static int __init dnotify_init(void) 429static int __init dnotify_init(void)
185{ 430{
186 dn_cache = kmem_cache_create("dnotify_cache", 431 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
187 sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); 432 dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
433
434 dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
435 0, &dnotify_fsnotify_ops);
436 if (IS_ERR(dnotify_group))
437 panic("unable to allocate fsnotify group for dnotify\n");
188 return 0; 438 return 0;
189} 439}
190 440
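To see the path above end to end, here is a small userspace program that exercises fcntl_dirnotify() via the documented F_NOTIFY fcntl (see fcntl(2)); the DN_* flags are the same ones convert_arg() translates to FS_* bits:

#define _GNU_SOURCE		/* for F_NOTIFY and the DN_* flags */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t hits;

static void on_sigio(int sig)
{
	hits++;			/* async-signal-safe: just count */
}

int main(void)
{
	int fd = open(".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	signal(SIGIO, on_sigio);
	/* lands in fcntl_dirnotify(); DN_MULTISHOT keeps the watch alive */
	if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_MODIFY | DN_MULTISHOT) < 0) {
		perror("fcntl(F_NOTIFY)");
		return 1;
	}
	for (;;) {
		pause();	/* woken by the SIGIO sent from dnotify_handle_event() */
		printf("directory changed (%d events so far)\n", (int)hits);
	}
}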
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
new file mode 100644
index 000000000000..037e878e03fc
--- /dev/null
+++ b/fs/notify/fsnotify.c
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/dcache.h>
20#include <linux/fs.h>
21#include <linux/init.h>
22#include <linux/module.h>
23#include <linux/srcu.h>
24
25#include <linux/fsnotify_backend.h>
26#include "fsnotify.h"
27
28/*
29 * Clear all of the marks on an inode when it is being evicted from core
30 */
31void __fsnotify_inode_delete(struct inode *inode)
32{
33 fsnotify_clear_marks_by_inode(inode);
34}
35EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
36
37/*
38 * Given an inode, first check if we care what happens to our children. Inotify
39 * and dnotify both tell their parents about events. If we care about any event
40 * on a child we run all of our children and set a dentry flag saying that the
41 * parent cares. Thus when an event happens on a child it can quickly tell
42 * if there is a need to find a parent and send the event to the parent.
43 */
44void __fsnotify_update_child_dentry_flags(struct inode *inode)
45{
46 struct dentry *alias;
47 int watched;
48
49 if (!S_ISDIR(inode->i_mode))
50 return;
51
52 /* determine if the children should tell inode about their events */
53 watched = fsnotify_inode_watches_children(inode);
54
55 spin_lock(&dcache_lock);
56 /* run all of the dentries associated with this inode. Since this is a
57 * directory, there damn well better only be one item on this list */
58 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
59 struct dentry *child;
60
61 /* run all of the children of the original inode and fix their
62 * d_flags to indicate parental interest (their parent is the
63 * original inode) */
64 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
65 if (!child->d_inode)
66 continue;
67
68 spin_lock(&child->d_lock);
69 if (watched)
70 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
71 else
72 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
73 spin_unlock(&child->d_lock);
74 }
75 }
76 spin_unlock(&dcache_lock);
77}
78
79/* Notify this dentry's parent about a child's events. */
80void __fsnotify_parent(struct dentry *dentry, __u32 mask)
81{
82 struct dentry *parent;
83 struct inode *p_inode;
84 bool send = false;
85 bool should_update_children = false;
86
87 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
88 return;
89
90 spin_lock(&dentry->d_lock);
91 parent = dentry->d_parent;
92 p_inode = parent->d_inode;
93
94 if (fsnotify_inode_watches_children(p_inode)) {
95 if (p_inode->i_fsnotify_mask & mask) {
96 dget(parent);
97 send = true;
98 }
99 } else {
100 /*
101 * The parent doesn't care about events on its children but
102 * at least one child thought it did. We need to run all the
103 * children and update their d_flags to let them know p_inode
104 * doesn't care about them any more.
105 */
106 dget(parent);
107 should_update_children = true;
108 }
109
110 spin_unlock(&dentry->d_lock);
111
112 if (send) {
113 /* we are notifying a parent so come up with the new mask which
114 * specifies these are events which came from a child. */
115 mask |= FS_EVENT_ON_CHILD;
116
117 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
118 dentry->d_name.name, 0);
119 dput(parent);
120 }
121
122 if (unlikely(should_update_children)) {
123 __fsnotify_update_child_dentry_flags(p_inode);
124 dput(parent);
125 }
126}
127EXPORT_SYMBOL_GPL(__fsnotify_parent);
128
129/*
130 * This is the main call to fsnotify. The VFS calls into hook specific functions
131 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
132 * out to all of the registered fsnotify_group. Those groups can then use the
133 * notification event in whatever means they feel necessary.
134 */
135void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
136{
137 struct fsnotify_group *group;
138 struct fsnotify_event *event = NULL;
139 int idx;
140 /* global tests shouldn't care about events on a child, only the specific event */
141 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
142
143 if (list_empty(&fsnotify_groups))
144 return;
145
146 if (!(test_mask & fsnotify_mask))
147 return;
148
149 if (!(test_mask & to_tell->i_fsnotify_mask))
150 return;
151 /*
152 * SRCU!! the groups list is very very much read only and the path is
153 * very hot. The VAST majority of events are not going to need to do
154 * anything other than walk the list so it's crazy to pre-allocate.
155 */
156 idx = srcu_read_lock(&fsnotify_grp_srcu);
157 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
158 if (test_mask & group->mask) {
159 if (!group->ops->should_send_event(group, to_tell, mask))
160 continue;
161 if (!event) {
162 event = fsnotify_create_event(to_tell, mask, data,
163 data_is, file_name, cookie,
164 GFP_KERNEL);
165 /* shit, we OOM'd and now we can't tell, maybe
166 * someday someone else will want to do something
167 * here */
168 if (!event)
169 break;
170 }
171 group->ops->handle_event(group, event);
172 }
173 }
174 srcu_read_unlock(&fsnotify_grp_srcu, idx);
175 /*
176 * fsnotify_create_event() took a reference so the event can't be cleaned
177 * up while we are still trying to add it to lists, drop that one.
178 */
179 if (event)
180 fsnotify_put_event(event);
181}
182EXPORT_SYMBOL_GPL(fsnotify);
183
184static __init int fsnotify_init(void)
185{
186 return init_srcu_struct(&fsnotify_grp_srcu);
187}
188subsys_initcall(fsnotify_init);
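As a rough illustration of how the VFS reaches fsnotify(), a hook in include/linux/fsnotify.h boils down to something like the sketch below (hedged: simplified, and the legacy dnotify/inotify calls that the real header still carries are omitted):

/* simplified modify hook: first offer the event to a watching parent,
 * then to every group watching the inode itself */
static inline void fsnotify_modify_sketch(struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	__u32 mask = FS_MODIFY;

	__fsnotify_parent(dentry, mask);
	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
}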
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
new file mode 100644
index 000000000000..4dc240824b2d
--- /dev/null
+++ b/fs/notify/fsnotify.h
@@ -0,0 +1,34 @@
1#ifndef __FS_NOTIFY_FSNOTIFY_H_
2#define __FS_NOTIFY_FSNOTIFY_H_
3
4#include <linux/list.h>
5#include <linux/fsnotify.h>
6#include <linux/srcu.h>
7#include <linux/types.h>
8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group);
18
19/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21
22/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode);
24/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children.
27 */
28extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
29
30/* allocate and destroy an event holder to attach events to notification/access queues */
31extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
32extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
33
34#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
new file mode 100644
index 000000000000..0e1677144bc5
--- /dev/null
+++ b/fs/notify/group.c
@@ -0,0 +1,254 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/list.h>
20#include <linux/mutex.h>
21#include <linux/slab.h>
22#include <linux/srcu.h>
23#include <linux/rculist.h>
24#include <linux/wait.h>
25
26#include <linux/fsnotify_backend.h>
27#include "fsnotify.h"
28
29#include <asm/atomic.h>
30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes its set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/*
90 * Final freeing of a group
91 */
92void fsnotify_final_destroy_group(struct fsnotify_group *group)
93{
94 /* clear the notification queue of all events */
95 fsnotify_flush_notify(group);
96
97 if (group->ops->free_group_priv)
98 group->ops->free_group_priv(group);
99
100 kfree(group);
101}
102
103/*
104 * Trying to get rid of a group. We need to first get rid of any outstanding
105 * allocations and then free the group. Remember that fsnotify_clear_marks_by_group
106 * could miss marks that are being freed by inode and those marks could still
107 * hold a reference to this group (via group->num_marks). If we get into that
108 * situation, the fsnotify_final_destroy_group will get called when that final
109 * mark is freed.
110 */
111static void fsnotify_destroy_group(struct fsnotify_group *group)
112{
113 /* clear all inode mark entries for this group */
114 fsnotify_clear_marks_by_group(group);
115
116 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group);
119}
120
121/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through.
149 */
150void fsnotify_put_group(struct fsnotify_group *group)
151{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
153 return;
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197}
198
199/*
200 * Either finds an existing group which matches the group_num, mask, and ops or
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
205 const struct fsnotify_ops *ops)
206{
207 struct fsnotify_group *group, *tgroup;
208
209 /* very low use, simpler locking if we just always alloc */
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group)
212 return ERR_PTR(-ENOMEM);
213
214 atomic_set(&group->refcnt, 1);
215
216 group->on_group_list = 0;
217 group->group_num = group_num;
218 group->mask = mask;
219
220 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX;
225
226 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0);
228 INIT_LIST_HEAD(&group->mark_entries);
229
230 group->ops = ops;
231
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group;
254}
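A backend's use of this API mirrors dnotify_init() earlier in this series; the sketch below is hypothetical (the group number 42, my_fsnotify_ops, and the handlers it points at are made up) but follows the obtain/put pairing defined above:

/* assumed to exist elsewhere in the backend */
extern const struct fsnotify_ops my_fsnotify_ops;

static struct fsnotify_group *my_group;

static int __init my_backend_init(void)
{
	my_group = fsnotify_obtain_group(42, FS_CREATE | FS_DELETE,
					 &my_fsnotify_ops);
	if (IS_ERR(my_group))
		return PTR_ERR(my_group);
	return 0;
}

static void __exit my_backend_exit(void)
{
	/* drops the reference from fsnotify_obtain_group(); the group is
	 * evicted and freed once nothing else holds it */
	fsnotify_put_group(my_group);
}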
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
new file mode 100644
index 000000000000..c8a07c65482b
--- /dev/null
+++ b/fs/notify/inode_mark.c
@@ -0,0 +1,426 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriate locks, can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list. At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/module.h>
89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */
93
94#include <asm/atomic.h>
95
96#include <linux/fsnotify_backend.h>
97#include "fsnotify.h"
98
99void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
100{
101 atomic_inc(&entry->refcnt);
102}
103
104void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
105{
106 if (atomic_dec_and_test(&entry->refcnt))
107 entry->free_mark(entry);
108}
109
110/*
111 * Recalculate the mask of events relevant to a given inode, with i_lock held.
112 */
113static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
114{
115 struct fsnotify_mark_entry *entry;
116 struct hlist_node *pos;
117 __u32 new_mask = 0;
118
119 assert_spin_locked(&inode->i_lock);
120
121 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
122 new_mask |= entry->mask;
123 inode->i_fsnotify_mask = new_mask;
124}
125
126/*
127 * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
128 * any notifier is interested in hearing for this inode.
129 */
130void fsnotify_recalc_inode_mask(struct inode *inode)
131{
132 spin_lock(&inode->i_lock);
133 fsnotify_recalc_inode_mask_locked(inode);
134 spin_unlock(&inode->i_lock);
135
136 __fsnotify_update_child_dentry_flags(inode);
137}
138
139/*
140 * Any time a mark is getting freed we end up here.
141 * The caller had better be holding a reference to this mark so we don't actually
142 * do the final put under the entry->lock
143 */
144void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
145{
146 struct fsnotify_group *group;
147 struct inode *inode;
148
149 spin_lock(&entry->lock);
150
151 group = entry->group;
152 inode = entry->inode;
153
154 BUG_ON(group && !inode);
155 BUG_ON(!group && inode);
156
157 /* if !group something else already marked this to die */
158 if (!group) {
159 spin_unlock(&entry->lock);
160 return;
161 }
162
163 /* 1 from caller and 1 for being on i_list/g_list */
164 BUG_ON(atomic_read(&entry->refcnt) < 2);
165
166 spin_lock(&group->mark_lock);
167 spin_lock(&inode->i_lock);
168
169 hlist_del_init(&entry->i_list);
170 entry->inode = NULL;
171
172 list_del_init(&entry->g_list);
173 entry->group = NULL;
174
175 fsnotify_put_mark(entry); /* for i_list and g_list */
176
177 /*
178 * this mark is now off the inode->i_fsnotify_mark_entries list and we
179 * hold the inode->i_lock, so this is the perfect time to update the
180 * inode->i_fsnotify_mask
181 */
182 fsnotify_recalc_inode_mask_locked(inode);
183
184 spin_unlock(&inode->i_lock);
185 spin_unlock(&group->mark_lock);
186 spin_unlock(&entry->lock);
187
188 /*
189 * Some groups like to know that marks are being freed. This is a
190 * callback to the group function to let it know that this entry
191 * is being freed.
192 */
193 if (group->ops->freeing_mark)
194 group->ops->freeing_mark(entry, group);
195
196 /*
197 * __fsnotify_update_child_dentry_flags(inode);
198 *
199 * I really want to call that, but we can't, we have no idea if the inode
200 * still exists the second we drop the entry->lock.
201 *
202 * The next time an event arrives at this inode from one of its children
203 * __fsnotify_parent will see that the inode doesn't care about its
204 * children and will update all of these flags then. So really this
205 * is just a lazy update (and could be a perf win...)
206 */
207
208
209 iput(inode);
210
211 /*
212 * it's possible that this group tried to destroy itself, but this
213 * mark was simultaneously being freed by the inode. If that's the
214 * case, we finish freeing the group here.
215 */
216 if (unlikely(atomic_dec_and_test(&group->num_marks)))
217 fsnotify_final_destroy_group(group);
218}
219
220/*
221 * Given a group, destroy all of the marks associated with that group.
222 */
223void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
224{
225 struct fsnotify_mark_entry *lentry, *entry;
226 LIST_HEAD(free_list);
227
228 spin_lock(&group->mark_lock);
229 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
230 list_add(&entry->free_g_list, &free_list);
231 list_del_init(&entry->g_list);
232 fsnotify_get_mark(entry);
233 }
234 spin_unlock(&group->mark_lock);
235
236 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
237 fsnotify_destroy_mark_by_entry(entry);
238 fsnotify_put_mark(entry);
239 }
240}
241
242/*
243 * Given an inode, destroy all of the marks associated with that inode.
244 */
245void fsnotify_clear_marks_by_inode(struct inode *inode)
246{
247 struct fsnotify_mark_entry *entry, *lentry;
248 struct hlist_node *pos, *n;
249 LIST_HEAD(free_list);
250
251 spin_lock(&inode->i_lock);
252 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
253 list_add(&entry->free_i_list, &free_list);
254 hlist_del_init(&entry->i_list);
255 fsnotify_get_mark(entry);
256 }
257 spin_unlock(&inode->i_lock);
258
259 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
260 fsnotify_destroy_mark_by_entry(entry);
261 fsnotify_put_mark(entry);
262 }
263}
264
265/*
266 * given a group and inode, find the mark associated with that combination.
267 * if found take a reference to that mark and return it, else return NULL
268 */
269struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
270 struct inode *inode)
271{
272 struct fsnotify_mark_entry *entry;
273 struct hlist_node *pos;
274
275 assert_spin_locked(&inode->i_lock);
276
277 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
278 if (entry->group == group) {
279 fsnotify_get_mark(entry);
280 return entry;
281 }
282 }
283 return NULL;
284}
285
286/*
287 * Nothing fancy, just initialize lists and locks and counters.
288 */
289void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
290 void (*free_mark)(struct fsnotify_mark_entry *entry))
291
292{
293 spin_lock_init(&entry->lock);
294 atomic_set(&entry->refcnt, 1);
295 INIT_HLIST_NODE(&entry->i_list);
296 entry->group = NULL;
297 entry->mask = 0;
298 entry->inode = NULL;
299 entry->free_mark = free_mark;
300}
301
302/*
303 * Attach an initialized mark entry to a given group and inode.
304 * These marks may be used for the fsnotify backend to determine which
305 * event types should be delivered to which group and for which inodes.
306 */
307int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
308 struct fsnotify_group *group, struct inode *inode)
309{
310 struct fsnotify_mark_entry *lentry;
311 int ret = 0;
312
313 inode = igrab(inode);
314 if (unlikely(!inode))
315 return -EINVAL;
316
317 /*
318 * LOCKING ORDER!!!!
319 * entry->lock
320 * group->mark_lock
321 * inode->i_lock
322 */
323 spin_lock(&entry->lock);
324 spin_lock(&group->mark_lock);
325 spin_lock(&inode->i_lock);
326
327 entry->group = group;
328 entry->inode = inode;
329
330 lentry = fsnotify_find_mark_entry(group, inode);
331 if (!lentry) {
332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
333 list_add(&entry->g_list, &group->mark_entries);
334
335 fsnotify_get_mark(entry); /* for i_list and g_list */
336
337 atomic_inc(&group->num_marks);
338
339 fsnotify_recalc_inode_mask_locked(inode);
340 }
341
342 spin_unlock(&inode->i_lock);
343 spin_unlock(&group->mark_lock);
344 spin_unlock(&entry->lock);
345
346 if (lentry) {
347 ret = -EEXIST;
348 iput(inode);
349 fsnotify_put_mark(lentry);
350 } else {
351 __fsnotify_update_child_dentry_flags(inode);
352 }
353
354 return ret;
355}
356
357/**
358 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
359 * @list: list of inodes being unmounted (sb->s_inodes)
360 *
361 * Called with inode_lock held, protecting the unmounting super block's list
362 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
363 * We temporarily drop inode_lock, however, and CAN block.
364 */
365void fsnotify_unmount_inodes(struct list_head *list)
366{
367 struct inode *inode, *next_i, *need_iput = NULL;
368
369 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
370 struct inode *need_iput_tmp;
371
372 /*
373 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
374 * I_WILL_FREE, or I_NEW which is fine because by that point
375 * the inode cannot have any associated watches.
376 */
377 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
378 continue;
379
380 /*
381 * If i_count is zero, the inode cannot have any watches and
382 * doing an __iget/iput with MS_ACTIVE clear would actually
383 * evict all inodes with zero i_count from icache which is
384 * unnecessarily violent and may in fact be illegal to do.
385 */
386 if (!atomic_read(&inode->i_count))
387 continue;
388
389 need_iput_tmp = need_iput;
390 need_iput = NULL;
391
392 /* In case fsnotify_inode_delete() drops a reference. */
393 if (inode != need_iput_tmp)
394 __iget(inode);
395 else
396 need_iput_tmp = NULL;
397
398 /* In case the dropping of a reference would nuke next_i. */
399 if ((&next_i->i_sb_list != list) &&
400 atomic_read(&next_i->i_count) &&
401 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
402 __iget(next_i);
403 need_iput = next_i;
404 }
405
406 /*
407 * We can safely drop inode_lock here because we hold
408 * references on both inode and next_i. Also no new inodes
409 * will be added since the umount has begun. Finally,
410 * iprune_mutex keeps shrink_icache_memory() away.
411 */
412 spin_unlock(&inode_lock);
413
414 if (need_iput_tmp)
415 iput(need_iput_tmp);
416
417 /* for each watch, send FS_UNMOUNT and then remove it */
418 fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
419
420 fsnotify_inode_delete(inode);
421
422 iput(inode);
423
424 spin_lock(&inode_lock);
425 }
426}
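Tying the mark helpers together, a backend's attach path looks roughly like the hedged sketch below; watch_inode() and my_free_mark() are hypothetical, but the init/add/put calls are the ones defined in this file:

static void my_free_mark(struct fsnotify_mark_entry *entry)
{
	kfree(entry);	/* a real backend frees its containing structure */
}

static int watch_inode(struct fsnotify_group *group, struct inode *inode)
{
	struct fsnotify_mark_entry *entry;
	int ret;

	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	fsnotify_init_mark(entry, my_free_mark);	/* refcnt starts at 1 */
	entry->mask = FS_MODIFY | FS_DELETE;

	/* takes the i_list/g_list references and an inode reference */
	ret = fsnotify_add_mark(entry, group, inode);
	if (ret)	/* -EEXIST: this group already has a mark here */
		fsnotify_put_mark(entry);	/* drops our ref, frees the mark */
	return ret;
}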
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 446792841023..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,26 +1,30 @@
1config INOTIFY 1config INOTIFY
2 bool "Inotify file change notification support" 2 bool "Inotify file change notification support"
3 default y 3 default n
4 ---help--- 4 ---help---
5 Say Y here to enable inotify support. Inotify is a file change 5 Say Y here to enable legacy in-kernel inotify support. Inotify is a
6 notification system and a replacement for dnotify. Inotify fixes 6 file change notification system. It is a replacement for dnotify.
7 numerous shortcomings in dnotify and introduces several new features 7 This option only provides the legacy inotify in-kernel API. There
8 including multiple file events, one-shot support, and unmount 8 are no in-tree kernel users of this interface since it is deprecated.
9 notification. 9 You only need this if you are loading an out-of-tree kernel module
10 that uses inotify.
10 11
11 For more information, see <file:Documentation/filesystems/inotify.txt> 12 For more information, see <file:Documentation/filesystems/inotify.txt>
12 13
13 If unsure, say Y. 14 If unsure, say N.
14 15
15config INOTIFY_USER 16config INOTIFY_USER
16 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
17 depends on INOTIFY 18 select FSNOTIFY
18 default y 19 default y
19 ---help--- 20 ---help---
20 Say Y here to enable inotify support for userspace, including the 21 Say Y here to enable inotify support for userspace, including the
21 associated system calls. Inotify allows monitoring of both files and 22 associated system calls. Inotify allows monitoring of both files and
22 directories via a single open fd. Events are read from the file 23 directories via a single open fd. Events are read from the file
23 descriptor, which is also select()- and poll()-able. 24 descriptor, which is also select()- and poll()-able.
25 Inotify fixes numerous shortcomings in dnotify and introduces several
26 new features including multiple file events, one-shot support, and
27 unmount notification.
24 28
25 For more information, see <file:Documentation/filesystems/inotify.txt> 29 For more information, see <file:Documentation/filesystems/inotify.txt>
26 30
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index e290f3bb9d8d..943828171362 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1,2 @@
1obj-$(CONFIG_INOTIFY) += inotify.o 1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_user.o 2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 220c13f0d73d..40b1cf914ccb 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -32,6 +32,7 @@
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/inotify.h> 34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
35 36
36static atomic_t inotify_cookie; 37static atomic_t inotify_cookie;
37 38
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch);
905 */ 906 */
906static int __init inotify_setup(void) 907static int __init inotify_setup(void)
907{ 908{
909 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
910 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
911 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
912 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
913 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
914 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
915 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
916 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
917 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
918 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
919 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
920 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
921 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
922
923 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
924 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
925 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
926 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
927
908 atomic_set(&inotify_cookie, 0); 928 atomic_set(&inotify_cookie, 0);
909 929
910 return 0; 930 return 0;
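The BUILD_BUG_ON() checks above cost nothing at runtime; they exist because inotify now passes IN_* bits straight through as FS_* bits. As a sketch of the mechanism (the real macro lives in linux/kernel.h), a true condition produces a negative array size and the compile fails:

/* roughly equivalent to the kernel's BUILD_BUG_ON() of this era */
#define MY_BUILD_BUG_ON(cond)	((void)sizeof(char[1 - 2 * !!(cond)]))

/* e.g. MY_BUILD_BUG_ON(IN_ACCESS != FS_ACCESS) compiles to nothing while
 * the values match, and breaks the build the moment they drift apart */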
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
new file mode 100644
index 000000000000..f234f3a4c8ca
--- /dev/null
+++ b/fs/notify/inotify/inotify.h
@@ -0,0 +1,22 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/inotify.h>
3#include <linux/slab.h> /* struct kmem_cache */
4
5extern struct kmem_cache *event_priv_cachep;
6
7struct inotify_event_private_data {
8 struct fsnotify_event_private_data fsnotify_event_priv_data;
9 int wd;
10};
11
12struct inotify_inode_mark_entry {
13 /* fsnotify_mark_entry MUST be the first thing */
14 struct fsnotify_mark_entry fsn_entry;
15 int wd;
16};
17
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
19 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21
22extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 000000000000..c9ee67b442e1
--- /dev/null
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,168 @@
1/*
2 * fs/notify/inotify/inotify_fsnotify.c - fsnotify support for inotify
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Copyright (C) 2005 John McCutchan
9 * Copyright 2006 Hewlett-Packard Development Company, L.P.
10 *
11 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
12 * inotify was largely rewritten to make use of the fsnotify infrastructure
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the
16 * Free Software Foundation; either version 2, or (at your option) any
17 * later version.
18 *
19 * This program is distributed in the hope that it will be useful, but
20 * WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 */
24
25#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h>
28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h>
31
32#include "inotify.h"
33
34static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
35{
36 struct fsnotify_mark_entry *entry;
37 struct inotify_inode_mark_entry *ientry;
38 struct inode *to_tell;
39 struct inotify_event_private_data *event_priv;
40 struct fsnotify_event_private_data *fsn_event_priv;
41 int wd, ret;
42
43 to_tell = event->to_tell;
44
45 spin_lock(&to_tell->i_lock);
46 entry = fsnotify_find_mark_entry(group, to_tell);
47 spin_unlock(&to_tell->i_lock);
48 /* race with watch removal? We already passed should_send */
49 if (unlikely(!entry))
50 return 0;
51 ientry = container_of(entry, struct inotify_inode_mark_entry,
52 fsn_entry);
53 wd = ientry->wd;
54
55 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
56 if (unlikely(!event_priv))
57 return -ENOMEM;
58
59 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
60
61 fsn_event_priv->group = group;
62 event_priv->wd = wd;
63
64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
65 if (ret) {
66 inotify_free_event_priv(fsn_event_priv);
67 /* EEXIST says we tail matched, EOVERFLOW isn't something
68 * to report up the stack. */
69 if ((ret == -EEXIST) ||
70 (ret == -EOVERFLOW))
71 ret = 0;
72 }
73
74 /*
75 * If we hold the entry until after the event is on the queue
76 * IN_IGNORED won't be able to overtake this event in the queue
77 */
78 fsnotify_put_mark(entry);
79
80 return ret;
81}
82
83static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
84{
85 inotify_ignored_and_remove_idr(entry, group);
86}
87
88static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
89{
90 struct fsnotify_mark_entry *entry;
91 bool send;
92
93 spin_lock(&inode->i_lock);
94 entry = fsnotify_find_mark_entry(group, inode);
95 spin_unlock(&inode->i_lock);
96 if (!entry)
97 return false;
98
99 mask = (mask & ~FS_EVENT_ON_CHILD);
100 send = (entry->mask & mask);
101
102 /* find took a reference */
103 fsnotify_put_mark(entry);
104
105 return send;
106}
107
108/*
109 * This is NEVER supposed to be called. Inotify marks should either have been
110 * removed from the idr when the watch was removed or in the
111 * fsnotify_destroy_mark_by_group() call when the inotify instance was being
112 * torn down. This is only called if the idr is about to be freed but there
113 * are still marks in it.
114 */
115static int idr_callback(int id, void *p, void *data)
116{
117 struct fsnotify_mark_entry *entry;
118 struct inotify_inode_mark_entry *ientry;
119 static bool warned = false;
120
121 if (warned)
122 return 0;
123
124 warned = true;
125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127
128 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
129 "idr. Probably leaking memory\n", id, p, data);
130
131 /*
132 * I'm taking the liberty of assuming that the mark in question is a
133 * valid address and I'm dereferencing it. This might help to figure
134 * out why we got here and the panic is no worse than the original
135 * BUG() that was here.
136 */
137 if (entry)
138 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
139 entry->group, entry->inode, ientry->wd);
140 return 0;
141}
142
143static void inotify_free_group_priv(struct fsnotify_group *group)
144{
145 /* ideally the idr is empty and we won't hit the BUG in the callback */
146 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr);
149}
150
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
152{
153 struct inotify_event_private_data *event_priv;
154
155
156 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
157 fsnotify_event_priv_data);
158
159 kmem_cache_free(event_priv_cachep, event_priv);
160}
161
162const struct fsnotify_ops inotify_fsnotify_ops = {
163 .handle_event = inotify_handle_event,
164 .should_send_event = inotify_should_send_event,
165 .free_group_priv = inotify_free_group_priv,
166 .free_event_priv = inotify_free_event_priv,
167 .freeing_mark = inotify_freeing_mark,
168};
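For orientation, the fsnotify core invokes this ops table when an event fires. A rough, hypothetical sketch of that dispatch (the real logic lives in fs/notify/fsnotify.c and additionally walks the group list and handles locking):

static void send_to_group(struct fsnotify_group *group,
			  struct inode *to_tell,
			  struct fsnotify_event *event)
{
	/* ask the backend (here: inotify) whether it cares at all */
	if (!group->ops->should_send_event(group, to_tell, event->mask))
		return;

	/* hand the event over; the backend queues it or reports an error */
	group->ops->handle_event(group, event);
}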
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1634319e2404..0e781bc88d1e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -8,6 +8,9 @@
8 * Copyright (C) 2005 John McCutchan 8 * Copyright (C) 2005 John McCutchan
9 * Copyright 2006 Hewlett-Packard Development Company, L.P. 9 * Copyright 2006 Hewlett-Packard Development Company, L.P.
10 * 10 *
11 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 12 * inotify was largely rewritten to make use of the fsnotify infrastructure
13 *
11 * This program is free software; you can redistribute it and/or modify it 14 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the 15 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2, or (at your option) any 16 * Free Software Foundation; either version 2, or (at your option) any
@@ -19,94 +22,44 @@
19 * General Public License for more details. 22 * General Public License for more details.
20 */ 23 */
21 24
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/fs.h>
26#include <linux/file.h> 25#include <linux/file.h>
27#include <linux/mount.h> 26#include <linux/fs.h> /* struct inode */
28#include <linux/namei.h> 27#include <linux/fsnotify_backend.h>
29#include <linux/poll.h> 28#include <linux/idr.h>
30#include <linux/init.h> 29#include <linux/init.h> /* module_init */
31#include <linux/list.h>
32#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */
33#include <linux/syscalls.h> 38#include <linux/syscalls.h>
34#include <linux/magic.h> 39#include <linux/types.h>
40#include <linux/uaccess.h>
41#include <linux/poll.h>
42#include <linux/wait.h>
35 43
36#include <asm/ioctls.h> 44#include "inotify.h"
37 45
38static struct kmem_cache *watch_cachep __read_mostly; 46#include <asm/ioctls.h>
39static struct kmem_cache *event_cachep __read_mostly;
40 47
41static struct vfsmount *inotify_mnt __read_mostly; 48static struct vfsmount *inotify_mnt __read_mostly;
42 49
43/* these are configurable via /proc/sys/fs/inotify/ */ 50/* these are configurable via /proc/sys/fs/inotify/ */
44static int inotify_max_user_instances __read_mostly; 51static int inotify_max_user_instances __read_mostly;
45static int inotify_max_user_watches __read_mostly;
46static int inotify_max_queued_events __read_mostly; 52static int inotify_max_queued_events __read_mostly;
53int inotify_max_user_watches __read_mostly;
47 54
48/* 55static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
49 * Lock ordering: 56struct kmem_cache *event_priv_cachep __read_mostly;
50 *
51 * inotify_dev->up_mutex (ensures we don't re-add the same watch)
52 * inode->inotify_mutex (protects inode's watch list)
53 * inotify_handle->mutex (protects inotify_handle's watch list)
54 * inotify_dev->ev_mutex (protects device's event queue)
55 */
56
57/*
58 * Lifetimes of the main data structures:
59 *
60 * inotify_device: Lifetime is managed by reference count, from
61 * sys_inotify_init() until release. Additional references can bump the count
62 * via get_inotify_dev() and drop the count via put_inotify_dev().
63 *
64 * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
65 * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
66 * first event, or to inotify_destroy().
67 */
68
69/*
70 * struct inotify_device - represents an inotify instance
71 *
72 * This structure is protected by the mutex 'mutex'.
73 */
74struct inotify_device {
75 wait_queue_head_t wq; /* wait queue for i/o */
76 struct mutex ev_mutex; /* protects event queue */
77 struct mutex up_mutex; /* synchronizes watch updates */
78 struct list_head events; /* list of queued events */
79 struct user_struct *user; /* user who opened this dev */
80 struct inotify_handle *ih; /* inotify handle */
81 struct fasync_struct *fa; /* async notification */
82 atomic_t count; /* reference count */
83 unsigned int queue_size; /* size of the queue (bytes) */
84 unsigned int event_count; /* number of pending events */
85 unsigned int max_events; /* maximum number of events */
86};
87
88/*
89 * struct inotify_kernel_event - An inotify event, originating from a watch and
90 * queued for user-space. A list of these is attached to each instance of the
91 * device. In read(), this list is walked and all events that can fit in the
92 * buffer are returned.
93 *
94 * Protected by dev->ev_mutex of the device in which we are queued.
95 */
96struct inotify_kernel_event {
97 struct inotify_event event; /* the user-space event */
98 struct list_head list; /* entry in inotify_device's list */
99 char *name; /* filename, if any */
100};
101 57
102/* 58/*
103 * struct inotify_user_watch - our version of an inotify_watch, we add 59 * When inotify registers a new group it increments this and uses that
104 * a reference to the associated inotify_device. 60 * value as an offset to set the fsnotify group "name" and priority.
105 */ 61 */
106struct inotify_user_watch { 62static atomic_t inotify_grp_num;
107 struct inotify_device *dev; /* associated device */
108 struct inotify_watch wdata; /* inotify watch data */
109};
110 63
111#ifdef CONFIG_SYSCTL 64#ifdef CONFIG_SYSCTL
112 65
@@ -149,280 +102,36 @@ ctl_table inotify_table[] = {
149}; 102};
150#endif /* CONFIG_SYSCTL */ 103#endif /* CONFIG_SYSCTL */
151 104
152static inline void get_inotify_dev(struct inotify_device *dev) 105static inline __u32 inotify_arg_to_mask(u32 arg)
153{
154 atomic_inc(&dev->count);
155}
156
157static inline void put_inotify_dev(struct inotify_device *dev)
158{
159 if (atomic_dec_and_test(&dev->count)) {
160 atomic_dec(&dev->user->inotify_devs);
161 free_uid(dev->user);
162 kfree(dev);
163 }
164}
165
166/*
167 * free_inotify_user_watch - cleans up the watch and its references
168 */
169static void free_inotify_user_watch(struct inotify_watch *w)
170{
171 struct inotify_user_watch *watch;
172 struct inotify_device *dev;
173
174 watch = container_of(w, struct inotify_user_watch, wdata);
175 dev = watch->dev;
176
177 atomic_dec(&dev->user->inotify_watches);
178 put_inotify_dev(dev);
179 kmem_cache_free(watch_cachep, watch);
180}
181
182/*
183 * kernel_event - create a new kernel event with the given parameters
184 *
185 * This function can sleep.
186 */
187static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
188 const char *name)
189{
190 struct inotify_kernel_event *kevent;
191
192 kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
193 if (unlikely(!kevent))
194 return NULL;
195
196 /* we hand this out to user-space, so zero it just in case */
197 memset(&kevent->event, 0, sizeof(struct inotify_event));
198
199 kevent->event.wd = wd;
200 kevent->event.mask = mask;
201 kevent->event.cookie = cookie;
202
203 INIT_LIST_HEAD(&kevent->list);
204
205 if (name) {
206 size_t len, rem, event_size = sizeof(struct inotify_event);
207
208 /*
209 * We need to pad the filename so as to properly align an
210 * array of inotify_event structures. Because the structure is
211 * small and the common case is a small filename, we just round
212 * up to the next multiple of the structure's sizeof. This is
213 * simple and safe for all architectures.
214 */
215 len = strlen(name) + 1;
216 rem = event_size - len;
217 if (len > event_size) {
218 rem = event_size - (len % event_size);
219 if (len % event_size == 0)
220 rem = 0;
221 }
222
223 kevent->name = kmalloc(len + rem, GFP_NOFS);
224 if (unlikely(!kevent->name)) {
225 kmem_cache_free(event_cachep, kevent);
226 return NULL;
227 }
228 memcpy(kevent->name, name, len);
229 if (rem)
230 memset(kevent->name + len, 0, rem);
231 kevent->event.len = len + rem;
232 } else {
233 kevent->event.len = 0;
234 kevent->name = NULL;
235 }
236
237 return kevent;
238}
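To make the old padding arithmetic concrete: with sizeof(struct inotify_event) == 16, the name "hello" gives len = 6 and rem = 10, so event.len = 16; a sixteen-character name gives len = 17 and rem = 16 - (17 % 16) = 15, so event.len = 32. In both cases len + rem is simply len rounded up to a multiple of 16, which is exactly what the replacement code below computes with roundup().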
239
240/*
241 * inotify_dev_get_event - return the next event in the given dev's queue
242 *
243 * Caller must hold dev->ev_mutex.
244 */
245static inline struct inotify_kernel_event *
246inotify_dev_get_event(struct inotify_device *dev)
247{
248 return list_entry(dev->events.next, struct inotify_kernel_event, list);
249}
250
251/*
252 * inotify_dev_get_last_event - return the last event in the given dev's queue
253 *
254 * Caller must hold dev->ev_mutex.
255 */
256static inline struct inotify_kernel_event *
257inotify_dev_get_last_event(struct inotify_device *dev)
258{
259 if (list_empty(&dev->events))
260 return NULL;
261 return list_entry(dev->events.prev, struct inotify_kernel_event, list);
262}
263
264/*
265 * inotify_dev_queue_event - event handler registered with core inotify, adds
266 * a new event to the given device
267 *
268 * Can sleep (calls kernel_event()).
269 */
270static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
271 u32 cookie, const char *name,
272 struct inode *ignored)
273{
274 struct inotify_user_watch *watch;
275 struct inotify_device *dev;
276 struct inotify_kernel_event *kevent, *last;
277
278 watch = container_of(w, struct inotify_user_watch, wdata);
279 dev = watch->dev;
280
281 mutex_lock(&dev->ev_mutex);
282
283 /* we can safely put the watch as we don't reference it while
284 * generating the event
285 */
286 if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
287 put_inotify_watch(w); /* final put */
288
289 /* coalescing: drop this event if it is a dupe of the previous */
290 last = inotify_dev_get_last_event(dev);
291 if (last && last->event.mask == mask && last->event.wd == wd &&
292 last->event.cookie == cookie) {
293 const char *lastname = last->name;
294
295 if (!name && !lastname)
296 goto out;
297 if (name && lastname && !strcmp(lastname, name))
298 goto out;
299 }
300
301 /* the queue overflowed and we already sent the Q_OVERFLOW event */
302 if (unlikely(dev->event_count > dev->max_events))
303 goto out;
304
305 /* if the queue overflows, we need to notify user space */
306 if (unlikely(dev->event_count == dev->max_events))
307 kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
308 else
309 kevent = kernel_event(wd, mask, cookie, name);
310
311 if (unlikely(!kevent))
312 goto out;
313
314 /* queue the event and wake up anyone waiting */
315 dev->event_count++;
316 dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
317 list_add_tail(&kevent->list, &dev->events);
318 wake_up_interruptible(&dev->wq);
319 kill_fasync(&dev->fa, SIGIO, POLL_IN);
320
321out:
322 mutex_unlock(&dev->ev_mutex);
323}
324
325/*
326 * remove_kevent - cleans up the given kevent
327 *
328 * Caller must hold dev->ev_mutex.
329 */
330static void remove_kevent(struct inotify_device *dev,
331 struct inotify_kernel_event *kevent)
332{ 106{
333 list_del(&kevent->list); 107 __u32 mask;
334 108
335 dev->event_count--; 109 /* everything should accept their own ignored and cares about children */
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; 110 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
337}
338 111
339/* 112 /* mask off the flags used to open the fd */
340 * free_kevent - frees the given kevent. 113 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
344 kfree(kevent->name);
345 kmem_cache_free(event_cachep, kevent);
346}
347 114
348/* 115 return mask;
349 * inotify_dev_event_dequeue - destroy an event on the given device
350 *
351 * Caller must hold dev->ev_mutex.
352 */
353static void inotify_dev_event_dequeue(struct inotify_device *dev)
354{
355 if (!list_empty(&dev->events)) {
356 struct inotify_kernel_event *kevent;
357 kevent = inotify_dev_get_event(dev);
358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
360 }
361} 116}
362 117
363/* 118static inline u32 inotify_mask_to_arg(__u32 mask)
364 * find_inode - resolve a user-given path to a specific inode
365 */
366static int find_inode(const char __user *dirname, struct path *path,
367 unsigned flags)
368{ 119{
369 int error; 120 return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
370 121 IN_Q_OVERFLOW);
371 error = user_path_at(AT_FDCWD, dirname, flags, path);
372 if (error)
373 return error;
374 /* you can only watch an inode if you have read permissions on it */
375 error = inode_permission(path->dentry->d_inode, MAY_READ);
376 if (error)
377 path_put(path);
378 return error;
379} 122}
380 123
381/* 124/* intofiy userspace file descriptor functions */
382 * create_watch - creates a watch on the given device.
383 *
384 * Callers must hold dev->up_mutex.
385 */
386static int create_watch(struct inotify_device *dev, struct inode *inode,
387 u32 mask)
388{
389 struct inotify_user_watch *watch;
390 int ret;
391
392 if (atomic_read(&dev->user->inotify_watches) >=
393 inotify_max_user_watches)
394 return -ENOSPC;
395
396 watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
397 if (unlikely(!watch))
398 return -ENOMEM;
399
400 /* save a reference to device and bump the count to make it official */
401 get_inotify_dev(dev);
402 watch->dev = dev;
403
404 atomic_inc(&dev->user->inotify_watches);
405
406 inotify_init_watch(&watch->wdata);
407 ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
408 if (ret < 0)
409 free_inotify_user_watch(&watch->wdata);
410
411 return ret;
412}
413
414/* Device Interface */
415
416static unsigned int inotify_poll(struct file *file, poll_table *wait) 125static unsigned int inotify_poll(struct file *file, poll_table *wait)
417{ 126{
418 struct inotify_device *dev = file->private_data; 127 struct fsnotify_group *group = file->private_data;
419 int ret = 0; 128 int ret = 0;
420 129
421 poll_wait(file, &dev->wq, wait); 130 poll_wait(file, &group->notification_waitq, wait);
422 mutex_lock(&dev->ev_mutex); 131 mutex_lock(&group->notification_mutex);
423 if (!list_empty(&dev->events)) 132 if (!fsnotify_notify_queue_is_empty(group))
424 ret = POLLIN | POLLRDNORM; 133 ret = POLLIN | POLLRDNORM;
425 mutex_unlock(&dev->ev_mutex); 134 mutex_unlock(&group->notification_mutex);
426 135
427 return ret; 136 return ret;
428} 137}
@@ -432,26 +141,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
432 * enough to fit in "count". Return an error pointer if 141 * enough to fit in "count". Return an error pointer if
433 * not large enough. 142 * not large enough.
434 * 143 *
435 * Called with the device ev_mutex held. 144 * Called with the group->notification_mutex held.
436 */ 145 */
437static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, 146static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
438 size_t count) 147 size_t count)
439{ 148{
440 size_t event_size = sizeof(struct inotify_event); 149 size_t event_size = sizeof(struct inotify_event);
441 struct inotify_kernel_event *kevent; 150 struct fsnotify_event *event;
442 151
443 if (list_empty(&dev->events)) 152 if (fsnotify_notify_queue_is_empty(group))
444 return NULL; 153 return NULL;
445 154
446 kevent = inotify_dev_get_event(dev); 155 event = fsnotify_peek_notify_event(group);
447 if (kevent->name) 156
448 event_size += kevent->event.len; 157 event_size += roundup(event->name_len, event_size);
449 158
450 if (event_size > count) 159 if (event_size > count)
451 return ERR_PTR(-EINVAL); 160 return ERR_PTR(-EINVAL);
452 161
453 remove_kevent(dev, kevent); 162 /* held the notification_mutex the whole time, so this is the
454 return kevent; 163 * same event we peeked above */
164 fsnotify_remove_notify_event(group);
165
166 return event;
455} 167}
456 168
457/* 169/*
@@ -460,51 +172,92 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
460 * We already checked that the event size is smaller than the 172 * We already checked that the event size is smaller than the
461 * buffer we had in "get_one_event()" above. 173 * buffer we had in "get_one_event()" above.
462 */ 174 */
463static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, 175static ssize_t copy_event_to_user(struct fsnotify_group *group,
176 struct fsnotify_event *event,
464 char __user *buf) 177 char __user *buf)
465{ 178{
179 struct inotify_event inotify_event;
180 struct fsnotify_event_private_data *fsn_priv;
181 struct inotify_event_private_data *priv;
466 size_t event_size = sizeof(struct inotify_event); 182 size_t event_size = sizeof(struct inotify_event);
183 size_t name_len;
184
185 /* we get the inotify watch descriptor from the event private data */
186 spin_lock(&event->lock);
187 fsn_priv = fsnotify_remove_priv_from_event(group, event);
188 spin_unlock(&event->lock);
189
190 if (!fsn_priv)
191 inotify_event.wd = -1;
192 else {
193 priv = container_of(fsn_priv, struct inotify_event_private_data,
194 fsnotify_event_priv_data);
195 inotify_event.wd = priv->wd;
196 inotify_free_event_priv(fsn_priv);
197 }
198
 199	/* round up event->name_len plus an extra byte for the terminating
 200	 * '\0' so the result is a multiple of event_size.
201 */
202 name_len = roundup(event->name_len + 1, event_size);
203 inotify_event.len = name_len;
204
205 inotify_event.mask = inotify_mask_to_arg(event->mask);
206 inotify_event.cookie = event->sync_cookie;
467 207
468 if (copy_to_user(buf, &kevent->event, event_size)) 208 /* send the main event */
209 if (copy_to_user(buf, &inotify_event, event_size))
469 return -EFAULT; 210 return -EFAULT;
470 211
471 if (kevent->name) { 212 buf += event_size;
472 buf += event_size;
473 213
474 if (copy_to_user(buf, kevent->name, kevent->event.len)) 214 /*
215 * fsnotify only stores the pathname, so here we have to send the pathname
216 * and then pad that pathname out to a multiple of sizeof(inotify_event)
 217	 * with zeros. The zeros come from clear_user() below.
218 */
219 if (name_len) {
220 unsigned int len_to_zero = name_len - event->name_len;
221 /* copy the path name */
222 if (copy_to_user(buf, event->file_name, event->name_len))
475 return -EFAULT; 223 return -EFAULT;
224 buf += event->name_len;
476 225
477 event_size += kevent->event.len; 226 /* fill userspace with 0's */
227 if (clear_user(buf, len_to_zero))
228 return -EFAULT;
229 buf += len_to_zero;
230 event_size += name_len;
478 } 231 }
232
479 return event_size; 233 return event_size;
480} 234}
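A worked example of the rounding above: for the name "foo", event->name_len is 3, so name_len = roundup(4, 16) = 16. Userspace receives an inotify_event with len = 16, followed by the three name bytes and thirteen NUL bytes supplied by clear_user().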
481 235
482static ssize_t inotify_read(struct file *file, char __user *buf, 236static ssize_t inotify_read(struct file *file, char __user *buf,
483 size_t count, loff_t *pos) 237 size_t count, loff_t *pos)
484{ 238{
485 struct inotify_device *dev; 239 struct fsnotify_group *group;
240 struct fsnotify_event *kevent;
486 char __user *start; 241 char __user *start;
487 int ret; 242 int ret;
488 DEFINE_WAIT(wait); 243 DEFINE_WAIT(wait);
489 244
490 start = buf; 245 start = buf;
491 dev = file->private_data; 246 group = file->private_data;
492 247
493 while (1) { 248 while (1) {
494 struct inotify_kernel_event *kevent; 249 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
495 250
496 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 251 mutex_lock(&group->notification_mutex);
497 252 kevent = get_one_event(group, count);
498 mutex_lock(&dev->ev_mutex); 253 mutex_unlock(&group->notification_mutex);
499 kevent = get_one_event(dev, count);
500 mutex_unlock(&dev->ev_mutex);
501 254
502 if (kevent) { 255 if (kevent) {
503 ret = PTR_ERR(kevent); 256 ret = PTR_ERR(kevent);
504 if (IS_ERR(kevent)) 257 if (IS_ERR(kevent))
505 break; 258 break;
506 ret = copy_event_to_user(kevent, buf); 259 ret = copy_event_to_user(group, kevent, buf);
507 free_kevent(kevent); 260 fsnotify_put_event(kevent);
508 if (ret < 0) 261 if (ret < 0)
509 break; 262 break;
510 buf += ret; 263 buf += ret;
@@ -525,7 +278,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
525 schedule(); 278 schedule();
526 } 279 }
527 280
528 finish_wait(&dev->wq, &wait); 281 finish_wait(&group->notification_waitq, &wait);
529 if (start != buf && ret != -EFAULT) 282 if (start != buf && ret != -EFAULT)
530 ret = buf - start; 283 ret = buf - start;
531 return ret; 284 return ret;
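The record format produced by copy_event_to_user() dictates how userspace walks the buffer this read() fills: each record is a struct inotify_event followed by len bytes of NUL-padded name. A minimal consumer sketch (userspace, modeled on the inotify(7) man page idiom):

#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

static void drain_events(int fd)
{
	/* align the buffer so casting to struct inotify_event is safe */
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	ssize_t len = read(fd, buf, sizeof(buf));
	char *p = buf;

	while (len > 0 && p < buf + len) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("wd=%d mask=%#x name=%s\n",
		       ev->wd, ev->mask, ev->len ? ev->name : "");
		/* ev->len is already padded, so this lands on the next record */
		p += sizeof(*ev) + ev->len;
	}
}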
@@ -533,25 +286,22 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
533 286
534static int inotify_fasync(int fd, struct file *file, int on) 287static int inotify_fasync(int fd, struct file *file, int on)
535{ 288{
536 struct inotify_device *dev = file->private_data; 289 struct fsnotify_group *group = file->private_data;
537 290
538 return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; 291 return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
539} 292}
540 293
541static int inotify_release(struct inode *ignored, struct file *file) 294static int inotify_release(struct inode *ignored, struct file *file)
542{ 295{
543 struct inotify_device *dev = file->private_data; 296 struct fsnotify_group *group = file->private_data;
297 struct user_struct *user = group->inotify_data.user;
544 298
545 inotify_destroy(dev->ih); 299 fsnotify_clear_marks_by_group(group);
546 300
547 /* destroy all of the events on this device */ 301 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
548 mutex_lock(&dev->ev_mutex); 302 fsnotify_put_group(group);
549 while (!list_empty(&dev->events))
550 inotify_dev_event_dequeue(dev);
551 mutex_unlock(&dev->ev_mutex);
552 303
553 /* free this device: the put matching the get in inotify_init() */ 304 atomic_dec(&user->inotify_devs);
554 put_inotify_dev(dev);
555 305
556 return 0; 306 return 0;
557} 307}
@@ -559,16 +309,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
559static long inotify_ioctl(struct file *file, unsigned int cmd, 309static long inotify_ioctl(struct file *file, unsigned int cmd,
560 unsigned long arg) 310 unsigned long arg)
561{ 311{
562 struct inotify_device *dev; 312 struct fsnotify_group *group;
313 struct fsnotify_event_holder *holder;
314 struct fsnotify_event *event;
563 void __user *p; 315 void __user *p;
564 int ret = -ENOTTY; 316 int ret = -ENOTTY;
317 size_t send_len = 0;
565 318
566 dev = file->private_data; 319 group = file->private_data;
567 p = (void __user *) arg; 320 p = (void __user *) arg;
568 321
569 switch (cmd) { 322 switch (cmd) {
570 case FIONREAD: 323 case FIONREAD:
571 ret = put_user(dev->queue_size, (int __user *) p); 324 mutex_lock(&group->notification_mutex);
325 list_for_each_entry(holder, &group->notification_list, event_list) {
326 event = holder->event;
327 send_len += sizeof(struct inotify_event);
328 send_len += roundup(event->name_len,
329 sizeof(struct inotify_event));
330 }
331 mutex_unlock(&group->notification_mutex);
332 ret = put_user(send_len, (int __user *) p);
572 break; 333 break;
573 } 334 }
574 335
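From userspace, the FIONREAD computation above is visible as the number of bytes a read() could return right now. A minimal sketch, assuming an already-initialized inotify fd:

#include <stdio.h>
#include <sys/ioctl.h>

/* returns the number of queued bytes, or -1 on error */
static int inotify_pending_bytes(int inotify_fd)
{
	int avail = 0;

	/* the kernel sums sizeof(struct inotify_event) plus the padded
	 * name length for every event on the notification list */
	if (ioctl(inotify_fd, FIONREAD, &avail) == -1) {
		perror("ioctl(FIONREAD)");
		return -1;
	}
	return avail;
}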
@@ -576,23 +337,311 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
576} 337}
577 338
578static const struct file_operations inotify_fops = { 339static const struct file_operations inotify_fops = {
579 .poll = inotify_poll, 340 .poll = inotify_poll,
580 .read = inotify_read, 341 .read = inotify_read,
581 .fasync = inotify_fasync, 342 .fasync = inotify_fasync,
582 .release = inotify_release, 343 .release = inotify_release,
583 .unlocked_ioctl = inotify_ioctl, 344 .unlocked_ioctl = inotify_ioctl,
584 .compat_ioctl = inotify_ioctl, 345 .compat_ioctl = inotify_ioctl,
585}; 346};
586 347
587static const struct inotify_operations inotify_user_ops = {
588 .handle_event = inotify_dev_queue_event,
589 .destroy_watch = free_inotify_user_watch,
590};
591 348
349/*
350 * find_inode - resolve a user-given path to a specific inode
351 */
352static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
353{
354 int error;
355
356 error = user_path_at(AT_FDCWD, dirname, flags, path);
357 if (error)
358 return error;
359 /* you can only watch an inode if you have read permissions on it */
360 error = inode_permission(path->dentry->d_inode, MAY_READ);
361 if (error)
362 path_put(path);
363 return error;
364}
365
366/*
367 * Remove the mark from the idr (if present) and drop the reference
368 * on the mark because it was in the idr.
369 */
370static void inotify_remove_from_idr(struct fsnotify_group *group,
371 struct inotify_inode_mark_entry *ientry)
372{
373 struct idr *idr;
374 struct fsnotify_mark_entry *entry;
375 struct inotify_inode_mark_entry *found_ientry;
376 int wd;
377
378 spin_lock(&group->inotify_data.idr_lock);
379 idr = &group->inotify_data.idr;
380 wd = ientry->wd;
381
382 if (wd == -1)
383 goto out;
384
385 entry = idr_find(&group->inotify_data.idr, wd);
386 if (unlikely(!entry))
387 goto out;
388
389 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
390 if (unlikely(found_ientry != ientry)) {
391 /* We found an entry in the idr with the right wd, but it's
392 * not the entry we were told to remove. eparis seriously
393 * fucked up somewhere. */
394 WARN_ON(1);
395 ientry->wd = -1;
396 goto out;
397 }
398
399 /* One ref for being in the idr, one ref held by the caller */
400 BUG_ON(atomic_read(&entry->refcnt) < 2);
401
402 idr_remove(idr, wd);
403 ientry->wd = -1;
404
405 /* removed from the idr, drop that ref */
406 fsnotify_put_mark(entry);
407out:
408 spin_unlock(&group->inotify_data.idr_lock);
409}
410
411/*
412 * Send IN_IGNORED for this wd, remove this wd from the idr.
413 */
414void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
415 struct fsnotify_group *group)
416{
417 struct inotify_inode_mark_entry *ientry;
418 struct fsnotify_event *ignored_event;
419 struct inotify_event_private_data *event_priv;
420 struct fsnotify_event_private_data *fsn_event_priv;
421 int ret;
422
423 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
424 FSNOTIFY_EVENT_NONE, NULL, 0,
425 GFP_NOFS);
426 if (!ignored_event)
427 return;
428
429 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
430
431 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
432 if (unlikely(!event_priv))
433 goto skip_send_ignore;
434
435 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
436
437 fsn_event_priv->group = group;
438 event_priv->wd = ientry->wd;
439
440 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
441 if (ret)
442 inotify_free_event_priv(fsn_event_priv);
443
444skip_send_ignore:
445
446 /* matches the reference taken when the event was created */
447 fsnotify_put_event(ignored_event);
448
449 /* remove this entry from the idr */
450 inotify_remove_from_idr(group, ientry);
451
452 atomic_dec(&group->inotify_data.user->inotify_watches);
453}
454
455/* ding dong the mark is dead */
456static void inotify_free_mark(struct fsnotify_mark_entry *entry)
457{
458 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
459
460 kmem_cache_free(inotify_inode_mark_cachep, ientry);
461}
462
463static int inotify_update_existing_watch(struct fsnotify_group *group,
464 struct inode *inode,
465 u32 arg)
466{
467 struct fsnotify_mark_entry *entry;
468 struct inotify_inode_mark_entry *ientry;
469 __u32 old_mask, new_mask;
470 __u32 mask;
471 int add = (arg & IN_MASK_ADD);
472 int ret;
473
474 /* don't allow invalid bits: we don't want flags set */
475 mask = inotify_arg_to_mask(arg);
476 if (unlikely(!mask))
477 return -EINVAL;
478
479 spin_lock(&inode->i_lock);
480 entry = fsnotify_find_mark_entry(group, inode);
481 spin_unlock(&inode->i_lock);
482 if (!entry)
483 return -ENOENT;
484
485 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
486
487 spin_lock(&entry->lock);
488
489 old_mask = entry->mask;
490 if (add) {
491 entry->mask |= mask;
492 new_mask = entry->mask;
493 } else {
494 entry->mask = mask;
495 new_mask = entry->mask;
496 }
497
498 spin_unlock(&entry->lock);
499
500 if (old_mask != new_mask) {
501 /* more bits in old than in new? */
502 int dropped = (old_mask & ~new_mask);
503 /* more bits in this entry than the inode's mask? */
504 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
505 /* more bits in this entry than the group? */
506 int do_group = (new_mask & ~group->mask);
507
508 /* update the inode with this new entry */
509 if (dropped || do_inode)
510 fsnotify_recalc_inode_mask(inode);
511
512 /* update the group mask with the new mask */
513 if (dropped || do_group)
514 fsnotify_recalc_group_mask(group);
515 }
516
517 /* return the wd */
518 ret = ientry->wd;
519
520 /* match the get from fsnotify_find_mark_entry() */
521 fsnotify_put_mark(entry);
522
523 return ret;
524}
525
526static int inotify_new_watch(struct fsnotify_group *group,
527 struct inode *inode,
528 u32 arg)
529{
530 struct inotify_inode_mark_entry *tmp_ientry;
531 __u32 mask;
532 int ret;
533
534 /* don't allow invalid bits: we don't want flags set */
535 mask = inotify_arg_to_mask(arg);
536 if (unlikely(!mask))
537 return -EINVAL;
538
539 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
540 if (unlikely(!tmp_ientry))
541 return -ENOMEM;
542
543 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
544 tmp_ientry->fsn_entry.mask = mask;
545 tmp_ientry->wd = -1;
546
547 ret = -ENOSPC;
548 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
549 goto out_err;
550retry:
551 ret = -ENOMEM;
552 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
553 goto out_err;
554
555 spin_lock(&group->inotify_data.idr_lock);
556 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
557 group->inotify_data.last_wd,
558 &tmp_ientry->wd);
559 spin_unlock(&group->inotify_data.idr_lock);
560 if (ret) {
 561		/* idr was out of memory, allocate more and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err;
565 }
566
567 /* we put the mark on the idr, take a reference */
568 fsnotify_get_mark(&tmp_ientry->fsn_entry);
569
570 /* we are on the idr, now get on the inode */
571 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
572 if (ret) {
573 /* we failed to get on the inode, get off the idr */
574 inotify_remove_from_idr(group, tmp_ientry);
575 goto out_err;
576 }
577
 578	/* update the idr hint; who cares about races, it's just a hint */
579 group->inotify_data.last_wd = tmp_ientry->wd;
580
581 /* increment the number of watches the user has */
582 atomic_inc(&group->inotify_data.user->inotify_watches);
583
584 /* return the watch descriptor for this new entry */
585 ret = tmp_ientry->wd;
586
 587	/* match the ref from fsnotify_init_mark() */
588 fsnotify_put_mark(&tmp_ientry->fsn_entry);
589
590out_err:
591 if (ret < 0)
592 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
593
594 return ret;
595}
596
597static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
598{
599 int ret = 0;
600
601retry:
 602	/* try to update an existing watch with the new arg */
603 ret = inotify_update_existing_watch(group, inode, arg);
604 /* no mark present, try to add a new one */
605 if (ret == -ENOENT)
606 ret = inotify_new_watch(group, inode, arg);
607 /*
608 * inotify_new_watch could race with another thread which did an
609 * inotify_new_watch between the update_existing and the add watch
 610	 * here; go back and try to update an existing mark again.
611 */
612 if (ret == -EEXIST)
613 goto retry;
614
615 return ret;
616}
617
618static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
619{
620 struct fsnotify_group *group;
621 unsigned int grp_num;
622
 623	/* fsnotify_obtain_group took a reference to the group; we put it when the file is destroyed */
624 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
625 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
626 if (IS_ERR(group))
627 return group;
628
629 group->max_events = max_events;
630
631 spin_lock_init(&group->inotify_data.idr_lock);
632 idr_init(&group->inotify_data.idr);
633 group->inotify_data.last_wd = 1;
634 group->inotify_data.user = user;
635 group->inotify_data.fa = NULL;
636
637 return group;
638}
639
640
641/* inotify syscalls */
592SYSCALL_DEFINE1(inotify_init1, int, flags) 642SYSCALL_DEFINE1(inotify_init1, int, flags)
593{ 643{
594 struct inotify_device *dev; 644 struct fsnotify_group *group;
595 struct inotify_handle *ih;
596 struct user_struct *user; 645 struct user_struct *user;
597 struct file *filp; 646 struct file *filp;
598 int fd, ret; 647 int fd, ret;
@@ -621,45 +670,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
621 goto out_free_uid; 670 goto out_free_uid;
622 } 671 }
623 672
 624	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);	 673	/* fsnotify_obtain_group took a reference to the group; we put it when the file is destroyed */
625 if (unlikely(!dev)) { 674 group = inotify_new_group(user, inotify_max_queued_events);
626 ret = -ENOMEM; 675 if (IS_ERR(group)) {
676 ret = PTR_ERR(group);
627 goto out_free_uid; 677 goto out_free_uid;
628 } 678 }
629 679
630 ih = inotify_init(&inotify_user_ops);
631 if (IS_ERR(ih)) {
632 ret = PTR_ERR(ih);
633 goto out_free_dev;
634 }
635 dev->ih = ih;
636 dev->fa = NULL;
637
638 filp->f_op = &inotify_fops; 680 filp->f_op = &inotify_fops;
639 filp->f_path.mnt = mntget(inotify_mnt); 681 filp->f_path.mnt = mntget(inotify_mnt);
640 filp->f_path.dentry = dget(inotify_mnt->mnt_root); 682 filp->f_path.dentry = dget(inotify_mnt->mnt_root);
641 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; 683 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
642 filp->f_mode = FMODE_READ; 684 filp->f_mode = FMODE_READ;
643 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); 685 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
644 filp->private_data = dev; 686 filp->private_data = group;
645 687
646 INIT_LIST_HEAD(&dev->events);
647 init_waitqueue_head(&dev->wq);
648 mutex_init(&dev->ev_mutex);
649 mutex_init(&dev->up_mutex);
650 dev->event_count = 0;
651 dev->queue_size = 0;
652 dev->max_events = inotify_max_queued_events;
653 dev->user = user;
654 atomic_set(&dev->count, 0);
655
656 get_inotify_dev(dev);
657 atomic_inc(&user->inotify_devs); 688 atomic_inc(&user->inotify_devs);
689
658 fd_install(fd, filp); 690 fd_install(fd, filp);
659 691
660 return fd; 692 return fd;
661out_free_dev: 693
662 kfree(dev);
663out_free_uid: 694out_free_uid:
664 free_uid(user); 695 free_uid(user);
665 put_filp(filp); 696 put_filp(filp);
@@ -676,8 +707,8 @@ SYSCALL_DEFINE0(inotify_init)
676SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, 707SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
677 u32, mask) 708 u32, mask)
678{ 709{
710 struct fsnotify_group *group;
679 struct inode *inode; 711 struct inode *inode;
680 struct inotify_device *dev;
681 struct path path; 712 struct path path;
682 struct file *filp; 713 struct file *filp;
683 int ret, fput_needed; 714 int ret, fput_needed;
@@ -698,20 +729,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
698 if (mask & IN_ONLYDIR) 729 if (mask & IN_ONLYDIR)
699 flags |= LOOKUP_DIRECTORY; 730 flags |= LOOKUP_DIRECTORY;
700 731
701 ret = find_inode(pathname, &path, flags); 732 ret = inotify_find_inode(pathname, &path, flags);
702 if (unlikely(ret)) 733 if (ret)
703 goto fput_and_out; 734 goto fput_and_out;
704 735
705 /* inode held in place by reference to path; dev by fget on fd */ 736 /* inode held in place by reference to path; group by fget on fd */
706 inode = path.dentry->d_inode; 737 inode = path.dentry->d_inode;
707 dev = filp->private_data; 738 group = filp->private_data;
708 739
709 mutex_lock(&dev->up_mutex); 740 /* create/update an inode mark */
710 ret = inotify_find_update_watch(dev->ih, inode, mask); 741 ret = inotify_update_watch(group, inode, mask);
711 if (ret == -ENOENT) 742 if (unlikely(ret))
712 ret = create_watch(dev, inode, mask); 743 goto path_put_and_out;
713 mutex_unlock(&dev->up_mutex);
714 744
745path_put_and_out:
715 path_put(&path); 746 path_put(&path);
716fput_and_out: 747fput_and_out:
717 fput_light(filp, fput_needed); 748 fput_light(filp, fput_needed);
@@ -720,9 +751,10 @@ fput_and_out:
720 751
721SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 752SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
722{ 753{
754 struct fsnotify_group *group;
755 struct fsnotify_mark_entry *entry;
723 struct file *filp; 756 struct file *filp;
724 struct inotify_device *dev; 757 int ret = 0, fput_needed;
725 int ret, fput_needed;
726 758
727 filp = fget_light(fd, &fput_needed); 759 filp = fget_light(fd, &fput_needed);
728 if (unlikely(!filp)) 760 if (unlikely(!filp))
@@ -734,10 +766,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
734 goto out; 766 goto out;
735 } 767 }
736 768
737 dev = filp->private_data; 769 group = filp->private_data;
770
771 spin_lock(&group->inotify_data.idr_lock);
772 entry = idr_find(&group->inotify_data.idr, wd);
773 if (unlikely(!entry)) {
774 spin_unlock(&group->inotify_data.idr_lock);
775 ret = -EINVAL;
776 goto out;
777 }
778 fsnotify_get_mark(entry);
779 spin_unlock(&group->inotify_data.idr_lock);
738 780
739 /* we free our watch data when we get IN_IGNORED */ 781 fsnotify_destroy_mark_by_entry(entry);
740 ret = inotify_rm_wd(dev->ih, wd); 782 fsnotify_put_mark(entry);
741 783
742out: 784out:
743 fput_light(filp, fput_needed); 785 fput_light(filp, fput_needed);
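Taken together, the three syscalls map onto the fsnotify objects like this: inotify_init1() creates the group, inotify_add_watch() creates or updates a mark and returns its wd, and inotify_rm_watch() destroys the mark, which queues the IN_IGNORED event seen earlier. A minimal userspace sketch (the path and mask are illustrative):

#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	int fd = inotify_init1(0);
	if (fd < 0)
		return 1;

	/* wd comes back from inotify_update_watch() via the idr */
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0)
		return 1;

	/* ... read(fd, ...) events here ... */

	/* tears down the mark; an IN_IGNORED event for wd follows */
	inotify_rm_watch(fd, wd);
	close(fd);
	return 0;
}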
@@ -753,9 +795,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags,
753} 795}
754 796
755static struct file_system_type inotify_fs_type = { 797static struct file_system_type inotify_fs_type = {
756 .name = "inotifyfs", 798 .name = "inotifyfs",
757 .get_sb = inotify_get_sb, 799 .get_sb = inotify_get_sb,
758 .kill_sb = kill_anon_super, 800 .kill_sb = kill_anon_super,
759}; 801};
760 802
761/* 803/*
@@ -775,18 +817,13 @@ static int __init inotify_user_setup(void)
775 if (IS_ERR(inotify_mnt)) 817 if (IS_ERR(inotify_mnt))
776 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); 818 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
777 819
820 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
821 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
822
778 inotify_max_queued_events = 16384; 823 inotify_max_queued_events = 16384;
779 inotify_max_user_instances = 128; 824 inotify_max_user_instances = 128;
780 inotify_max_user_watches = 8192; 825 inotify_max_user_watches = 8192;
781 826
782 watch_cachep = kmem_cache_create("inotify_watch_cache",
783 sizeof(struct inotify_user_watch),
784 0, SLAB_PANIC, NULL);
785 event_cachep = kmem_cache_create("inotify_event_cache",
786 sizeof(struct inotify_kernel_event),
787 0, SLAB_PANIC, NULL);
788
789 return 0; 827 return 0;
790} 828}
791
792module_init(inotify_user_setup); 829module_init(inotify_user_setup);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
new file mode 100644
index 000000000000..3816d5750dd5
--- /dev/null
+++ b/fs/notify/notification.c
@@ -0,0 +1,421 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * Basic idea behind the notification queue: An fsnotify group (like inotify)
 21 * sends the userspace notification about events asynchronously some time after
22 * the event happened. When inotify gets an event it will need to add that
23 * event to the group notify queue. Since a single event might need to be on
 24 * multiple groups' notification queues, we can't add the event directly to each
25 * queue and instead add a small "event_holder" to each queue. This event_holder
26 * has a pointer back to the original event. Since the majority of events are
27 * going to end up on one, and only one, notification queue we embed one
28 * event_holder into each event. This means we have a single allocation instead
29 * of always needing two. If the embedded event_holder is already in use by
30 * another group a new event_holder (from fsnotify_event_holder_cachep) will be
31 * allocated and used.
32 */
33
34#include <linux/fs.h>
35#include <linux/init.h>
36#include <linux/kernel.h>
37#include <linux/list.h>
38#include <linux/module.h>
39#include <linux/mount.h>
40#include <linux/mutex.h>
41#include <linux/namei.h>
42#include <linux/path.h>
43#include <linux/slab.h>
44#include <linux/spinlock.h>
45
46#include <asm/atomic.h>
47
48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h"
50
51static struct kmem_cache *fsnotify_event_cachep;
52static struct kmem_cache *fsnotify_event_holder_cachep;
53/*
54 * This is a magic event we send when the q is too full. Since it doesn't
55 * hold real event information we just keep one system wide and use it any time
56 * it is needed. It's refcnt is set 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61
62/**
63 * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
64 * Called from fsnotify_move, which is inlined into filesystem modules.
65 */
66u32 fsnotify_get_cookie(void)
67{
68 return atomic_inc_return(&fsnotify_sync_cookie);
69}
70EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
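The only consumer of this cookie is userspace: a rename emits IN_MOVED_FROM and IN_MOVED_TO events carrying the same cookie, which is how a watcher stitches the two halves back together. A hypothetical pairing sketch (userspace, single in-flight move for brevity):

#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <sys/inotify.h>

struct pending_move {
	uint32_t cookie;
	char from[NAME_MAX + 1];
};

static void note_move(struct pending_move *pm, const struct inotify_event *ev)
{
	if (ev->mask & IN_MOVED_FROM) {
		/* first half: remember the cookie and the old name */
		pm->cookie = ev->cookie;
		strncpy(pm->from, ev->name, NAME_MAX);
		pm->from[NAME_MAX] = '\0';
	} else if ((ev->mask & IN_MOVED_TO) && ev->cookie == pm->cookie) {
		/* second half: pm->from was renamed to ev->name */
	}
}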
71
72/* return true if the notify queue is empty, false otherwise */
73bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
74{
75 BUG_ON(!mutex_is_locked(&group->notification_mutex));
76 return list_empty(&group->notification_list) ? true : false;
77}
78
79void fsnotify_get_event(struct fsnotify_event *event)
80{
81 atomic_inc(&event->refcnt);
82}
83
84void fsnotify_put_event(struct fsnotify_event *event)
85{
86 if (!event)
87 return;
88
89 if (atomic_dec_and_test(&event->refcnt)) {
90 if (event->data_type == FSNOTIFY_EVENT_PATH)
91 path_put(&event->path);
92
93 BUG_ON(!list_empty(&event->private_data_list));
94
95 kfree(event->file_name);
96 kmem_cache_free(fsnotify_event_cachep, event);
97 }
98}
99
100struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
101{
102 return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
103}
104
105void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
106{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder);
108}
109
110/*
111 * Find the private data that the group previously attached to this event when
112 * the group added the event to the notification queue (fsnotify_add_notify_event)
113 */
114struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
115{
116 struct fsnotify_event_private_data *lpriv;
117 struct fsnotify_event_private_data *priv = NULL;
118
119 assert_spin_locked(&event->lock);
120
121 list_for_each_entry(lpriv, &event->private_data_list, event_list) {
122 if (lpriv->group == group) {
123 priv = lpriv;
124 list_del(&priv->event_list);
125 break;
126 }
127 }
128 return priv;
129}
130
131/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any know fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
 144			 * allowed to look at the inode any more; the only thing
 145			 * left to check is whether the file_name is the same */
146 if (old->name_len &&
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event.
170 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv)
173{
174 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder;
177 struct fsnotify_event *last_event;
178 int ret = 0;
179
180 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
182 * Check if we expect to be able to use that holder. If not alloc a new
183 * holder.
184 * For the overflow event it's possible that something will use the in
 185	 * event holder before we get the lock, so we may need to jump back and
 186	 * alloc a new holder; this can't happen for most events...
187 */
188 if (!list_empty(&event->holder.event_list)) {
189alloc_holder:
190 holder = fsnotify_alloc_event_holder();
191 if (!holder)
192 return -ENOMEM;
193 }
194
195 mutex_lock(&group->notification_mutex);
196
197 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event;
199 ret = -EOVERFLOW;
200 /* sorry, no private data on the overflow event */
201 priv = NULL;
202 }
203
204 spin_lock(&event->lock);
205
206 if (list_empty(&event->holder.event_list)) {
207 if (unlikely(holder))
208 fsnotify_destroy_event_holder(holder);
209 holder = &event->holder;
210 } else if (unlikely(!holder)) {
211 /* between the time we checked above and got the lock the in
 212		 * event holder was used; go back and get a new one */
213 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217
218 if (!list_empty(list)) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
220 last_event = last_holder->event;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 }
228 }
229
230 group->q_len++;
231 holder->event = event;
232
233 fsnotify_get_event(event);
234 list_add_tail(&holder->event_list, list);
235 if (priv)
236 list_add_tail(&priv->event_list, &event->private_data_list);
237 spin_unlock(&event->lock);
238 mutex_unlock(&group->notification_mutex);
239
240 wake_up(&group->notification_waitq);
241 return ret;
242}
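Concretely, the tail-merge works like this: if the same watched directory produces two back-to-back IN_MODIFY events for the same child name, the second fsnotify_add_notify_event() call finds the first still at the tail of the list, event_compare() matches, and the caller gets -EEXIST, which inotify_handle_event() earlier treats as success rather than an error.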
243
244/*
245 * Remove and return the first event from the notification list. There is a
246 * reference held on this event since it was on the list. It is the responsibility
247 * of the caller to drop this reference.
248 */
249struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
250{
251 struct fsnotify_event *event;
252 struct fsnotify_event_holder *holder;
253
254 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257
258 event = holder->event;
259
260 spin_lock(&event->lock);
261 holder->event = NULL;
262 list_del_init(&holder->event_list);
263 spin_unlock(&event->lock);
264
 265	/* holder == &event->holder means we are referenced through the in event holder */
266 if (holder != &event->holder)
267 fsnotify_destroy_event_holder(holder);
268
269 group->q_len--;
270
271 return event;
272}
273
274/*
 275 * This will not remove the event; that must be done with fsnotify_remove_notify_event()
276 */
277struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
278{
279 struct fsnotify_event *event;
280 struct fsnotify_event_holder *holder;
281
282 BUG_ON(!mutex_is_locked(&group->notification_mutex));
283
284 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
285 event = holder->event;
286
287 return event;
288}
289
290/*
291 * Called when a group is being torn down to clean up any outstanding
292 * event notifications.
293 */
294void fsnotify_flush_notify(struct fsnotify_group *group)
295{
296 struct fsnotify_event *event;
297 struct fsnotify_event_private_data *priv;
298
299 mutex_lock(&group->notification_mutex);
300 while (!fsnotify_notify_queue_is_empty(group)) {
301 event = fsnotify_remove_notify_event(group);
302 /* if they don't implement free_event_priv they better not have attached any */
303 if (group->ops->free_event_priv) {
304 spin_lock(&event->lock);
305 priv = fsnotify_remove_priv_from_event(group, event);
306 spin_unlock(&event->lock);
307 if (priv)
308 group->ops->free_event_priv(priv);
309 }
310 fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
311 }
312 mutex_unlock(&group->notification_mutex);
313}
314
315static void initialize_event(struct fsnotify_event *event)
316{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1);
320
321 spin_lock_init(&event->lock);
322
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list);
329
330 event->to_tell = NULL;
331
332 event->file_name = NULL;
333 event->name_len = 0;
334
335 event->sync_cookie = 0;
336}
337
338/*
339 * fsnotify_create_event - Allocate a new event which will be sent to each
340 * group's handle_event function if the group was interested in this
341 * particular event.
342 *
343 * @to_tell the inode which is supposed to receive the event (sometimes a
344 * parent of the inode to which the event happened.
345 * @mask what actually happened.
346 * @data pointer to the object which was actually affected
 347 * @data_type flag indicating whether the data is a file, path, inode, nothing...
348 * @name the filename, if available
349 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie,
352 gfp_t gfp)
353{
354 struct fsnotify_event *event;
355
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
357 if (!event)
358 return NULL;
359
360 initialize_event(event);
361
362 if (name) {
363 event->file_name = kstrdup(name, gfp);
364 if (!event->file_name) {
365 kmem_cache_free(fsnotify_event_cachep, event);
366 return NULL;
367 }
368 event->name_len = strlen(event->file_name);
369 }
370
371 event->sync_cookie = cookie;
372 event->to_tell = to_tell;
373
374 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data;
377 struct path *path = &file->f_path;
378 event->path.dentry = path->dentry;
379 event->path.mnt = path->mnt;
380 path_get(&event->path);
381 event->data_type = FSNOTIFY_EVENT_PATH;
382 break;
383 }
384 case FSNOTIFY_EVENT_PATH: {
385 struct path *path = data;
386 event->path.dentry = path->dentry;
387 event->path.mnt = path->mnt;
388 path_get(&event->path);
389 event->data_type = FSNOTIFY_EVENT_PATH;
390 break;
391 }
392 case FSNOTIFY_EVENT_INODE:
393 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break;
396 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL;
398 event->path.dentry = NULL;
399 event->path.mnt = NULL;
400 break;
401 default:
402 BUG();
403 }
404
405 event->mask = mask;
406
407 return event;
408}
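A representative caller is inotify_ignored_and_remove_idr() earlier in this series; its shape, reduced to a sketch (hypothetical helper name, error handling elided):

static void queue_ignored(struct fsnotify_group *group)
{
	struct fsnotify_event *event;

	event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
				      FSNOTIFY_EVENT_NONE, NULL, 0,
				      GFP_NOFS);
	if (!event)
		return;

	/* on success the queue takes its own reference to the event... */
	fsnotify_add_notify_event(group, event, NULL);

	/* ...so we always drop the one fsnotify_create_event() gave us */
	fsnotify_put_event(event);
}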
409
410__init int fsnotify_notification_init(void)
411{
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414
415 initialize_event(&q_overflow_event);
416 q_overflow_event.mask = FS_Q_OVERFLOW;
417
418 return 0;
419}
420subsys_initcall(fsnotify_notification_init);
421
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 82c5085559c6..9938034762cc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/quotaops.h> 28#include <linux/quotaops.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/log2.h>
30 31
31#include "aops.h" 32#include "aops.h"
32#include "attrib.h" 33#include "attrib.h"
@@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1570 ntfs_debug("Index collation rule is 0x%x.", 1571 ntfs_debug("Index collation rule is 0x%x.",
1571 le32_to_cpu(ir->collation_rule)); 1572 le32_to_cpu(ir->collation_rule));
1572 ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); 1573 ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
1573 if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) { 1574 if (!is_power_of_2(ni->itype.index.block_size)) {
1574 ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " 1575 ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
1575 "two.", ni->itype.index.block_size); 1576 "two.", ni->itype.index.block_size);
1576 goto unm_err_out; 1577 goto unm_err_out;
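is_power_of_2() from linux/log2.h is defined as (n != 0 && ((n & (n - 1)) == 0)): subtracting one from a power of two flips every bit below the single set bit, so the AND is zero exactly for powers of two (4096 & 4095 == 0, but 4097 & 4096 != 0). Besides reading better, it also rejects zero, which the open-coded test it replaces here (and in logfile.c below) would have let through.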
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index d7932e95b1fd..89b02985c054 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -26,6 +26,7 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/log2.h>
29 30
30#include "attrib.h" 31#include "attrib.h"
31#include "aops.h" 32#include "aops.h"
@@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi,
65 logfile_log_page_size < NTFS_BLOCK_SIZE || 66 logfile_log_page_size < NTFS_BLOCK_SIZE ||
66 logfile_system_page_size & 67 logfile_system_page_size &
67 (logfile_system_page_size - 1) || 68 (logfile_system_page_size - 1) ||
68 logfile_log_page_size & (logfile_log_page_size - 1)) { 69 !is_power_of_2(logfile_log_page_size)) {
69 ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); 70 ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
70 return false; 71 return false;
71 } 72 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951dcd4a6..abaaa1cbf8de 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/spinlock.h> 27#include <linux/spinlock.h>
28#include <linux/blkdev.h> /* For bdev_hardsect_size(). */ 28#include <linux/blkdev.h> /* For bdev_logical_block_size(). */
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
443 ntfs_volume *vol = NTFS_SB(sb); 443 ntfs_volume *vol = NTFS_SB(sb);
444 444
445 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
446
447 lock_kernel();
446#ifndef NTFS_RW 448#ifndef NTFS_RW
447 /* For read-only compiled driver, enforce read-only flag. */ 449 /* For read-only compiled driver, enforce read-only flag. */
448 *flags |= MS_RDONLY; 450 *flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt))
+	if (!parse_options(vol, opt)) {
+		unlock_kernel();
 		return -EINVAL;
+	}
+	unlock_kernel();
 	ntfs_debug("Done.");
 	return 0;
 }
@@ -2246,6 +2259,9 @@ static void ntfs_put_super(struct super_block *sb)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering.");
+
+	lock_kernel();
+
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -2373,39 +2389,12 @@ static void ntfs_put_super(struct super_block *sb)
 		vol->mftmirr_ino = NULL;
 	}
 	/*
-	 * If any dirty inodes are left, throw away all mft data page cache
-	 * pages to allow a clean umount.  This should never happen any more
-	 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.  If it does,
-	 * happen anyway, we want to know...
+	 * We should have no dirty inodes left, due to
+	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+	 * the underlying mft records are written out and cleaned.
 	 */
 	ntfs_commit_inode(vol->mft_ino);
 	write_inode_now(vol->mft_ino, 1);
-	if (sb_has_dirty_inodes(sb)) {
-		const char *s1, *s2;
-
-		mutex_lock(&vol->mft_ino->i_mutex);
-		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		mutex_unlock(&vol->mft_ino->i_mutex);
-		write_inode_now(vol->mft_ino, 1);
-		if (sb_has_dirty_inodes(sb)) {
-			static const char *_s1 = "inodes";
-			static const char *_s2 = "";
-			s1 = _s1;
-			s2 = _s2;
-		} else {
-			static const char *_s1 = "mft pages";
-			static const char *_s2 = "They have been thrown "
-					"away. ";
-			s1 = _s1;
-			s2 = _s2;
-		}
-		ntfs_error(sb, "Dirty %s found at umount time. %sYou should "
-				"run chkdsk. Please email "
-				"linux-ntfs-dev@lists.sourceforge.net and say "
-				"that you saw this message. Thank you.", s1,
-				s2);
-	}
 #endif /* NTFS_RW */
 
 	iput(vol->mft_ino);
@@ -2444,7 +2433,8 @@ static void ntfs_put_super(struct super_block *sb)
 	}
 	sb->s_fs_info = NULL;
 	kfree(vol);
-	return;
+
+	unlock_kernel();
 }
 
 /**
@@ -2785,13 +2775,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto err_out_now;
 
 	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
-	if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
 		if (!silent)
 			ntfs_error(sb, "Device has unsupported sector size "
 					"(%i).  The maximum supported sector "
 					"size on this architecture is %lu "
 					"bytes.",
-					bdev_hardsect_size(sb->s_bdev),
+					bdev_logical_block_size(sb->s_bdev),
 					PAGE_CACHE_SIZE);
 		goto err_out_now;
 	}
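
Two independent changes run through the super.c hunks above: the tree-wide rename of bdev_hardsect_size() to bdev_logical_block_size(), and pushing the big kernel lock down into ntfs_remount()/ntfs_put_super() now that the VFS no longer takes it for the filesystem. The unlock-before-every-return shape added to ntfs_remount() is equivalent to the single-exit pattern sketched below (standalone userspace model; all names here are illustrative, not from the patch):

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Models the reworked remount: take the lock once on entry and make
	 * every early error return funnel through one unlock site. */
	static int remount_like(int volume_dirty, int options_ok)
	{
		int err = 0;

		pthread_mutex_lock(&big_lock);
		if (volume_dirty) {
			err = -EROFS;
			goto out;
		}
		if (!options_ok) {
			err = -EINVAL;
			goto out;
		}
	out:
		pthread_mutex_unlock(&big_lock);
		return err;
	}

	int main(void)
	{
		/* prints 0 followed by two negative errno values */
		printf("%d %d %d\n", remount_like(0, 1), remount_like(1, 1),
		       remount_like(0, 0));
		return 0;
	}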
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 678a067d9251..ab513ddaeff2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -475,6 +475,12 @@ struct ocfs2_path {
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
 
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+			   u32 cpos);
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+					   handle_t *handle,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec);
 /*
  * Reset the actual path elements so that we can re-use the structure
  * to build another path. Generally, this involves freeing the buffer
@@ -1013,6 +1019,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
 }
 
 /*
+ * Change range of the branches in the right most path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+					 struct inode *inode,
+					 struct ocfs2_extent_tree *et)
+{
+	int status;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		status = -ENOMEM;
+		return status;
+	}
+
+	status = ocfs2_find_path(inode, path, UINT_MAX);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_extend_trans(handle, path_num_items(path) +
+				    handle->h_buffer_credits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_path(inode, handle, path);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
+
+	ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+
+out:
+	ocfs2_free_path(path);
+	return status;
+}
+
+/*
  * Add an entire tree branch to our inode. eb_bh is the extent block
  * to start at, if we don't want to start the branch at the dinode
  * structure.
@@ -1038,7 +1092,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *eb_el;
 	struct ocfs2_extent_list *el;
-	u32 new_cpos;
+	u32 new_cpos, root_end;
 
 	mlog_entry_void();
 
@@ -1055,6 +1109,27 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	new_blocks = le16_to_cpu(el->l_tree_depth);
 
+	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+
+	/*
+	 * If there is a gap before the root end and the real end
+	 * of the righmost leaf block, we need to remove the gap
+	 * between new_cpos and root_end first so that the tree
+	 * is consistent after we add a new branch(it will start
+	 * from new_cpos).
+	 */
+	if (root_end > new_cpos) {
+		mlog(0, "adjust the cluster end from %u to %u\n",
+		     root_end, new_cpos);
+		status = ocfs2_adjust_rightmost_branch(handle, inode, et);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	/* allocate the number of new eb blocks we need */
 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
 			     GFP_KERNEL);
@@ -1071,9 +1146,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
-	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
-
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 	 * linked with the rest of the tree.
 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -1842,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
-	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+		BUG_ON(right_child_el->l_tree_depth);
 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
 	}
@@ -2404,15 +2477,37 @@ out_ret_path:
 	return ret;
 }
 
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
-				      struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+				     int subtree_index, struct ocfs2_path *path)
 {
-	int i, idx;
+	int i, idx, ret;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	u32 range;
 
+	/*
+	 * In normal tree rotation process, we will never touch the
+	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+	 * doesn't reserve the credits for them either.
+	 *
+	 * But we do have a special case here which will update the rightmost
+	 * records for all the bh in the path.
+	 * So we have to allocate extra credits and access them.
+	 */
+	ret = ocfs2_extend_trans(handle,
+				 handle->h_buffer_credits + subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/* Path should always be rightmost. */
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2433,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
 
 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
 	}
+out:
+	return ret;
 }
 
 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2645,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -2962,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 
 	ocfs2_unlink_subtree(inode, handle, left_path, path,
 			     subtree_index, dealloc);
-	ocfs2_update_edge_lengths(inode, handle, left_path);
+	ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+					left_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 	ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -6744,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 	status = 0;
 bail:
-
+	brelse(last_eb_bh);
 	mlog_exit(status);
 	return status;
 }
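
A theme in the alloc.c hunks: helpers that journal extra blocks (ocfs2_adjust_rightmost_branch(), the reworked ocfs2_update_edge_lengths()) first grow the transaction's credit reservation, then gain an int return so callers can propagate failure instead of continuing on a half-journaled tree. A standalone sketch of that convert-void-to-int shape (illustrative names, not the ocfs2 API):

	#include <stdio.h>

	/* Stand-ins for a journal handle and a tree path. */
	struct handle { int credits; };
	struct path  { int items;   };

	static int extend_trans(struct handle *h, int want)
	{
		h->credits = want;	/* pretend the journal granted them */
		return 0;		/* or a negative errno on failure */
	}

	/* Was "void": now reserves credits first and reports errors up. */
	static int update_edge_lengths(struct handle *h, struct path *p)
	{
		int ret = extend_trans(h, h->credits + p->items);
		if (ret)
			return ret;
		/* ... walk the path and update the rightmost records ... */
		return 0;
	}

	int main(void)
	{
		struct handle h = { .credits = 8 };
		struct path p = { .items = 3 };
		printf("%d (credits now %d)\n",
		       update_edge_lengths(&h, &p), h.credits);
		return 0;	/* prints: 0 (credits now 11) */
	}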
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a1484..b401654011a2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 		mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 		dump_stack();
+		goto bail;
 	}
 
 	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
@@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc {
 	 */
 	unsigned c_new;
 	unsigned c_unwritten;
+	unsigned c_needs_zero;
 };
 
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
-	return d->c_new || d->c_unwritten;
-}
-
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32 w_cpos;
 	u32 w_clen;
 
+	/* First cluster allocated in a nonsparse extend */
+	u32 w_first_new_cpos;
+
 	struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
 	/*
@@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_first_new_cpos = UINT_MAX;
 	cend = (pos + len - 1) >> osb->s_clustersize_bits;
 	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
@@ -1217,20 +1218,18 @@ out:
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
 			       u32 phys, unsigned int unwritten,
+			       unsigned int should_zero,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new, should_zero = 0;
+	int ret, i, new;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
-	if (new || unwritten)
-		should_zero = 1;
-
 	if (new) {
 		u32 tmp_pos;
 
@@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
-				tmpret = ret;
+				ret = tmpret;
 		}
 	}
 
@@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 			local_len = osb->s_clustersize - cluster_off;
 
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
-					  desc->c_unwritten, data_ac, meta_ac,
+					  desc->c_unwritten,
+					  desc->c_needs_zero,
+					  data_ac, meta_ac,
 					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
@@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	 * newly allocated cluster.
 	 */
 	desc = &wc->w_desc[0];
-	if (ocfs2_should_zero_cluster(desc))
+	if (desc->c_needs_zero)
 		ocfs2_figure_cluster_boundaries(osb,
 						desc->c_cpos,
 						&wc->w_target_from,
 						NULL);
 
 	desc = &wc->w_desc[wc->w_clen - 1];
-	if (ocfs2_should_zero_cluster(desc))
+	if (desc->c_needs_zero)
 		ocfs2_figure_cluster_boundaries(osb,
 						desc->c_cpos,
 						NULL,
@@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 			phys++;
 		}
 
+		/*
+		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+		 * file that got extended.  w_first_new_cpos tells us
+		 * where the newly allocated clusters are so we can
+		 * zero them.
+		 */
+		if (desc->c_cpos >= wc->w_first_new_cpos) {
+			BUG_ON(phys == 0);
+			desc->c_needs_zero = 1;
+		}
+
 		desc->c_phys = phys;
 		if (phys == 0) {
 			desc->c_new = 1;
+			desc->c_needs_zero = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
-		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
 			desc->c_unwritten = 1;
+			desc->c_needs_zero = 1;
+		}
 
 		num_clusters--;
 	}
@@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	if (newsize <= i_size_read(inode))
 		return 0;
 
-	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	ret = ocfs2_extend_no_holes(inode, newsize, pos);
 	if (ret)
 		mlog_errno(ret);
 
+	wc->w_first_new_cpos =
+		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
 	return ret;
 }
 
@@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
@@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	}
 
-	ocfs2_set_target_boundaries(osb, wc, pos, len,
-				    clusters_to_alloc + extents_to_split);
+	/*
+	 * We have to zero sparse allocated clusters, unwritten extent clusters,
+	 * and non-sparse clusters we just extended.  For non-sparse writes,
+	 * we know zeros will only be needed in the first and/or last cluster.
+	 */
+	if (clusters_to_alloc || extents_to_split ||
+	    wc->w_desc[0].c_needs_zero ||
+	    wc->w_desc[wc->w_clen - 1].c_needs_zero)
+		cluster_of_pages = 1;
+	else
+		cluster_of_pages = 0;
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc + extents_to_split,
-					 mmap_page);
+					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_quota;
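
The aops.c hunks fold three zeroing triggers into one c_needs_zero flag: freshly allocated clusters, unwritten extents, and clusters at or past w_first_new_cpos after a non-sparse extend. A standalone model of that decision (simplified; the real code also tracks physical allocation and page boundaries):

	#include <stdio.h>

	struct cluster_desc {
		unsigned c_new, c_unwritten, c_needs_zero;
	};

	/* Mirrors the flag logic in ocfs2_populate_write_desc() above. */
	static void classify(struct cluster_desc *d, unsigned cpos,
			     unsigned first_new_cpos, unsigned phys,
			     int unwritten)
	{
		if (cpos >= first_new_cpos)	/* non-sparse extend */
			d->c_needs_zero = 1;
		if (phys == 0) {		/* sparse hole: allocate + zero */
			d->c_new = 1;
			d->c_needs_zero = 1;
		}
		if (unwritten) {		/* unwritten extent: zero on write */
			d->c_unwritten = 1;
			d->c_needs_zero = 1;
		}
	}

	int main(void)
	{
		struct cluster_desc d = { 0 };
		classify(&d, 5, 3, 42, 0);	/* past old i_size, already allocated */
		printf("new=%u unwritten=%u needs_zero=%u\n",
		       d.c_new, d.c_unwritten, d.c_needs_zero);	/* 0 0 1 */
		return 0;
	}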
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2a947c44e594..a1163b8b417c 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -22,6 +22,9 @@
 #include <linux/crc32.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
 #include <asm/byteorder.h>
 
 #include <cluster/masklog.h>
@@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
 	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
 }
 
+
+/*
+ * Debugfs handling.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+
+static struct dentry *blockcheck_debugfs_create(const char *name,
+						struct dentry *parent,
+						u64 *value)
+{
+	return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+				   &blockcheck_fops);
+}
+
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	if (stats) {
+		debugfs_remove(stats->b_debug_check);
+		stats->b_debug_check = NULL;
+		debugfs_remove(stats->b_debug_failure);
+		stats->b_debug_failure = NULL;
+		debugfs_remove(stats->b_debug_recover);
+		stats->b_debug_recover = NULL;
+		debugfs_remove(stats->b_debug_dir);
+		stats->b_debug_dir = NULL;
+	}
+}
+
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+					  struct dentry *parent)
+{
+	int rc = -EINVAL;
+
+	if (!stats)
+		goto out;
+
+	stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+	if (!stats->b_debug_dir)
+		goto out;
+
+	stats->b_debug_check =
+		blockcheck_debugfs_create("blocks_checked",
+					  stats->b_debug_dir,
+					  &stats->b_check_count);
+
+	stats->b_debug_failure =
+		blockcheck_debugfs_create("checksums_failed",
+					  stats->b_debug_dir,
+					  &stats->b_failure_count);
+
+	stats->b_debug_recover =
+		blockcheck_debugfs_create("ecc_recoveries",
+					  stats->b_debug_dir,
+					  &stats->b_recover_count);
+	if (stats->b_debug_check && stats->b_debug_failure &&
+	    stats->b_debug_recover)
+		rc = 0;
+
+out:
+	if (rc)
+		ocfs2_blockcheck_debug_remove(stats);
+	return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+						 struct dentry *parent)
+{
+	return 0;
+}
+
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent)
+{
+	return ocfs2_blockcheck_debug_install(stats, parent);
+}
+
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	ocfs2_blockcheck_debug_remove(stats);
+}
+
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_check_count++;
+	new_count = stats->b_check_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_failure_count++;
+	new_count = stats->b_failure_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_recover_count++;
+	new_count = stats->b_recover_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+
+
+
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
+
 /*
  * This function generates check information for a block.
  * data is the block to be checked.  bc is a pointer to the
@@ -266,12 +418,15 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
  * Again, the data passed in should be the on-disk endian.
  */
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-			       struct ocfs2_block_check *bc)
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats)
 {
 	int rc = 0;
 	struct ocfs2_block_check check;
 	u32 crc, ecc;
 
+	ocfs2_blockcheck_inc_check(stats);
+
 	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
 	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
 
@@ -282,6 +437,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
 	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -292,8 +448,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 
 	/* And check the crc32 again */
 	crc = crc32_le(~0, data, blocksize);
-	if (crc == check.bc_crc32e)
+	if (crc == check.bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
 		goto out;
+	}
 
 	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -366,7 +524,8 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
  * Again, the data passed in should be the on-disk endian.
  */
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-				   struct ocfs2_block_check *bc)
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats)
 {
 	int i, rc = 0;
 	struct ocfs2_block_check check;
@@ -377,6 +536,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	if (!nr)
 		return 0;
 
+	ocfs2_blockcheck_inc_check(stats);
+
 	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
 	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
 
@@ -388,6 +549,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
 	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -416,8 +578,10 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	/* And check the crc32 again */
 	for (i = 0, crc = ~0; i < nr; i++)
 		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-	if (crc == check.bc_crc32e)
+	if (crc == check.bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
 		goto out;
+	}
 
 	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -448,9 +612,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
 			    struct ocfs2_block_check *bc)
 {
 	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
-		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+						&osb->osb_ecc_stats);
 
 	return rc;
 }
@@ -468,9 +634,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 				struct ocfs2_block_check *bc)
 {
 	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
-		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+						    &osb->osb_ecc_stats);
 
 	return rc;
 }
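
The counters above are exported through debugfs with DEFINE_SIMPLE_ATTRIBUTE, the stock kernel idiom for a read-only u64 file. A minimal self-contained module using the same idiom (a hedged sketch against the 2009-era debugfs API; the "demo" names are invented here, not from the patch):

	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/module.h>

	static u64 demo_count;
	static struct dentry *demo_dir;

	/* Read callback: copy the counter out; no write callback (read-only). */
	static int demo_u64_get(void *data, u64 *val)
	{
		*val = *(u64 *)data;
		return 0;
	}
	DEFINE_SIMPLE_ATTRIBUTE(demo_fops, demo_u64_get, NULL, "%llu\n");

	static int __init demo_init(void)
	{
		demo_dir = debugfs_create_dir("blockcheck_demo", NULL);
		if (!demo_dir)
			return -ENODEV;
		debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR,
				    demo_dir, &demo_count, &demo_fops);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		debugfs_remove_recursive(demo_dir);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");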
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 70ec3feda32f..d4b69febf70a 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -21,6 +21,24 @@
 #define OCFS2_BLOCKCHECK_H
 
 
+/* Count errors and error correction from blockcheck.c */
+struct ocfs2_blockcheck_stats {
+	spinlock_t b_lock;
+	u64 b_check_count;	/* Number of blocks we've checked */
+	u64 b_failure_count;	/* Number of failed checksums */
+	u64 b_recover_count;	/* Number of blocks fixed by ecc */
+
+	/*
+	 * debugfs entries, used if this is passed to
+	 * ocfs2_blockcheck_stats_debugfs_install()
+	 */
+	struct dentry *b_debug_dir;	/* Parent of the debugfs files */
+	struct dentry *b_debug_check;	/* Exposes b_check_count */
+	struct dentry *b_debug_failure;	/* Exposes b_failure_count */
+	struct dentry *b_debug_recover;	/* Exposes b_recover_count */
+};
+
+
 /* High level block API */
 void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
 			    struct ocfs2_block_check *bc);
@@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 void ocfs2_block_check_compute(void *data, size_t blocksize,
 			       struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-			       struct ocfs2_block_check *bc);
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats);
 void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
 				   struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-				   struct ocfs2_block_check *bc);
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats);
+
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
 
 /*
  * Hamming code functions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85eceab376..09cc25d04611 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 	bdevname(reg->hr_bdev, reg->hr_dev_name);
 
-	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	sectsize = bdev_logical_block_size(reg->hr_bdev);
 	if (sectsize != reg->hr_block_bytes) {
 		mlog(ML_ERROR,
 		     "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7e72a81bc2d4..696c32e50716 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -48,34 +48,33 @@
  * only emit the appropriage printk() when the caller passes in a constant
  * mask, as is almost always the case.
  *
- * All this bitmask nonsense is hidden from the /proc interface so that Joel
- * doesn't have an aneurism.  Reading the file gives a straight forward
- * indication of which bits are on or off:
- *	ENTRY off
- *	EXIT off
+ * All this bitmask nonsense is managed from the files under
+ * /sys/fs/o2cb/logmask/.  Reading the files gives a straightforward
+ * indication of which bits are allowed (allow) or denied (off/deny).
+ *	ENTRY deny
+ *	EXIT deny
  *	TCP off
  *	MSG off
  *	SOCKET off
- *	ERROR off
- *	NOTICE on
+ *	ERROR allow
+ *	NOTICE allow
  *
  * Writing changes the state of a given bit and requires a strictly formatted
  * single write() call:
  *
- *	write(fd, "ENTRY on", 8);
+ *	write(fd, "allow", 5);
  *
- * would turn the entry bit on.  "1" is also accepted in the place of "on", and
- * "off" and "0" behave as expected.
+ * Echoing allow/deny/off string into the logmask files can flip the bits
+ * on or off as expected; here is the bash script for example:
  *
- * Some trivial shell can flip all the bits on or off:
+ *	log_mask="/sys/fs/o2cb/log_mask"
+ *	for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ *		echo allow >"$log_mask"/"$node"
+ *	done
  *
- * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
- * cat $log_mask | (
- *	while read bit status; do
- *		# $1 is "on" or "off", say
- *		echo "$bit $1" > $log_mask
- *	done
- * )
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
+ *
+ *	debugfs.ocfs2 -l TCP allow
  */
 
 /* for task_struct */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9fbe849f6344..334f231a422c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -974,7 +974,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 			   size_t caller_veclen, u8 target_node, int *status)
 {
-	int ret, error = 0;
+	int ret;
 	struct o2net_msg *msg = NULL;
 	size_t veclen, caller_bytes = 0;
 	struct kvec *vec = NULL;
@@ -1015,10 +1015,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 
 	o2net_set_nst_sock_time(&nst);
 
-	ret = wait_event_interruptible(nn->nn_sc_wq,
-				       o2net_tx_can_proceed(nn, &sc, &error));
-	if (!ret && error)
-		ret = error;
+	wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
 	if (ret)
 		goto out;
 
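
The o2net change above drops the separate 'error' out-parameter: since the wait is no longer interruptible, the predicate o2net_tx_can_proceed() can deposit the final status straight into 'ret'. A standalone model of a predicate that both signals readiness and reports status through a pointer (illustrative only, not the o2net API):

	#include <errno.h>
	#include <stdio.h>

	/* Returns nonzero when the caller may stop waiting; the outcome of
	 * the wait (0 or a negative errno) comes back through *status. */
	static int tx_can_proceed(int socket_valid, int *status)
	{
		if (!socket_valid) {
			*status = -ENOTCONN;	/* done: fail immediately */
			return 1;
		}
		*status = 0;			/* done: ready to transmit */
		return 1;
	}

	int main(void)
	{
		int ret;

		/* wait_event(wq, tx_can_proceed(...)) reduces to this when
		 * the condition is already true on entry. */
		while (!tx_can_proceed(1, &ret))
			;
		printf("%d\n", ret);	/* prints: 0 */
		return 0;
	}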
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031d..2f28b7de2c8d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,22 +310,19 @@ out_attach:
 	return ret;
 }
 
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
 
 /* We limit the number of dentry locks to drop in one go. We have
  * this limit so that we don't starve other users of ocfs2_wq. */
 #define DL_INODE_DROP_COUNT 64
 
 /* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
 {
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
 	struct ocfs2_dentry_lock *dl;
-	int drop_count = DL_INODE_DROP_COUNT;
 
 	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && drop_count--) {
+	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
 		dl = osb->dentry_lock_list;
 		osb->dentry_lock_list = dl->dl_next;
 		spin_unlock(&dentry_list_lock);
@@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
 		kfree(dl);
 		spin_lock(&dentry_list_lock);
 	}
-	if (osb->dentry_lock_list)
+	spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+
+	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+	/*
+	 * Don't queue dropping if umount is in progress. We flush the
+	 * list in ocfs2_dismount_volume
+	 */
+	spin_lock(&dentry_list_lock);
+	if (osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	spin_unlock(&dentry_list_lock);
 }
 
+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+	__ocfs2_drop_dl_inodes(osb, -1);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 	/* We leave dropping of inode reference to ocfs2_wq as that can
 	 * possibly lead to inode deletion which gets tricky */
 	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list)
+	if (!osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	dl->dl_next = osb->dentry_lock_list;
 	osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98d..f5dd1789acf1 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 			     u64 parent_blkno);
 
+extern spinlock_t dentry_list_lock;
+
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
 void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
 
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c5752305627c..b358f3bf896d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2900,6 +2900,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 	dx_alloc = 0;
 
+	down_write(&oi->ip_alloc_sem);
+
 	if (ocfs2_supports_indexed_dirs(osb)) {
 		credits += ocfs2_add_dir_index_credits(sb);
 
@@ -2940,8 +2942,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out;
 	}
 
-	down_write(&oi->ip_alloc_sem);
-
 	/*
 	 * Prepare for worst case allocation scenario of two separate
 	 * extents in the unindexed tree.
@@ -2953,7 +2953,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_sem;
+		goto out;
 	}
 
 	if (vfs_dq_alloc_space_nodirty(dir,
@@ -3172,10 +3172,8 @@ out_commit:
 
 	ocfs2_commit_trans(osb, handle);
 
-out_sem:
-	up_write(&oi->ip_alloc_sem);
-
 out:
+	up_write(&oi->ip_alloc_sem);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	if (meta_ac)
@@ -3322,11 +3320,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		brelse(new_bh);
 		new_bh = NULL;
 
+		down_write(&OCFS2_I(dir)->ip_alloc_sem);
+		drop_alloc_sem = 1;
 		dir_i_size = i_size_read(dir);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
 		goto do_extend;
 	}
 
+	down_write(&OCFS2_I(dir)->ip_alloc_sem);
+	drop_alloc_sem = 1;
 	dir_i_size = i_size_read(dir);
 	mlog(0, "extending dir %llu (i_size = %lld)\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -3370,9 +3372,6 @@ do_extend:
 		credits++; /* For attaching the new dirent block to the
 			    * dx_root */
 
-	down_write(&OCFS2_I(dir)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -3435,10 +3434,10 @@ bail_bh:
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
-	if (drop_alloc_sem)
-		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
+	if (drop_alloc_sem)
+		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b283..81eff8e58322 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 		     lock->ast_pending, lock->ml.type);
 		BUG();
 	}
-	BUG_ON(!list_empty(&lock->ast_list));
 	if (lock->ast_pending)
 		mlog(0, "lock has an ast getting flushed right now\n");
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c3735..43e6e3280569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 
 	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
 	     dlm->name, res->lockname.len, res->lockname.name,
-	     orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
 	     send_to);
 
 	/* send it */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed6930..756f5b0998e0 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
 	spin_unlock(&dlm->ast_lock);
-	if (in_use) {
+	if (in_use && !(flags & LKM_CANCEL)) {
 		mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		     "while waiting for an ast!", res->lockname.len,
 		     res->lockname.name);
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
-		if (master_node) {
+		if (master_node && !(flags & LKM_CANCEL)) {
 			mlog(ML_ERROR, "lockres in progress!\n");
 			spin_unlock(&res->spinlock);
 			return DLM_FORWARD;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d50827..110bb57c46ab 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -92,6 +92,9 @@ struct ocfs2_unblock_ctl {
 	enum ocfs2_unblock_action unblock_action;
 };
 
+/* Lockdep class keys */
+struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+
 static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
 					int new_level);
 static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
@@ -248,6 +251,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -313,9 +320,16 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 				u32 dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
 						     int wanted);
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
-				 struct ocfs2_lock_res *lockres,
-				 int level);
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				   struct ocfs2_lock_res *lockres,
+				   int level, unsigned long caller_ip);
+static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres,
+					int level)
+{
+	__ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
+}
+
 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
@@ -485,6 +499,13 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
 	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
 
 	ocfs2_init_lock_stats(res);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (type != OCFS2_LOCK_TYPE_OPEN)
+		lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
+				 &lockdep_keys[type], 0);
+	else
+		res->l_lockdep_map.key = NULL;
+#endif
 }
 
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -637,6 +658,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
 			   &ocfs2_nfs_sync_lops, osb);
 }
 
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+					    struct ocfs2_super *osb)
+{
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+				   &ocfs2_orphan_scan_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -1239,11 +1269,13 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
 	return ret;
 }
 
-static int ocfs2_cluster_lock(struct ocfs2_super *osb,
-			      struct ocfs2_lock_res *lockres,
-			      int level,
-			      u32 lkm_flags,
-			      int arg_flags)
+static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres,
+				int level,
+				u32 lkm_flags,
+				int arg_flags,
+				int l_subclass,
+				unsigned long caller_ip)
 {
 	struct ocfs2_mask_waiter mw;
 	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
@@ -1386,13 +1418,37 @@ out:
 	}
 	ocfs2_update_lock_stats(lockres, level, &mw, ret);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!ret && lockres->l_lockdep_map.key != NULL) {
+		if (level == DLM_LOCK_PR)
+			rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
+				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+				caller_ip);
+		else
+			rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
+				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+				caller_ip);
+	}
+#endif
 	mlog_exit(ret);
 	return ret;
 }
 
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
-				 struct ocfs2_lock_res *lockres,
-				 int level)
+static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
+				     struct ocfs2_lock_res *lockres,
+				     int level,
+				     u32 lkm_flags,
+				     int arg_flags)
+{
+	return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
+				    0, _RET_IP_);
+}
+
+
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				   struct ocfs2_lock_res *lockres,
+				   int level,
+				   unsigned long caller_ip)
 {
 	unsigned long flags;
 
@@ -1401,6 +1457,10 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1401 ocfs2_dec_holders(lockres, level); 1457 ocfs2_dec_holders(lockres, level);
1402 ocfs2_downconvert_on_unlock(osb, lockres); 1458 ocfs2_downconvert_on_unlock(osb, lockres);
1403 spin_unlock_irqrestore(&lockres->l_lock, flags); 1459 spin_unlock_irqrestore(&lockres->l_lock, flags);
1460#ifdef CONFIG_DEBUG_LOCK_ALLOC
1461 if (lockres->l_lockdep_map.key != NULL)
1462 rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
1463#endif
1404 mlog_exit_void(); 1464 mlog_exit_void();
1405} 1465}
1406 1466
@@ -1972,7 +2032,8 @@ static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1972{ 2032{
1973 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2033 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1974 2034
1975 if (lvb->lvb_version == OCFS2_LVB_VERSION 2035 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
2036 && lvb->lvb_version == OCFS2_LVB_VERSION
1976 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 2037 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
1977 return 1; 2038 return 1;
1978 return 0; 2039 return 0;
@@ -2145,10 +2206,11 @@ static int ocfs2_assign_bh(struct inode *inode,
2145 * returns < 0 error if the callback will never be called, otherwise 2206 * returns < 0 error if the callback will never be called, otherwise
2146 * the result of the lock will be communicated via the callback. 2207 * the result of the lock will be communicated via the callback.
2147 */ 2208 */
2148int ocfs2_inode_lock_full(struct inode *inode, 2209int ocfs2_inode_lock_full_nested(struct inode *inode,
2149 struct buffer_head **ret_bh, 2210 struct buffer_head **ret_bh,
2150 int ex, 2211 int ex,
2151 int arg_flags) 2212 int arg_flags,
2213 int subclass)
2152{ 2214{
2153 int status, level, acquired; 2215 int status, level, acquired;
2154 u32 dlm_flags; 2216 u32 dlm_flags;
@@ -2186,7 +2248,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
2186 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 2248 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
2187 dlm_flags |= DLM_LKF_NOQUEUE; 2249 dlm_flags |= DLM_LKF_NOQUEUE;
2188 2250
2189 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); 2251 status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
2252 arg_flags, subclass, _RET_IP_);
2190 if (status < 0) { 2253 if (status < 0) {
2191 if (status != -EAGAIN && status != -EIOCBRETRY) 2254 if (status != -EAGAIN && status != -EIOCBRETRY)
2192 mlog_errno(status); 2255 mlog_errno(status);
@@ -2352,6 +2415,47 @@ void ocfs2_inode_unlock(struct inode *inode,
2352 mlog_exit_void(); 2415 mlog_exit_void();
2353} 2416}
2354 2417
2418int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
2419{
2420 struct ocfs2_lock_res *lockres;
2421 struct ocfs2_orphan_scan_lvb *lvb;
2422 int status = 0;
2423
2424 if (ocfs2_is_hard_readonly(osb))
2425 return -EROFS;
2426
2427 if (ocfs2_mount_local(osb))
2428 return 0;
2429
2430 lockres = &osb->osb_orphan_scan.os_lockres;
2431 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
2432 if (status < 0)
2433 return status;
2434
2435 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2436 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
2437 lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
2438 *seqno = be32_to_cpu(lvb->lvb_os_seqno);
2439 else
2440 *seqno = osb->osb_orphan_scan.os_seqno + 1;
2441
2442 return status;
2443}
2444
2445void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
2446{
2447 struct ocfs2_lock_res *lockres;
2448 struct ocfs2_orphan_scan_lvb *lvb;
2449
2450 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
2451 lockres = &osb->osb_orphan_scan.os_lockres;
2452 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2453 lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
2454 lvb->lvb_os_seqno = cpu_to_be32(seqno);
2455 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2456 }
2457}
2458
2355int ocfs2_super_lock(struct ocfs2_super *osb, 2459int ocfs2_super_lock(struct ocfs2_super *osb,
2356 int ex) 2460 int ex)
2357{ 2461{
@@ -2842,6 +2946,7 @@ local:
2842 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2946 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2843 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2947 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2844 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); 2948 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
2949 ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
2845 2950
2846 osb->cconn = conn; 2951 osb->cconn = conn;
2847 2952
@@ -2878,6 +2983,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2878 ocfs2_lock_res_free(&osb->osb_super_lockres); 2983 ocfs2_lock_res_free(&osb->osb_super_lockres);
2879 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2984 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2880 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); 2985 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
2986 ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
2881 2987
2882 ocfs2_cluster_disconnect(osb->cconn, hangup_pending); 2988 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2883 osb->cconn = NULL; 2989 osb->cconn = NULL;
@@ -3061,6 +3167,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
3061 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); 3167 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
3062 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); 3168 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
3063 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); 3169 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
3170 ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
3064} 3171}
3065 3172
3066int ocfs2_drop_inode_locks(struct inode *inode) 3173int ocfs2_drop_inode_locks(struct inode *inode)
@@ -3576,7 +3683,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3576 struct ocfs2_global_disk_dqinfo *gdinfo; 3683 struct ocfs2_global_disk_dqinfo *gdinfo;
3577 int status = 0; 3684 int status = 0;
3578 3685
3579 if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { 3686 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
3687 lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
3580 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); 3688 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3581 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); 3689 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3582 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); 3690 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
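
The hunks above thread a caller_ip argument, captured as _RET_IP_ in the wrappers, into __ocfs2_cluster_lock() and __ocfs2_cluster_unlock() so the lockdep annotations point at the real call site rather than the wrapper. A minimal user-space model of that plumbing; it assumes GCC/Clang builtins, and every demo_* name is invented:

#include <stdio.h>

/* stand-in for the out-of-line lock worker */
static void __demo_cluster_lock(int level, unsigned long caller_ip)
{
        printf("level %d requested from caller ip %#lx\n", level, caller_ip);
}

/* like the wrapper above: record where we were called from and pass it
 * down, so the worker can report the real call site */
static void __attribute__((noinline)) demo_cluster_lock(int level)
{
        /* _RET_IP_ equivalent: the address this call returns to */
        __demo_cluster_lock(level,
                            (unsigned long)__builtin_return_address(0));
}

int main(void)
{
        demo_cluster_lock(5);   /* caller_ip lands inside main() */
        return 0;
}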
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd5721cd7f..7553836931de 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
62 __be32 lvb_free_entry; 62 __be32 lvb_free_entry;
63}; 63};
64 64
65#define OCFS2_ORPHAN_LVB_VERSION 1
66
67struct ocfs2_orphan_scan_lvb {
68 __u8 lvb_version;
69 __u8 lvb_reserved[3];
70 __be32 lvb_os_seqno;
71};
72
65/* ocfs2_inode_lock_full() 'arg_flags' flags */ 73/* ocfs2_inode_lock_full() 'arg_flags' flags */
66/* don't wait on recovery. */ 74/* don't wait on recovery. */
67#define OCFS2_META_LOCK_RECOVERY (0x01) 75#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -70,6 +78,14 @@ struct ocfs2_qinfo_lvb {
70/* don't block waiting for the downconvert thread, instead return -EAGAIN */ 78/* don't block waiting for the downconvert thread, instead return -EAGAIN */
71#define OCFS2_LOCK_NONBLOCK (0x04) 79#define OCFS2_LOCK_NONBLOCK (0x04)
72 80
81/* Locking subclasses of inode cluster lock */
82enum {
83 OI_LS_NORMAL = 0,
84 OI_LS_PARENT,
85 OI_LS_RENAME1,
86 OI_LS_RENAME2,
87};
88
73int ocfs2_dlm_init(struct ocfs2_super *osb); 89int ocfs2_dlm_init(struct ocfs2_super *osb);
74void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); 90void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
75void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); 91void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
@@ -96,23 +112,32 @@ void ocfs2_open_unlock(struct inode *inode);
96int ocfs2_inode_lock_atime(struct inode *inode, 112int ocfs2_inode_lock_atime(struct inode *inode,
97 struct vfsmount *vfsmnt, 113 struct vfsmount *vfsmnt,
98 int *level); 114 int *level);
99int ocfs2_inode_lock_full(struct inode *inode, 115int ocfs2_inode_lock_full_nested(struct inode *inode,
100 struct buffer_head **ret_bh, 116 struct buffer_head **ret_bh,
101 int ex, 117 int ex,
102 int arg_flags); 118 int arg_flags,
119 int subclass);
103int ocfs2_inode_lock_with_page(struct inode *inode, 120int ocfs2_inode_lock_with_page(struct inode *inode,
104 struct buffer_head **ret_bh, 121 struct buffer_head **ret_bh,
105 int ex, 122 int ex,
106 struct page *page); 123 struct page *page);
124/* Variants without special locking class or flags */
125#define ocfs2_inode_lock_full(i, r, e, f)\
126 ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
127#define ocfs2_inode_lock_nested(i, b, e, s)\
128 ocfs2_inode_lock_full_nested(i, b, e, 0, s)
107/* 99% of the time we don't want to supply any additional flags -- 129/* 99% of the time we don't want to supply any additional flags --
108 * those are for very specific cases only. */ 130 * those are for very specific cases only. */
109#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0) 131#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
110void ocfs2_inode_unlock(struct inode *inode, 132void ocfs2_inode_unlock(struct inode *inode,
111 int ex); 133 int ex);
112int ocfs2_super_lock(struct ocfs2_super *osb, 134int ocfs2_super_lock(struct ocfs2_super *osb,
113 int ex); 135 int ex);
114void ocfs2_super_unlock(struct ocfs2_super *osb, 136void ocfs2_super_unlock(struct ocfs2_super *osb,
115 int ex); 137 int ex);
138int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
139void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
140
116int ocfs2_rename_lock(struct ocfs2_super *osb); 141int ocfs2_rename_lock(struct ocfs2_super *osb);
117void ocfs2_rename_unlock(struct ocfs2_super *osb); 142void ocfs2_rename_unlock(struct ocfs2_super *osb);
118int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); 143int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
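
struct ocfs2_orphan_scan_lvb above packs a version byte, three reserved bytes, and a big-endian sequence number into 8 bytes of the lock value block. A user-space sketch of that layout, with __be32 modeled via htonl()/ntohl(); this is an illustration, not the kernel definition:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define DEMO_ORPHAN_LVB_VERSION 1

struct demo_orphan_scan_lvb {
        uint8_t  lvb_version;
        uint8_t  lvb_reserved[3];
        uint32_t lvb_os_seqno;          /* stored big-endian on disk/wire */
};

int main(void)
{
        struct demo_orphan_scan_lvb lvb;

        assert(sizeof(lvb) == 8);       /* fits easily inside a DLM LVB */
        lvb.lvb_version = DEMO_ORPHAN_LVB_VERSION;
        lvb.lvb_os_seqno = htonl(42);   /* cpu_to_be32() equivalent */
        printf("version %u, seqno %u\n", lvb.lvb_version,
               (unsigned)ntohl(lvb.lvb_os_seqno));
        return 0;
}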
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c2a87c885b73..aa501d3f93f1 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,6 +187,9 @@ static int ocfs2_sync_file(struct file *file,
187 if (err) 187 if (err)
188 goto bail; 188 goto bail;
189 189
190 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
191 goto bail;
192
190 journal = osb->journal->j_journal; 193 journal = osb->journal->j_journal;
191 err = jbd2_journal_force_commit(journal); 194 err = jbd2_journal_force_commit(journal);
192 195
@@ -894,9 +897,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
894 struct ocfs2_super *osb = OCFS2_SB(sb); 897 struct ocfs2_super *osb = OCFS2_SB(sb);
895 struct buffer_head *bh = NULL; 898 struct buffer_head *bh = NULL;
896 handle_t *handle = NULL; 899 handle_t *handle = NULL;
897 int locked[MAXQUOTAS] = {0, 0}; 900 int qtype;
898 int credits, qtype; 901 struct dquot *transfer_from[MAXQUOTAS] = { };
899 struct ocfs2_mem_dqinfo *oinfo; 902 struct dquot *transfer_to[MAXQUOTAS] = { };
900 903
901 mlog_entry("(0x%p, '%.*s')\n", dentry, 904 mlog_entry("(0x%p, '%.*s')\n", dentry,
902 dentry->d_name.len, dentry->d_name.name); 905 dentry->d_name.len, dentry->d_name.name);
@@ -969,30 +972,37 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
969 972
970 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 973 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
971 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 974 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
972 credits = OCFS2_INODE_UPDATE_CREDITS; 975 /*
 976 * Gather pointers to quota structures so that allocation and
 977 * freeing of quota structures happen here and not inside
 978 * vfs_dq_transfer(), where we have lock ordering problems
979 */
973 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 980 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
974 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 981 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
975 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 982 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
976 oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; 983 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
977 status = ocfs2_lock_global_qf(oinfo, 1); 984 USRQUOTA);
978 if (status < 0) 985 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
986 USRQUOTA);
987 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
988 status = -ESRCH;
979 goto bail_unlock; 989 goto bail_unlock;
980 credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + 990 }
981 ocfs2_calc_qdel_credits(sb, USRQUOTA);
982 locked[USRQUOTA] = 1;
983 } 991 }
984 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 992 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
985 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 993 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
986 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 994 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
987 oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; 995 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
988 status = ocfs2_lock_global_qf(oinfo, 1); 996 GRPQUOTA);
989 if (status < 0) 997 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
998 GRPQUOTA);
999 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1000 status = -ESRCH;
990 goto bail_unlock; 1001 goto bail_unlock;
991 credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + 1002 }
992 ocfs2_calc_qdel_credits(sb, GRPQUOTA);
993 locked[GRPQUOTA] = 1;
994 } 1003 }
 995 handle = ocfs2_start_trans(osb, credits);
1004 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1005 2 * ocfs2_quota_trans_credits(sb));
996 if (IS_ERR(handle)) { 1006 if (IS_ERR(handle)) {
997 status = PTR_ERR(handle); 1007 status = PTR_ERR(handle);
998 mlog_errno(status); 1008 mlog_errno(status);
@@ -1030,12 +1040,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1030bail_commit: 1040bail_commit:
1031 ocfs2_commit_trans(osb, handle); 1041 ocfs2_commit_trans(osb, handle);
1032bail_unlock: 1042bail_unlock:
1033 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1034 if (!locked[qtype])
1035 continue;
1036 oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
1037 ocfs2_unlock_global_qf(oinfo, 1);
1038 }
1039 ocfs2_inode_unlock(inode, 1); 1043 ocfs2_inode_unlock(inode, 1);
1040bail_unlock_rw: 1044bail_unlock_rw:
1041 if (size_change) 1045 if (size_change)
@@ -1043,6 +1047,12 @@ bail_unlock_rw:
1043bail: 1047bail:
1044 brelse(bh); 1048 brelse(bh);
1045 1049
1050 /* Release quota pointers in case we acquired them */
1051 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1052 dqput(transfer_to[qtype]);
1053 dqput(transfer_from[qtype]);
1054 }
1055
1046 if (!status && attr->ia_valid & ATTR_MODE) { 1056 if (!status && attr->ia_valid & ATTR_MODE) {
1047 status = ocfs2_acl_chmod(inode); 1057 status = ocfs2_acl_chmod(inode);
1048 if (status < 0) 1058 if (status < 0)
@@ -1841,6 +1851,7 @@ relock:
1841 if (ret) 1851 if (ret)
1842 goto out_dio; 1852 goto out_dio;
1843 1853
1854 count = ocount;
1844 ret = generic_write_checks(file, ppos, &count, 1855 ret = generic_write_checks(file, ppos, &count,
1845 S_ISBLK(inode->i_mode)); 1856 S_ISBLK(inode->i_mode));
1846 if (ret) 1857 if (ret)
@@ -1908,8 +1919,10 @@ out_sems:
1908 1919
1909 mutex_unlock(&inode->i_mutex); 1920 mutex_unlock(&inode->i_mutex);
1910 1921
1922 if (written)
1923 ret = written;
1911 mlog_exit(ret); 1924 mlog_exit(ret);
1912 return written ? written : ret; 1925 return ret;
1913} 1926}
1914 1927
1915static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 1928static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
@@ -2016,7 +2029,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2016 size_t len, 2029 size_t len,
2017 unsigned int flags) 2030 unsigned int flags)
2018{ 2031{
2019 int ret = 0; 2032 int ret = 0, lock_level = 0;
2020 struct inode *inode = in->f_path.dentry->d_inode; 2033 struct inode *inode = in->f_path.dentry->d_inode;
2021 2034
2022 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2035 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
@@ -2027,12 +2040,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2027 /* 2040 /*
2028 * See the comment in ocfs2_file_aio_read() 2041 * See the comment in ocfs2_file_aio_read()
2029 */ 2042 */
2030 ret = ocfs2_inode_lock(inode, NULL, 0); 2043 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2031 if (ret < 0) { 2044 if (ret < 0) {
2032 mlog_errno(ret); 2045 mlog_errno(ret);
2033 goto bail; 2046 goto bail;
2034 } 2047 }
2035 ocfs2_inode_unlock(inode, 0); 2048 ocfs2_inode_unlock(inode, lock_level);
2036 2049
2037 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2050 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2038 2051
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 10e1fa87396a..4dc8890ba316 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -215,6 +215,8 @@ bail:
215static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) 215static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
216{ 216{
217 struct ocfs2_find_inode_args *args = opaque; 217 struct ocfs2_find_inode_args *args = opaque;
218 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
219 ocfs2_file_ip_alloc_sem_key;
218 220
219 mlog_entry("inode = %p, opaque = %p\n", inode, opaque); 221 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
220 222
@@ -223,6 +225,15 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
223 if (args->fi_sysfile_type != 0) 225 if (args->fi_sysfile_type != 0)
224 lockdep_set_class(&inode->i_mutex, 226 lockdep_set_class(&inode->i_mutex,
225 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); 227 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
228 if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
229 args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
230 args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
231 args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE)
232 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
233 &ocfs2_quota_ip_alloc_sem_key);
234 else
235 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
236 &ocfs2_file_ip_alloc_sem_key);
226 237
227 mlog_exit(0); 238 mlog_exit(0);
228 return 0; 239 return 0;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 9fcd36dcc9a0..467b413bec21 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,7 +7,6 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/smp_lock.h>
11 10
12#define MLOG_MASK_PREFIX ML_INODE 11#define MLOG_MASK_PREFIX ML_INODE
13#include <cluster/masklog.h> 12#include <cluster/masklog.h>
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1e37fd..c48b93ac6b65 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31#include <linux/time.h>
32#include <linux/random.h>
31 33
32#define MLOG_MASK_PREFIX ML_JOURNAL 34#define MLOG_MASK_PREFIX ML_JOURNAL
33#include <cluster/masklog.h> 35#include <cluster/masklog.h>
@@ -52,6 +54,8 @@
52 54
53DEFINE_SPINLOCK(trans_inc_lock); 55DEFINE_SPINLOCK(trans_inc_lock);
54 56
57#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
58
55static int ocfs2_force_read_journal(struct inode *inode); 59static int ocfs2_force_read_journal(struct inode *inode);
56static int ocfs2_recover_node(struct ocfs2_super *osb, 60static int ocfs2_recover_node(struct ocfs2_super *osb,
57 int node_num, int slot_num); 61 int node_num, int slot_num);
@@ -1841,6 +1845,134 @@ bail:
1841 return status; 1845 return status;
1842} 1846}
1843 1847
1848/*
1849 * The scan timer should fire every ORPHAN_SCAN_SCHEDULE_TIMEOUT ms. Add some
1850 * randomness to the timeout to minimize multiple nodes firing the timer at the
1851 * same time.
1852 */
1853static inline unsigned long ocfs2_orphan_scan_timeout(void)
1854{
1855 unsigned long time;
1856
1857 get_random_bytes(&time, sizeof(time));
1858 time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
1859 return msecs_to_jiffies(time);
1860}
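
/*
 * The same jitter arithmetic in plain user-space C: rand() stands in
 * for get_random_bytes() and the result stays in milliseconds rather
 * than being converted to jiffies. All names below are invented.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DEMO_SCHEDULE_TIMEOUT 300000    /* ms, as in the hunk above */

static unsigned long demo_orphan_scan_timeout(void)
{
        unsigned long t = (unsigned long)rand();

        return DEMO_SCHEDULE_TIMEOUT + (t % 5000);  /* up to ~5 s jitter */
}

int main(void)
{
        srand((unsigned)time(NULL));
        for (int i = 0; i < 3; i++)
                printf("next scan in %lu ms\n", demo_orphan_scan_timeout());
        return 0;
}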
1861
1862/*
1863 * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
1864 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
1865 * is done to catch any orphans that are left over in orphan directories.
1866 *
1867 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
1868 * milliseconds. It gets an EX lock on os_lockres and checks the sequence
1869 * number stored in the LVB. If the sequence number has changed, some other
1870 * node has done the scan. This node skips the scan and tracks the
1871 * sequence number. If the sequence number didn't change, it means a scan
1872 * hasn't happened. The node queues a scan and increments the
1873 * sequence number in the LVB.
1874 */
1875void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1876{
1877 struct ocfs2_orphan_scan *os;
1878 int status, i;
1879 u32 seqno = 0;
1880
1881 os = &osb->osb_orphan_scan;
1882
1883 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1884 goto out;
1885
1886 status = ocfs2_orphan_scan_lock(osb, &seqno);
1887 if (status < 0) {
1888 if (status != -EAGAIN)
1889 mlog_errno(status);
1890 goto out;
1891 }
1892
1893 /* Do not queue the tasks if the volume is being unmounted */
1894 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1895 goto unlock;
1896
1897 if (os->os_seqno != seqno) {
1898 os->os_seqno = seqno;
1899 goto unlock;
1900 }
1901
1902 for (i = 0; i < osb->max_slots; i++)
1903 ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
1904 NULL);
1905 /*
1906 * We queued a recovery on the orphan slots; increment the sequence
1907 * number and update the LVB so other nodes will skip the scan for a while
1908 */
1909 seqno++;
1910 os->os_count++;
1911 os->os_scantime = CURRENT_TIME;
1912unlock:
1913 ocfs2_orphan_scan_unlock(osb, seqno);
1914out:
1915 return;
1916}
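
/*
 * The skip decision above, modeled with two simulated nodes: a node
 * scans only when the LVB sequence number still matches the one it
 * recorded last time; otherwise another node scanned first and this
 * node just records the new value. Self-contained sketch, invented names.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_node {
        const char *name;
        uint32_t    seen_seqno;         /* like os->os_seqno */
};

/* returns the (possibly bumped) cluster-wide seqno */
static uint32_t demo_try_scan(struct demo_node *n, uint32_t cluster_seqno)
{
        if (n->seen_seqno != cluster_seqno) {
                n->seen_seqno = cluster_seqno;  /* someone else scanned */
                printf("%s: skip, tracking seqno %u\n", n->name, cluster_seqno);
                return cluster_seqno;
        }
        printf("%s: scanning, bumping seqno to %u\n", n->name,
               cluster_seqno + 1);
        return cluster_seqno + 1;               /* written back via LVB */
}

int main(void)
{
        struct demo_node a = { "node-a", 0 }, b = { "node-b", 0 };
        uint32_t seqno = 0;

        seqno = demo_try_scan(&a, seqno);       /* node-a scans, seqno 1 */
        seqno = demo_try_scan(&b, seqno);       /* node-b skips */
        return 0;
}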
1917
1918/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds */
1919void ocfs2_orphan_scan_work(struct work_struct *work)
1920{
1921 struct ocfs2_orphan_scan *os;
1922 struct ocfs2_super *osb;
1923
1924 os = container_of(work, struct ocfs2_orphan_scan,
1925 os_orphan_scan_work.work);
1926 osb = os->os_osb;
1927
1928 mutex_lock(&os->os_lock);
1929 ocfs2_queue_orphan_scan(osb);
1930 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1931 schedule_delayed_work(&os->os_orphan_scan_work,
1932 ocfs2_orphan_scan_timeout());
1933 mutex_unlock(&os->os_lock);
1934}
1935
1936void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
1937{
1938 struct ocfs2_orphan_scan *os;
1939
1940 os = &osb->osb_orphan_scan;
1941 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) {
1942 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1943 mutex_lock(&os->os_lock);
1944 cancel_delayed_work(&os->os_orphan_scan_work);
1945 mutex_unlock(&os->os_lock);
1946 }
1947}
1948
1949void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
1950{
1951 struct ocfs2_orphan_scan *os;
1952
1953 os = &osb->osb_orphan_scan;
1954 os->os_osb = osb;
1955 os->os_count = 0;
1956 os->os_seqno = 0;
1957 mutex_init(&os->os_lock);
1958 INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
1959}
1960
1961void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1962{
1963 struct ocfs2_orphan_scan *os;
1964
1965 os = &osb->osb_orphan_scan;
1966 os->os_scantime = CURRENT_TIME;
1967 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
1968 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1969 else {
1970 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
1971 schedule_delayed_work(&os->os_orphan_scan_work,
1972 ocfs2_orphan_scan_timeout());
1973 }
1974}
1975
1844struct ocfs2_orphan_filldir_priv { 1976struct ocfs2_orphan_filldir_priv {
1845 struct inode *head; 1977 struct inode *head;
1846 struct ocfs2_super *osb; 1978 struct ocfs2_super *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index eb7b76331eb7..2c3222aec622 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,11 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
144} 144}
145 145
146/* Exported only for the journal struct init code in super.c. Do not call. */ 146/* Exported only for the journal struct init code in super.c. Do not call. */
147void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
148void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
149void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
150void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
151
147void ocfs2_complete_recovery(struct work_struct *work); 152void ocfs2_complete_recovery(struct work_struct *work);
148void ocfs2_wait_for_recovery(struct ocfs2_super *osb); 153void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
149 154
@@ -325,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle,
325/* extended attribute block update */ 330/* extended attribute block update */
326#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 331#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
327 332
333/* Update of a single quota block */
334#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
335
328/* global quotafile inode update, data block */ 336/* global quotafile inode update, data block */
329#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 337#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
338 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
330 339
340#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
331/* 341/*
332 * The two writes below can accidentally see global info dirty due 342 * The two writes below can accidentally see global info dirty due
333 * to set_info() quotactl so make them prepared for the writes. 343 * to set_info() quotactl so make them prepared for the writes.
334 */ 344 */
335/* quota data block, global info */ 345/* quota data block, global info */
336/* Write to local quota file */ 346/* Write to local quota file */
337#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) 347#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
348 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
338 349
339/* global quota data block, local quota data block, global quota inode, 350/* global quota data block, local quota data block, global quota inode,
340 * global quota info */ 351 * global quota info */
341#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) 352#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
353 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
342 354
343static inline int ocfs2_quota_trans_credits(struct super_block *sb) 355static inline int ocfs2_quota_trans_credits(struct super_block *sb)
344{ 356{
@@ -351,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
351 return credits; 363 return credits;
352} 364}
353 365
354/* Number of credits needed for removing quota structure from file */
355int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
356/* Number of credits needed for initialization of new quota structure */
357int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
358
359/* group extend. inode update and last group update. */ 366/* group extend. inode update and last group update. */
360#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 367#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
361 368
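
The credit macros above now compose from OCFS2_QUOTA_BLOCK_UPDATE_CREDITS (1, per this hunk). A quick check of the arithmetic; OCFS2_INODE_UPDATE_CREDITS is not visible in this patch and is assumed to be 2 below purely for illustration:

#include <stdio.h>

#define INODE_UPDATE_CREDITS        2   /* assumed, not from this patch */
#define QUOTA_BLOCK_UPDATE_CREDITS  1   /* as defined in the hunk above */

#define QINFO_WRITE_CREDITS  (INODE_UPDATE_CREDITS + QUOTA_BLOCK_UPDATE_CREDITS)
#define QWRITE_CREDITS       (QINFO_WRITE_CREDITS + QUOTA_BLOCK_UPDATE_CREDITS)
#define QSYNC_CREDITS        (QINFO_WRITE_CREDITS + \
                              2 * QUOTA_BLOCK_UPDATE_CREDITS)

int main(void)
{
        printf("qinfo write: %d\n", QINFO_WRITE_CREDITS);   /* 3 */
        printf("qwrite:      %d\n", QWRITE_CREDITS);        /* 4 */
        printf("qsync:       %d\n", QSYNC_CREDITS);         /* 5 */
        return 0;
}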
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 33464c6b60a2..8601f934010b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -118,7 +118,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
118 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 118 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
119 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 119 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
120 120
121 status = ocfs2_inode_lock(dir, NULL, 0); 121 status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
122 if (status < 0) { 122 if (status < 0) {
123 if (status != -ENOENT) 123 if (status != -ENOENT)
124 mlog_errno(status); 124 mlog_errno(status);
@@ -636,7 +636,7 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 636 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 637 return -EPERM;
638 638
639 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1); 639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 640 if (err < 0) {
641 if (err != -ENOENT) 641 if (err != -ENOENT)
642 mlog_errno(err); 642 mlog_errno(err);
@@ -800,7 +800,8 @@ static int ocfs2_unlink(struct inode *dir,
800 return -EPERM; 800 return -EPERM;
801 } 801 }
802 802
803 status = ocfs2_inode_lock(dir, &parent_node_bh, 1); 803 status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
804 OI_LS_PARENT);
804 if (status < 0) { 805 if (status < 0) {
805 if (status != -ENOENT) 806 if (status != -ENOENT)
806 mlog_errno(status); 807 mlog_errno(status);
@@ -978,7 +979,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
978 inode1 = tmpinode; 979 inode1 = tmpinode;
979 } 980 }
980 /* lock id2 */ 981 /* lock id2 */
981 status = ocfs2_inode_lock(inode2, bh2, 1); 982 status = ocfs2_inode_lock_nested(inode2, bh2, 1,
983 OI_LS_RENAME1);
982 if (status < 0) { 984 if (status < 0) {
983 if (status != -ENOENT) 985 if (status != -ENOENT)
984 mlog_errno(status); 986 mlog_errno(status);
@@ -987,7 +989,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
987 } 989 }
988 990
989 /* lock id1 */ 991 /* lock id1 */
990 status = ocfs2_inode_lock(inode1, bh1, 1); 992 status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
991 if (status < 0) { 993 if (status < 0) {
992 /* 994 /*
993 * An error return must mean that no cluster locks 995 * An error return must mean that no cluster locks
@@ -1103,7 +1105,8 @@ static int ocfs2_rename(struct inode *old_dir,
1103 * won't have to concurrently downconvert the inode and the 1105 * won't have to concurrently downconvert the inode and the
1104 * dentry locks. 1106 * dentry locks.
1105 */ 1107 */
1106 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1); 1108 status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1,
1109 OI_LS_PARENT);
1107 if (status < 0) { 1110 if (status < 0) {
1108 if (status != -ENOENT) 1111 if (status != -ENOENT)
1109 mlog_errno(status); 1112 mlog_errno(status);
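
ocfs2_double_lock() above takes the two inode locks in one fixed order and tags the nested acquisitions with distinct subclasses (OI_LS_RENAME1, OI_LS_RENAME2) so the lock validator can tell them apart. A sketch of the ordering rule that prevents AB-BA deadlock; the comparison direction and all names are illustrative, not taken from this patch:

#include <stdio.h>

struct demo_inode { unsigned long long blkno; };

/* lock both inodes in one global order regardless of argument order */
static void demo_double_lock(struct demo_inode *a, struct demo_inode *b)
{
        struct demo_inode *first = a, *second = b;

        if (first->blkno < second->blkno) {     /* direction illustrative */
                struct demo_inode *t = first;
                first = second;
                second = t;
        }
        printf("lock %llu (RENAME1), then %llu (RENAME2)\n",
               first->blkno, second->blkno);
}

int main(void)
{
        struct demo_inode i1 = { 100 }, i2 = { 200 };

        demo_double_lock(&i1, &i2);     /* both calls lock 200 first */
        demo_double_lock(&i2, &i1);
        return 0;
}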
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281950db..39e1d5a39505 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,6 +34,7 @@
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/lockdep.h>
37#ifndef CONFIG_OCFS2_COMPAT_JBD 38#ifndef CONFIG_OCFS2_COMPAT_JBD
38# include <linux/jbd2.h> 39# include <linux/jbd2.h>
39#else 40#else
@@ -47,6 +48,9 @@
47#include "ocfs2_fs.h" 48#include "ocfs2_fs.h"
48#include "ocfs2_lockid.h" 49#include "ocfs2_lockid.h"
49 50
51/* For struct ocfs2_blockcheck_stats */
52#include "blockcheck.h"
53
50/* Most user visible OCFS2 inodes will have very few pieces of 54/* Most user visible OCFS2 inodes will have very few pieces of
51 * metadata, but larger files (including bitmaps, etc) must be taken 55 * metadata, but larger files (including bitmaps, etc) must be taken
52 * into account when designing an access scheme. We allow a small 56 * into account when designing an access scheme. We allow a small
@@ -149,6 +153,25 @@ struct ocfs2_lock_res {
149 unsigned int l_lock_max_exmode; /* Max wait for EX */ 153 unsigned int l_lock_max_exmode; /* Max wait for EX */
150 unsigned int l_lock_refresh; /* Disk refreshes */ 154 unsigned int l_lock_refresh; /* Disk refreshes */
151#endif 155#endif
156#ifdef CONFIG_DEBUG_LOCK_ALLOC
157 struct lockdep_map l_lockdep_map;
158#endif
159};
160
161enum ocfs2_orphan_scan_state {
162 ORPHAN_SCAN_ACTIVE,
163 ORPHAN_SCAN_INACTIVE
164};
165
166struct ocfs2_orphan_scan {
167 struct mutex os_lock;
168 struct ocfs2_super *os_osb;
169 struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
170 struct delayed_work os_orphan_scan_work;
171 struct timespec os_scantime; /* time this node ran the scan */
172 u32 os_count; /* tracks node specific scans */
173 u32 os_seqno; /* tracks cluster wide scans */
174 atomic_t os_state; /* ACTIVE or INACTIVE */
152}; 175};
153 176
154struct ocfs2_dlm_debug { 177struct ocfs2_dlm_debug {
@@ -201,10 +224,12 @@ enum ocfs2_mount_options
201 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ 224 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
202}; 225};
203 226
204#define OCFS2_OSB_SOFT_RO 0x0001 227#define OCFS2_OSB_SOFT_RO 0x0001
205#define OCFS2_OSB_HARD_RO 0x0002 228#define OCFS2_OSB_HARD_RO 0x0002
206#define OCFS2_OSB_ERROR_FS 0x0004 229#define OCFS2_OSB_ERROR_FS 0x0004
207#define OCFS2_DEFAULT_ATIME_QUANTUM 60
230#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008
231
232#define OCFS2_DEFAULT_ATIME_QUANTUM 60
208 233
209struct ocfs2_journal; 234struct ocfs2_journal;
210struct ocfs2_slot_info; 235struct ocfs2_slot_info;
@@ -295,6 +320,7 @@ struct ocfs2_super
295 struct ocfs2_dinode *local_alloc_copy; 320 struct ocfs2_dinode *local_alloc_copy;
296 struct ocfs2_quota_recovery *quota_rec; 321 struct ocfs2_quota_recovery *quota_rec;
297 322
323 struct ocfs2_blockcheck_stats osb_ecc_stats;
298 struct ocfs2_alloc_stats alloc_stats; 324 struct ocfs2_alloc_stats alloc_stats;
299 char dev_str[20]; /* "major,minor" of the device */ 325 char dev_str[20]; /* "major,minor" of the device */
300 326
@@ -341,6 +367,8 @@ struct ocfs2_super
341 unsigned int *osb_orphan_wipes; 367 unsigned int *osb_orphan_wipes;
342 wait_queue_head_t osb_wipe_event; 368 wait_queue_head_t osb_wipe_event;
343 369
370 struct ocfs2_orphan_scan osb_orphan_scan;
371
344 /* used to protect metaecc calculation check of xattr. */ 372 /* used to protect metaecc calculation check of xattr. */
345 spinlock_t osb_xattr_lock; 373 spinlock_t osb_xattr_lock;
346 374
@@ -464,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
464 spin_unlock(&osb->osb_lock); 492 spin_unlock(&osb->osb_lock);
465} 493}
466 494
495
496static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
497 unsigned long flag)
498{
499 unsigned long ret;
500
501 spin_lock(&osb->osb_lock);
502 ret = osb->osb_flags & flag;
503 spin_unlock(&osb->osb_lock);
504 return ret;
505}
506
467static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, 507static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
468 int hard) 508 int hard)
469{ 509{
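
ocfs2_test_osb_flag() added above samples osb_flags under osb_lock so the test is atomic with respect to concurrent set and clear. The same shape in portable pthread C, with invented demo_* names:

#include <pthread.h>
#include <stdio.h>

#define DEMO_DROP_DENTRY_LOCK_IMMED 0x0008

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long demo_flags;

static unsigned long demo_test_flag(unsigned long flag)
{
        unsigned long ret;

        pthread_mutex_lock(&demo_lock);         /* like spin_lock(osb_lock) */
        ret = demo_flags & flag;
        pthread_mutex_unlock(&demo_lock);
        return ret;
}

int main(void)
{
        pthread_mutex_lock(&demo_lock);
        demo_flags |= DEMO_DROP_DENTRY_LOCK_IMMED;
        pthread_mutex_unlock(&demo_lock);
        printf("flag set: %s\n", demo_test_flag(DEMO_DROP_DENTRY_LOCK_IMMED)
               ? "yes" : "no");
        return 0;
}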
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87481bf..c212cf5a2bdf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC, 50 OCFS2_LOCK_TYPE_NFS_SYNC,
51 OCFS2_LOCK_TYPE_ORPHAN_SCAN,
51 OCFS2_NUM_LOCK_TYPES 52 OCFS2_NUM_LOCK_TYPES
52}; 53};
53 54
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
85 case OCFS2_LOCK_TYPE_NFS_SYNC: 86 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y'; 87 c = 'Y';
87 break; 88 break;
89 case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
90 c = 'P';
91 break;
88 default: 92 default:
89 c = '\0'; 93 c = '\0';
90 } 94 }
@@ -104,6 +108,8 @@ static char *ocfs2_lock_type_strings[] = {
104 [OCFS2_LOCK_TYPE_OPEN] = "Open", 108 [OCFS2_LOCK_TYPE_OPEN] = "Open",
105 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 109 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
106 [OCFS2_LOCK_TYPE_QINFO] = "Quota", 110 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
111 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
112 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
107}; 113};
108 114
109static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 115static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e08706..3fb96fcd4c81 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
50 unsigned int dqi_chunks; /* Number of chunks in local quota file */ 50 unsigned int dqi_chunks; /* Number of chunks in local quota file */
51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ 51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
52 unsigned int dqi_syncms; /* How often should we sync with other nodes */ 52 unsigned int dqi_syncms; /* How often should we sync with other nodes */
53 unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
54 struct list_head dqi_chunk; /* List of chunks */ 53 struct list_head dqi_chunk; /* List of chunks */
55 struct inode *dqi_gqinode; /* Global quota file inode */ 54 struct inode *dqi_gqinode; /* Global quota file inode */
56 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 55 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 1ed0f7c86869..44f2a5e1d042 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -23,6 +23,7 @@
23#include "sysfile.h" 23#include "sysfile.h"
24#include "dlmglue.h" 24#include "dlmglue.h"
25#include "uptodate.h" 25#include "uptodate.h"
26#include "super.h"
26#include "quota.h" 27#include "quota.h"
27 28
28static struct workqueue_struct *ocfs2_quota_wq = NULL; 29static struct workqueue_struct *ocfs2_quota_wq = NULL;
@@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
69 d->dqb_curspace = cpu_to_le64(m->dqb_curspace); 70 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
70 d->dqb_btime = cpu_to_le64(m->dqb_btime); 71 d->dqb_btime = cpu_to_le64(m->dqb_btime);
71 d->dqb_itime = cpu_to_le64(m->dqb_itime); 72 d->dqb_itime = cpu_to_le64(m->dqb_itime);
73 d->dqb_pad1 = d->dqb_pad2 = 0;
72} 74}
73 75
74static int ocfs2_global_is_id(void *dp, struct dquot *dquot) 76static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
@@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
113 int rc = 0; 115 int rc = 0;
114 struct buffer_head *tmp = *bh; 116 struct buffer_head *tmp = *bh;
115 117
118 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
119 ocfs2_error(inode->i_sb,
120 "Quota file %llu is probably corrupted! Requested "
121 "to read block %Lu but file has size only %Lu\n",
122 (unsigned long long)OCFS2_I(inode)->ip_blkno,
123 (unsigned long long)v_block,
124 (unsigned long long)i_size_read(inode));
125 return -EIO;
126 }
116 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, 127 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
117 ocfs2_validate_quota_block); 128 ocfs2_validate_quota_block);
118 if (rc) 129 if (rc)
@@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
211 222
212 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); 223 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
213 if (gqinode->i_size < off + len) { 224 if (gqinode->i_size < off + len) {
214 down_write(&OCFS2_I(gqinode)->ip_alloc_sem); 225 loff_t rounded_end =
215 err = ocfs2_extend_no_holes(gqinode, off + len, off); 226 ocfs2_align_bytes_to_blocks(sb, off + len);
216 up_write(&OCFS2_I(gqinode)->ip_alloc_sem); 227
217 if (err < 0) 228 /* Space is already allocated in ocfs2_global_read_dquot() */
218 goto out;
219 err = ocfs2_simple_size_update(gqinode, 229 err = ocfs2_simple_size_update(gqinode,
220 oinfo->dqi_gqi_bh, 230 oinfo->dqi_gqi_bh,
221 off + len); 231 rounded_end);
222 if (err < 0) 232 if (err < 0)
223 goto out; 233 goto out;
224 new = 1; 234 new = 1;
@@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 } 244 }
235 if (err) { 245 if (err) {
236 mlog_errno(err); 246 mlog_errno(err);
237 return err; 247 goto out;
238 } 248 }
239 lock_buffer(bh); 249 lock_buffer(bh);
240 if (new) 250 if (new)
@@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
342 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 352 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
343 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 353 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
344 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); 354 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
345 oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
346 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); 355 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
347 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); 356 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
348 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); 357 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
352 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 361 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
353 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 362 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
354 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 363 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
355 oinfo->dqi_syncjiff); 364 msecs_to_jiffies(oinfo->dqi_syncms));
356 365
357out_err: 366out_err:
358 mlog_exit(status); 367 mlog_exit(status);
@@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
402 return err; 411 return err;
403} 412}
404 413
414static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
415{
416 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
417
418 /*
419 * We may need to allocate tree blocks and a leaf block but not the
420 * root block
421 */
422 return oinfo->dqi_gi.dqi_qtree_depth;
423}
424
425static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
426{
427 /* We modify all the allocated blocks, tree root, and info block */
428 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
429 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
430}
431
405/* Read in information from global quota file and acquire a reference to it. 432/* Read in information from global quota file and acquire a reference to it.
406 * dquot_acquire() has already started the transaction and locked quota file */ 433 * dquot_acquire() has already started the transaction and locked quota file */
407int ocfs2_global_read_dquot(struct dquot *dquot) 434int ocfs2_global_read_dquot(struct dquot *dquot)
408{ 435{
409 int err, err2, ex = 0; 436 int err, err2, ex = 0;
410 struct ocfs2_mem_dqinfo *info = 437 struct super_block *sb = dquot->dq_sb;
411 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 438 int type = dquot->dq_type;
439 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
440 struct ocfs2_super *osb = OCFS2_SB(sb);
441 struct inode *gqinode = info->dqi_gqinode;
442 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
443 handle_t *handle = NULL;
412 444
413 err = ocfs2_qinfo_lock(info, 0); 445 err = ocfs2_qinfo_lock(info, 0);
414 if (err < 0) 446 if (err < 0)
@@ -419,13 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
419 OCFS2_DQUOT(dquot)->dq_use_count++; 451 OCFS2_DQUOT(dquot)->dq_use_count++;
420 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; 452 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
421 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; 453 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
454 ocfs2_qinfo_unlock(info, 0);
455
422 if (!dquot->dq_off) { /* No real quota entry? */ 456 if (!dquot->dq_off) { /* No real quota entry? */
423 /* Upgrade to exclusive lock for allocation */
424 err = ocfs2_qinfo_lock(info, 1);
425 if (err < 0)
426 goto out_qlock;
427 ex = 1; 457 ex = 1;
458 /*
 459 * Add blocks to the quota file before we start a transaction, since
 460 * taking allocator locks ranks above a transaction start in the lock order
461 */
462 WARN_ON(journal_current_handle());
463 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
464 err = ocfs2_extend_no_holes(gqinode,
465 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
466 gqinode->i_size);
467 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
468 if (err < 0)
469 goto out;
428 } 470 }
471
472 handle = ocfs2_start_trans(osb,
473 ocfs2_calc_global_qinit_credits(sb, type));
474 if (IS_ERR(handle)) {
475 err = PTR_ERR(handle);
476 goto out;
477 }
478 err = ocfs2_qinfo_lock(info, ex);
479 if (err < 0)
480 goto out_trans;
429 err = qtree_write_dquot(&info->dqi_gi, dquot); 481 err = qtree_write_dquot(&info->dqi_gi, dquot);
430 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { 482 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
431 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); 483 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -435,7 +487,11 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
435out_qlock: 487out_qlock:
436 if (ex) 488 if (ex)
437 ocfs2_qinfo_unlock(info, 1); 489 ocfs2_qinfo_unlock(info, 1);
438 ocfs2_qinfo_unlock(info, 0); 490 else
491 ocfs2_qinfo_unlock(info, 0);
492out_trans:
493 if (handle)
494 ocfs2_commit_trans(osb, handle);
439out: 495out:
440 if (err < 0) 496 if (err < 0)
441 mlog_errno(err); 497 mlog_errno(err);
@@ -605,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work)
605 661
606 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 662 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
607 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 663 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
608 oinfo->dqi_syncjiff); 664 msecs_to_jiffies(oinfo->dqi_syncms));
609} 665}
610 666
611/* 667/*
@@ -633,20 +689,18 @@ out:
633 return status; 689 return status;
634} 690}
635 691
636int ocfs2_calc_qdel_credits(struct super_block *sb, int type) 692static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
637{ 693{
638 struct ocfs2_mem_dqinfo *oinfo; 694 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
639 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 695 /*
640 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; 696 * We modify tree, leaf block, global info, local chunk header,
641 697 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
642 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) 698 * accounts for inode update
643 return 0; 699 */
644 700 return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
645 oinfo = sb_dqinfo(sb, type)->dqi_priv; 701 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
646 /* We modify tree, leaf block, global info, local chunk header, 702 OCFS2_QINFO_WRITE_CREDITS +
647 * global and local inode */ 703 OCFS2_INODE_UPDATE_CREDITS;
648 return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
649 2 * OCFS2_INODE_UPDATE_CREDITS;
650} 704}
651 705
652static int ocfs2_release_dquot(struct dquot *dquot) 706static int ocfs2_release_dquot(struct dquot *dquot)
@@ -678,33 +732,10 @@ out:
678 return status; 732 return status;
679} 733}
680 734
681int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
682{
683 struct ocfs2_mem_dqinfo *oinfo;
684 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
686 struct ocfs2_dinode *lfe, *gfe;
687
688 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
689 return 0;
690
691 oinfo = sb_dqinfo(sb, type)->dqi_priv;
692 gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
693 lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
694 /* We can extend local file + global file. In local file we
695 * can modify info, chunk header block and dquot block. In
696 * global file we can modify info, tree and leaf block */
697 return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
698 ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
699 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
700}
701
702static int ocfs2_acquire_dquot(struct dquot *dquot) 735static int ocfs2_acquire_dquot(struct dquot *dquot)
703{ 736{
704 handle_t *handle;
705 struct ocfs2_mem_dqinfo *oinfo = 737 struct ocfs2_mem_dqinfo *oinfo =
706 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 738 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
707 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
708 int status = 0; 739 int status = 0;
709 740
710 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 741 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -713,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
713 status = ocfs2_lock_global_qf(oinfo, 1); 744 status = ocfs2_lock_global_qf(oinfo, 1);
714 if (status < 0) 745 if (status < 0)
715 goto out; 746 goto out;
716 handle = ocfs2_start_trans(osb,
717 ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
718 if (IS_ERR(handle)) {
719 status = PTR_ERR(handle);
720 mlog_errno(status);
721 goto out_ilock;
722 }
723 status = dquot_acquire(dquot); 747 status = dquot_acquire(dquot);
724 ocfs2_commit_trans(osb, handle);
725out_ilock:
726 ocfs2_unlock_global_qf(oinfo, 1); 748 ocfs2_unlock_global_qf(oinfo, 1);
727out: 749out:
728 mlog_exit(status); 750 mlog_exit(status);
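
The new bounds check in ocfs2_read_quota_block() treats a read past i_size as corruption and fails with -EIO. The predicate in isolation, assuming a 4 KiB block size for the demo:

#include <stdint.h>
#include <stdio.h>

static int demo_block_in_file(uint64_t i_size, unsigned blocksize_bits,
                              uint64_t v_block)
{
        /* valid only when the block lies below i_size; -EIO otherwise */
        return v_block < (i_size >> blocksize_bits);
}

int main(void)
{
        uint64_t size = 8192;           /* two 4 KiB blocks */

        printf("block 1: %s\n", demo_block_in_file(size, 12, 1) ? "ok" : "EIO");
        printf("block 2: %s\n", demo_block_in_file(size, 12, 2) ? "ok" : "EIO");
        return 0;
}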
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 07deec5e9721..bdb09cb6e1fe 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
20#include "sysfile.h" 20#include "sysfile.h"
21#include "dlmglue.h" 21#include "dlmglue.h"
22#include "quota.h" 22#include "quota.h"
23#include "uptodate.h"
23 24
24/* Number of local quota structures per block */ 25/* Number of local quota structures per block */
25static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 26static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
100 handle_t *handle; 101 handle_t *handle;
101 int status; 102 int status;
102 103
103 handle = ocfs2_start_trans(OCFS2_SB(sb), 1); 104 handle = ocfs2_start_trans(OCFS2_SB(sb),
105 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
104 if (IS_ERR(handle)) { 106 if (IS_ERR(handle)) {
105 status = PTR_ERR(handle); 107 status = PTR_ERR(handle);
106 mlog_errno(status); 108 mlog_errno(status);
@@ -444,10 +446,6 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
444 446
445 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); 447 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
446 448
447 status = ocfs2_lock_global_qf(oinfo, 1);
448 if (status < 0)
449 goto out;
450
451 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { 449 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
452 chunk = rchunk->rc_chunk; 450 chunk = rchunk->rc_chunk;
453 hbh = NULL; 451 hbh = NULL;
@@ -480,12 +478,18 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
480 type); 478 type);
481 goto out_put_bh; 479 goto out_put_bh;
482 } 480 }
481 status = ocfs2_lock_global_qf(oinfo, 1);
482 if (status < 0) {
483 mlog_errno(status);
484 goto out_put_dquot;
485 }
486
483 handle = ocfs2_start_trans(OCFS2_SB(sb), 487 handle = ocfs2_start_trans(OCFS2_SB(sb),
484 OCFS2_QSYNC_CREDITS); 488 OCFS2_QSYNC_CREDITS);
485 if (IS_ERR(handle)) { 489 if (IS_ERR(handle)) {
486 status = PTR_ERR(handle); 490 status = PTR_ERR(handle);
487 mlog_errno(status); 491 mlog_errno(status);
488 goto out_put_dquot; 492 goto out_drop_lock;
489 } 493 }
490 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 494 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
491 spin_lock(&dq_data_lock); 495 spin_lock(&dq_data_lock);
@@ -523,6 +527,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
523out_commit: 527out_commit:
524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 528 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
525 ocfs2_commit_trans(OCFS2_SB(sb), handle); 529 ocfs2_commit_trans(OCFS2_SB(sb), handle);
530out_drop_lock:
531 ocfs2_unlock_global_qf(oinfo, 1);
526out_put_dquot: 532out_put_dquot:
527 dqput(dquot); 533 dqput(dquot);
528out_put_bh: 534out_put_bh:
@@ -537,8 +543,6 @@ out_put_bh:
537 if (status < 0) 543 if (status < 0)
538 break; 544 break;
539 } 545 }
540 ocfs2_unlock_global_qf(oinfo, 1);
541out:
542 if (status < 0) 546 if (status < 0)
543 free_recovery_list(&(rec->r_list[type])); 547 free_recovery_list(&(rec->r_list[type]));
544 mlog_exit(status); 548 mlog_exit(status);
@@ -608,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
608 goto out_bh; 612 goto out_bh;
609 /* Mark quota file as clean if we are recovering quota file of 613 /* Mark quota file as clean if we are recovering quota file of
610 * some other node. */ 614 * some other node. */
611 handle = ocfs2_start_trans(osb, 1); 615 handle = ocfs2_start_trans(osb,
616 OCFS2_LOCAL_QINFO_WRITE_CREDITS);
612 if (IS_ERR(handle)) { 617 if (IS_ERR(handle)) {
613 status = PTR_ERR(handle); 618 status = PTR_ERR(handle);
614 mlog_errno(status); 619 mlog_errno(status);
@@ -655,6 +660,9 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
655 struct ocfs2_quota_recovery *rec; 660 struct ocfs2_quota_recovery *rec;
656 int locked = 0; 661 int locked = 0;
657 662
 663 /* We don't need the lock here, and we have to acquire quota file
 664 * locks which will later depend on this lock */
665 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
658 info->dqi_maxblimit = 0x7fffffffffffffffLL; 666 info->dqi_maxblimit = 0x7fffffffffffffffLL;
659 info->dqi_maxilimit = 0x7fffffffffffffffLL; 667 info->dqi_maxilimit = 0x7fffffffffffffffLL;
660 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); 668 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
@@ -733,6 +741,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
733 goto out_err; 741 goto out_err;
734 } 742 }
735 743
744 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
736 return 0; 745 return 0;
737out_err: 746out_err:
738 if (oinfo) { 747 if (oinfo) {
@@ -746,6 +755,7 @@ out_err:
746 kfree(oinfo); 755 kfree(oinfo);
747 } 756 }
748 brelse(bh); 757 brelse(bh);
758 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
749 return -1; 759 return -1;
750} 760}
751 761
@@ -933,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
933 struct ocfs2_local_disk_chunk *dchunk; 943 struct ocfs2_local_disk_chunk *dchunk;
934 int status; 944 int status;
935 handle_t *handle; 945 handle_t *handle;
936 struct buffer_head *bh = NULL; 946 struct buffer_head *bh = NULL, *dbh = NULL;
937 u64 p_blkno; 947 u64 p_blkno;
938 948
939 /* We are protected by dqio_sem so no locking needed */ 949 /* We are protected by dqio_sem so no locking needed */
@@ -957,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
957 mlog_errno(status); 967 mlog_errno(status);
958 goto out; 968 goto out;
959 } 969 }
 970 /* Credits: the local quota info block and the two new blocks we initialize */
971 handle = ocfs2_start_trans(OCFS2_SB(sb),
972 OCFS2_LOCAL_QINFO_WRITE_CREDITS +
973 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
974 if (IS_ERR(handle)) {
975 status = PTR_ERR(handle);
976 mlog_errno(status);
977 goto out;
978 }
960 979
980 /* Initialize chunk header */
961 down_read(&OCFS2_I(lqinode)->ip_alloc_sem); 981 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
962 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 982 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
963 &p_blkno, NULL, NULL); 983 &p_blkno, NULL, NULL);
964 up_read(&OCFS2_I(lqinode)->ip_alloc_sem); 984 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
965 if (status < 0) { 985 if (status < 0) {
966 mlog_errno(status); 986 mlog_errno(status);
967 goto out; 987 goto out_trans;
968 } 988 }
969 bh = sb_getblk(sb, p_blkno); 989 bh = sb_getblk(sb, p_blkno);
970 if (!bh) { 990 if (!bh) {
971 status = -ENOMEM; 991 status = -ENOMEM;
972 mlog_errno(status); 992 mlog_errno(status);
973 goto out; 993 goto out_trans;
974 } 994 }
975 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 995 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
976 996 ocfs2_set_new_buffer_uptodate(lqinode, bh);
977 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
978 if (IS_ERR(handle)) {
979 status = PTR_ERR(handle);
980 mlog_errno(status);
981 goto out;
982 }
983
984 status = ocfs2_journal_access_dq(handle, lqinode, bh, 997 status = ocfs2_journal_access_dq(handle, lqinode, bh,
985 OCFS2_JOURNAL_ACCESS_WRITE); 998 OCFS2_JOURNAL_ACCESS_CREATE);
986 if (status < 0) { 999 if (status < 0) {
987 mlog_errno(status); 1000 mlog_errno(status);
988 goto out_trans; 1001 goto out_trans;
@@ -992,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
992 memset(dchunk->dqc_bitmap, 0, 1005 memset(dchunk->dqc_bitmap, 0,
993 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1006 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
994 OCFS2_QBLK_RESERVED_SPACE); 1007 OCFS2_QBLK_RESERVED_SPACE);
995 set_buffer_uptodate(bh);
996 unlock_buffer(bh); 1008 unlock_buffer(bh);
997 status = ocfs2_journal_dirty(handle, bh); 1009 status = ocfs2_journal_dirty(handle, bh);
998 if (status < 0) { 1010 if (status < 0) {
@@ -1000,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1000 goto out_trans; 1012 goto out_trans;
1001 } 1013 }
1002 1014
1015 /* Initialize new block with structures */
1016 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1017 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1018 &p_blkno, NULL, NULL);
1019 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1020 if (status < 0) {
1021 mlog_errno(status);
1022 goto out_trans;
1023 }
1024 dbh = sb_getblk(sb, p_blkno);
1025 if (!dbh) {
1026 status = -ENOMEM;
1027 mlog_errno(status);
1028 goto out_trans;
1029 }
1030 ocfs2_set_new_buffer_uptodate(lqinode, dbh);
1031 status = ocfs2_journal_access_dq(handle, lqinode, dbh,
1032 OCFS2_JOURNAL_ACCESS_CREATE);
1033 if (status < 0) {
1034 mlog_errno(status);
1035 goto out_trans;
1036 }
1037 lock_buffer(dbh);
1038 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1039 unlock_buffer(dbh);
1040 status = ocfs2_journal_dirty(handle, dbh);
1041 if (status < 0) {
1042 mlog_errno(status);
1043 goto out_trans;
1044 }
1045
1046 /* Update local quotafile info */
1003 oinfo->dqi_blocks += 2; 1047 oinfo->dqi_blocks += 2;
1004 oinfo->dqi_chunks++; 1048 oinfo->dqi_chunks++;
1005 status = ocfs2_local_write_info(sb, type); 1049 status = ocfs2_local_write_info(sb, type);
@@ -1024,6 +1068,7 @@ out_trans:
1024 ocfs2_commit_trans(OCFS2_SB(sb), handle); 1068 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1025out: 1069out:
1026 brelse(bh); 1070 brelse(bh);
1071 brelse(dbh);
1027 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); 1072 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
1028 return ERR_PTR(status); 1073 return ERR_PTR(status);
1029} 1074}
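
The rewritten ocfs2_local_quota_add_chunk() now starts its transaction before touching any buffer and reserves credits for everything it will dirty: the local quota info plus both freshly allocated blocks, with OCFS2_JOURNAL_ACCESS_CREATE telling the journal the block contents are new. A hedged sketch of the same start-early, credit-everything pattern in plain jbd2 terms (generic API, not the ocfs2 wrappers):

    handle_t *handle = jbd2_journal_start(journal, 1 + 2); /* info + 2 new blocks */
    if (IS_ERR(handle))
            return PTR_ERR(handle);

    jbd2_journal_get_create_access(handle, bh); /* CREATE: no old contents to save */
    lock_buffer(bh);
    memset(bh->b_data, 0, bh->b_size);
    unlock_buffer(bh);
    jbd2_journal_dirty_metadata(handle, bh);

    jbd2_journal_stop(handle);
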
@@ -1041,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1041 struct ocfs2_local_disk_chunk *dchunk; 1086 struct ocfs2_local_disk_chunk *dchunk;
1042 int epb = ol_quota_entries_per_block(sb); 1087 int epb = ol_quota_entries_per_block(sb);
1043 unsigned int chunk_blocks; 1088 unsigned int chunk_blocks;
1089 struct buffer_head *bh;
1090 u64 p_blkno;
1044 int status; 1091 int status;
1045 handle_t *handle; 1092 handle_t *handle;
1046 1093
@@ -1068,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1068 mlog_errno(status); 1115 mlog_errno(status);
1069 goto out; 1116 goto out;
1070 } 1117 }
1071 handle = ocfs2_start_trans(OCFS2_SB(sb), 2); 1118
1119 /* Get buffer from the just added block */
1120 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1121 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1122 &p_blkno, NULL, NULL);
1123 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1124 if (status < 0) {
1125 mlog_errno(status);
1126 goto out;
1127 }
1128 bh = sb_getblk(sb, p_blkno);
1129 if (!bh) {
1130 status = -ENOMEM;
1131 mlog_errno(status);
1132 goto out;
1133 }
1134 ocfs2_set_new_buffer_uptodate(lqinode, bh);
1135
1136 /* Local quota info, chunk header and the new block we initialize */
1137 handle = ocfs2_start_trans(OCFS2_SB(sb),
1138 OCFS2_LOCAL_QINFO_WRITE_CREDITS +
1139 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
1072 if (IS_ERR(handle)) { 1140 if (IS_ERR(handle)) {
1073 status = PTR_ERR(handle); 1141 status = PTR_ERR(handle);
1074 mlog_errno(status); 1142 mlog_errno(status);
1075 goto out; 1143 goto out;
1076 } 1144 }
1145 /* Zero created block */
1146 status = ocfs2_journal_access_dq(handle, lqinode, bh,
1147 OCFS2_JOURNAL_ACCESS_CREATE);
1148 if (status < 0) {
1149 mlog_errno(status);
1150 goto out_trans;
1151 }
1152 lock_buffer(bh);
1153 memset(bh->b_data, 0, sb->s_blocksize);
1154 unlock_buffer(bh);
1155 status = ocfs2_journal_dirty(handle, bh);
1156 if (status < 0) {
1157 mlog_errno(status);
1158 goto out_trans;
1159 }
1160 /* Update chunk header */
1077 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, 1161 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
1078 OCFS2_JOURNAL_ACCESS_WRITE); 1162 OCFS2_JOURNAL_ACCESS_WRITE);
1079 if (status < 0) { 1163 if (status < 0) {
@@ -1090,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1090 mlog_errno(status); 1174 mlog_errno(status);
1091 goto out_trans; 1175 goto out_trans;
1092 } 1176 }
1177 /* Update file header */
1093 oinfo->dqi_blocks++; 1178 oinfo->dqi_blocks++;
1094 status = ocfs2_local_write_info(sb, type); 1179 status = ocfs2_local_write_info(sb, type);
1095 if (status < 0) { 1180 if (status < 0) {
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index fcd120f1493a..e49c41050264 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
17 * General Public License for more details. 17 * General Public License for more details.
18 */ 18 */
19 19
20#include <linux/kernel.h>
20#include <linux/crc32.h> 21#include <linux/crc32.h>
21#include <linux/module.h> 22#include <linux/module.h>
22 23
@@ -153,7 +154,7 @@ static int status_map[] = {
153 154
154static int dlm_status_to_errno(enum dlm_status status) 155static int dlm_status_to_errno(enum dlm_status status)
155{ 156{
156 BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); 157 BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
157 158
158 return status_map[status]; 159 return status_map[status];
159} 160}
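
The old assertion open-coded ARRAY_SIZE() and used '>', which rejects neither a negative status nor the one-past-the-end index. The corrected guard is the canonical one for any table lookup; a standalone illustration:

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const int status_map[8] = { /* ... */ };

    int lookup(int idx)
    {
            /* idx == 8 slipped past the old "idx > ARRAY_SIZE" test */
            if (idx < 0 || idx >= ARRAY_SIZE(status_map))
                    return -1;
            return status_map[idx];
    }
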
@@ -236,6 +237,16 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
236 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 237 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
237} 238}
238 239
240/*
241 * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB
242 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents.
244 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
246{
247 return 1;
248}
249
239static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
240{ 251{
241 return (void *)(lksb->lksb_o2dlm.lvb); 252 return (void *)(lksb->lksb_o2dlm.lvb);
@@ -354,6 +365,7 @@ static struct ocfs2_stack_operations o2cb_stack_ops = {
354 .dlm_lock = o2cb_dlm_lock, 365 .dlm_lock = o2cb_dlm_lock,
355 .dlm_unlock = o2cb_dlm_unlock, 366 .dlm_unlock = o2cb_dlm_unlock,
356 .lock_status = o2cb_dlm_lock_status, 367 .lock_status = o2cb_dlm_lock_status,
368 .lvb_valid = o2cb_dlm_lvb_valid,
357 .lock_lvb = o2cb_dlm_lvb, 369 .lock_lvb = o2cb_dlm_lvb,
358 .dump_lksb = o2cb_dump_lksb, 370 .dump_lksb = o2cb_dump_lksb,
359}; 371};
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76d41a8ac6..ff4c798a5635 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -738,6 +738,13 @@ static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
738 return lksb->lksb_fsdlm.sb_status; 738 return lksb->lksb_fsdlm.sb_status;
739} 739}
740 740
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
742{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744
745 return !invalid;
746}
747
741static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
742{ 749{
743 if (!lksb->lksb_fsdlm.sb_lvbptr) 750 if (!lksb->lksb_fsdlm.sb_lvbptr)
@@ -873,6 +880,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
873 .dlm_lock = user_dlm_lock, 880 .dlm_lock = user_dlm_lock,
874 .dlm_unlock = user_dlm_unlock, 881 .dlm_unlock = user_dlm_unlock,
875 .lock_status = user_dlm_lock_status, 882 .lock_status = user_dlm_lock_status,
883 .lvb_valid = user_dlm_lvb_valid,
876 .lock_lvb = user_dlm_lvb, 884 .lock_lvb = user_dlm_lvb,
877 .plock = user_plock, 885 .plock = user_plock,
878 .dump_lksb = user_dlm_dump_lksb, 886 .dump_lksb = user_dlm_dump_lksb,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 68b668b0e60a..3f2f1c45b7b6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -6,7 +6,7 @@
6 * Code which implements an OCFS2 specific interface to underlying 6 * Code which implements an OCFS2 specific interface to underlying
7 * cluster stacks. 7 * cluster stacks.
8 * 8 *
9 * Copyright (C) 2007 Oracle. All rights reserved. 9 * Copyright (C) 2007, 2009 Oracle. All rights reserved.
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public 12 * modify it under the terms of the GNU General Public
@@ -271,11 +271,12 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
271} 271}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 273
274/* 274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
275 * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we 275{
276 * don't cast at the glue level. The real answer is that the header 276 return active_stack->sp_ops->lvb_valid(lksb);
277 * ordering is nigh impossible. 277}
278 */ 278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279
279void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
280{ 281{
281 return active_stack->sp_ops->lock_lvb(lksb); 282 return active_stack->sp_ops->lock_lvb(lksb);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index c571af375ef8..03a44d60eac9 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -186,6 +186,11 @@ struct ocfs2_stack_operations {
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 186 int (*lock_status)(union ocfs2_dlm_lksb *lksb);
187 187
188 /* 188 /*
189 * Return non-zero if the LVB is valid.
190 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb);
192
193 /*
189 * Pull the lvb pointer off of the stack-specific lksb. 194 * Pull the lvb pointer off of the stack-specific lksb.
190 */ 195 */
191 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
@@ -252,6 +257,7 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
252 struct ocfs2_lock_res *astarg); 257 struct ocfs2_lock_res *astarg);
253 258
254int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb);
255void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
256void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
257 263
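
With .lvb_valid in the stack operations, ocfs2 can ask whether the lock value block survived the last recovery before trusting it: o2dlm always says yes because it zeroes a lost LVB, while fs/dlm flags the loss with DLM_SBF_VALNOTVALID. A hedged sketch of a caller-side use of the new predicate (the helper names here are illustrative, not the actual ocfs2 call site):

    /* Sketch: only consume the cached metadata the stack vouches for. */
    if (ocfs2_dlm_lvb_valid(&lockres->l_lksb))
            refresh_inode_from_lvb(inode, ocfs2_dlm_lvb(&lockres->l_lksb));
    else
            refresh_inode_from_disk(inode);     /* hypothetical slow path */
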
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8439f6b324b9..73a16d4666dc 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -923,14 +923,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
923 int nr) 923 int nr)
924{ 924{
925 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 925 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
926 int ret;
926 927
927 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) 928 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
928 return 0; 929 return 0;
929 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) 930
931 if (!buffer_jbd(bg_bh))
930 return 1; 932 return 1;
931 933
934 jbd_lock_bh_state(bg_bh);
932 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; 935 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
933 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); 936 if (bg)
937 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
938 else
939 ret = 1;
940 jbd_unlock_bh_state(bg_bh);
941
942 return ret;
934} 943}
935 944
936static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 945static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
@@ -1885,6 +1894,7 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1885 unsigned int tmp; 1894 unsigned int tmp;
1886 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1895 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1887 struct ocfs2_group_desc *undo_bg = NULL; 1896 struct ocfs2_group_desc *undo_bg = NULL;
1897 int cluster_bitmap = 0;
1888 1898
1889 mlog_entry_void(); 1899 mlog_entry_void();
1890 1900
@@ -1905,18 +1915,28 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 } 1915 }
1906 1916
1907 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1917 if (ocfs2_is_cluster_bitmap(alloc_inode))
1908 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; 1918 cluster_bitmap = 1;
1919
1920 if (cluster_bitmap) {
1921 jbd_lock_bh_state(group_bh);
1922 undo_bg = (struct ocfs2_group_desc *)
1923 bh2jh(group_bh)->b_committed_data;
1924 BUG_ON(!undo_bg);
1925 }
1909 1926
1910 tmp = num_bits; 1927 tmp = num_bits;
1911 while(tmp--) { 1928 while(tmp--) {
1912 ocfs2_clear_bit((bit_off + tmp), 1929 ocfs2_clear_bit((bit_off + tmp),
1913 (unsigned long *) bg->bg_bitmap); 1930 (unsigned long *) bg->bg_bitmap);
1914 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1931 if (cluster_bitmap)
1915 ocfs2_set_bit(bit_off + tmp, 1932 ocfs2_set_bit(bit_off + tmp,
1916 (unsigned long *) undo_bg->bg_bitmap); 1933 (unsigned long *) undo_bg->bg_bitmap);
1917 } 1934 }
1918 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 1935 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1919 1936
1937 if (cluster_bitmap)
1938 jbd_unlock_bh_state(group_bh);
1939
1920 status = ocfs2_journal_dirty(handle, group_bh); 1940 status = ocfs2_journal_dirty(handle, group_bh);
1921 if (status < 0) 1941 if (status < 0)
1922 mlog_errno(status); 1942 mlog_errno(status);
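
Both suballoc.c hunks close the same race: b_committed_data hangs off the buffer's journal_head and can be detached by the journal, so it must only be dereferenced under jbd_lock_bh_state(), and a NULL pointer now means "no committed copy, treat the bit as free". The safe access pattern, condensed:

    /* Sketch: touch b_committed_data only under the bh state lock. */
    jbd_lock_bh_state(bh);
    bg = (struct ocfs2_group_desc *)bh2jh(bh)->b_committed_data;
    ret = bg ? !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap) : 1;
    jbd_unlock_bh_state(bh);
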
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9d37e0..a3f8871d21fd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h> 44#include <linux/quotaops.h>
45#include <linux/smp_lock.h>
45 46
46#define MLOG_MASK_PREFIX ML_SUPER 47#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 48#include <cluster/masklog.h>
@@ -118,15 +119,16 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
118static int ocfs2_check_volume(struct ocfs2_super *osb); 119static int ocfs2_check_volume(struct ocfs2_super *osb);
119static int ocfs2_verify_volume(struct ocfs2_dinode *di, 120static int ocfs2_verify_volume(struct ocfs2_dinode *di,
120 struct buffer_head *bh, 121 struct buffer_head *bh,
121 u32 sectsize); 122 u32 sectsize,
123 struct ocfs2_blockcheck_stats *stats);
122static int ocfs2_initialize_super(struct super_block *sb, 124static int ocfs2_initialize_super(struct super_block *sb,
123 struct buffer_head *bh, 125 struct buffer_head *bh,
124 int sector_size); 126 int sector_size,
127 struct ocfs2_blockcheck_stats *stats);
125static int ocfs2_get_sector(struct super_block *sb, 128static int ocfs2_get_sector(struct super_block *sb,
126 struct buffer_head **bh, 129 struct buffer_head **bh,
127 int block, 130 int block,
128 int sect_size); 131 int sect_size);
129static void ocfs2_write_super(struct super_block *sb);
130static struct inode *ocfs2_alloc_inode(struct super_block *sb); 132static struct inode *ocfs2_alloc_inode(struct super_block *sb);
131static void ocfs2_destroy_inode(struct inode *inode); 133static void ocfs2_destroy_inode(struct inode *inode);
132static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); 134static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -141,7 +143,6 @@ static const struct super_operations ocfs2_sops = {
141 .clear_inode = ocfs2_clear_inode, 143 .clear_inode = ocfs2_clear_inode,
142 .delete_inode = ocfs2_delete_inode, 144 .delete_inode = ocfs2_delete_inode,
143 .sync_fs = ocfs2_sync_fs, 145 .sync_fs = ocfs2_sync_fs,
144 .write_super = ocfs2_write_super,
145 .put_super = ocfs2_put_super, 146 .put_super = ocfs2_put_super,
146 .remount_fs = ocfs2_remount, 147 .remount_fs = ocfs2_remount,
147 .show_options = ocfs2_show_options, 148 .show_options = ocfs2_show_options,
@@ -204,10 +205,10 @@ static const match_table_t tokens = {
204#ifdef CONFIG_DEBUG_FS 205#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) 206static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{ 207{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn; 208 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map; 209 struct ocfs2_recovery_map *rm = osb->recovery_map;
210 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
211 int i, out = 0;
211 212
212 out += snprintf(buf + out, len - out, 213 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 214 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -232,20 +233,24 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", 233 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum); 234 osb->s_mount_opt, osb->s_atime_quantum);
234 235
235 out += snprintf(buf + out, len - out, 236 if (cconn) {
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n", 237 out += snprintf(buf + out, len - out,
237 "Cluster", 238 "%10s => Stack: %s Name: %*s "
238 (*osb->osb_cluster_stack == '\0' ? 239 "Version: %d.%d\n", "Cluster",
239 "o2cb" : osb->osb_cluster_stack), 240 (*osb->osb_cluster_stack == '\0' ?
240 cconn->cc_namelen, cconn->cc_name, 241 "o2cb" : osb->osb_cluster_stack),
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor); 242 cconn->cc_namelen, cconn->cc_name,
243 cconn->cc_version.pv_major,
244 cconn->cc_version.pv_minor);
245 }
242 246
243 spin_lock(&osb->dc_task_lock); 247 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out, 248 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 249 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt", 250 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count, 251 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
248 osb->dc_wake_sequence, osb->dc_work_sequence); 252 osb->blocked_lock_count, osb->dc_wake_sequence,
253 osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock); 254 spin_unlock(&osb->dc_task_lock);
250 255
251 spin_lock(&osb->osb_lock); 256 spin_lock(&osb->osb_lock);
@@ -265,14 +270,15 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
265 270
266 out += snprintf(buf + out, len - out, 271 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", 272 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval, 273 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
274 osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint)); 275 atomic_read(&osb->needs_checkpoint));
270 276
271 out += snprintf(buf + out, len - out, 277 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n", 278 "%10s => State: %d TxnId: %lu NumTxns: %d\n",
273 "Journal", osb->journal->j_state, 279 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans), 280 osb->journal->j_trans_id,
275 osb->journal->j_trans_id); 281 atomic_read(&osb->journal->j_num_trans));
276 282
277 out += snprintf(buf + out, len - out, 283 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d " 284 "%10s => GlobalAllocs: %d LocalAllocs: %d "
@@ -298,9 +304,18 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
298 atomic_read(&osb->s_num_inodes_stolen)); 304 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock); 305 spin_unlock(&osb->osb_lock);
300 306
307 out += snprintf(buf + out, len - out, "OrphanScan => ");
308 out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
309 os->os_count, os->os_seqno);
310 out += snprintf(buf + out, len - out, " Last Scan: ");
311 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
312 out += snprintf(buf + out, len - out, "Disabled\n");
313 else
314 out += snprintf(buf + out, len - out, "%lu seconds ago\n",
315 (get_seconds() - os->os_scantime.tv_sec));
316
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", 317 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen"); 318 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) { 319 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out, 320 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n", 321 "%10s %c %3d %10d\n",
@@ -365,24 +380,12 @@ static struct file_operations ocfs2_osb_debug_fops = {
365 .llseek = generic_file_llseek, 380 .llseek = generic_file_llseek,
366}; 381};
367 382
368/*
369 * write_super and sync_fs ripped right out of ext3.
370 */
371static void ocfs2_write_super(struct super_block *sb)
372{
373 if (mutex_trylock(&sb->s_lock) != 0)
374 BUG();
375 sb->s_dirt = 0;
376}
377
378static int ocfs2_sync_fs(struct super_block *sb, int wait) 383static int ocfs2_sync_fs(struct super_block *sb, int wait)
379{ 384{
380 int status; 385 int status;
381 tid_t target; 386 tid_t target;
382 struct ocfs2_super *osb = OCFS2_SB(sb); 387 struct ocfs2_super *osb = OCFS2_SB(sb);
383 388
384 sb->s_dirt = 0;
385
386 if (ocfs2_is_hard_readonly(osb)) 389 if (ocfs2_is_hard_readonly(osb))
387 return -EROFS; 390 return -EROFS;
388 391
@@ -555,7 +558,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
555 */ 558 */
556 559
557#if BITS_PER_LONG == 32 560#if BITS_PER_LONG == 32
558# if defined(CONFIG_LBD) 561# if defined(CONFIG_LBDAF)
559 BUILD_BUG_ON(sizeof(sector_t) != 8); 562 BUILD_BUG_ON(sizeof(sector_t) != 8);
560 /* 563 /*
561 * We might be limited by page cache size. 564 * We might be limited by page cache size.
@@ -595,6 +598,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
595 struct mount_options parsed_options; 598 struct mount_options parsed_options;
596 struct ocfs2_super *osb = OCFS2_SB(sb); 599 struct ocfs2_super *osb = OCFS2_SB(sb);
597 600
601 lock_kernel();
602
598 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 603 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
599 ret = -EINVAL; 604 ret = -EINVAL;
600 goto out; 605 goto out;
@@ -698,12 +703,14 @@ unlock_osb:
698 ocfs2_set_journal_params(osb); 703 ocfs2_set_journal_params(osb);
699 } 704 }
700out: 705out:
706 unlock_kernel();
701 return ret; 707 return ret;
702} 708}
703 709
704static int ocfs2_sb_probe(struct super_block *sb, 710static int ocfs2_sb_probe(struct super_block *sb,
705 struct buffer_head **bh, 711 struct buffer_head **bh,
706 int *sector_size) 712 int *sector_size,
713 struct ocfs2_blockcheck_stats *stats)
707{ 714{
708 int status, tmpstat; 715 int status, tmpstat;
709 struct ocfs1_vol_disk_hdr *hdr; 716 struct ocfs1_vol_disk_hdr *hdr;
@@ -713,7 +720,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
713 *bh = NULL; 720 *bh = NULL;
714 721
715 /* may be > 512 */ 722 /* may be > 512 */
716 *sector_size = bdev_hardsect_size(sb->s_bdev); 723 *sector_size = bdev_logical_block_size(sb->s_bdev);
717 if (*sector_size > OCFS2_MAX_BLOCKSIZE) { 724 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
718 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", 725 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
719 *sector_size, OCFS2_MAX_BLOCKSIZE); 726 *sector_size, OCFS2_MAX_BLOCKSIZE);
@@ -769,7 +776,9 @@ static int ocfs2_sb_probe(struct super_block *sb,
769 goto bail; 776 goto bail;
770 } 777 }
771 di = (struct ocfs2_dinode *) (*bh)->b_data; 778 di = (struct ocfs2_dinode *) (*bh)->b_data;
772 status = ocfs2_verify_volume(di, *bh, blksize); 779 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
780 spin_lock_init(&stats->b_lock);
781 status = ocfs2_verify_volume(di, *bh, blksize, stats);
773 if (status >= 0) 782 if (status >= 0)
774 goto bail; 783 goto bail;
775 brelse(*bh); 784 brelse(*bh);
@@ -975,6 +984,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
975 struct ocfs2_super *osb = NULL; 984 struct ocfs2_super *osb = NULL;
976 struct buffer_head *bh = NULL; 985 struct buffer_head *bh = NULL;
977 char nodestr[8]; 986 char nodestr[8];
987 struct ocfs2_blockcheck_stats stats;
978 988
979 mlog_entry("%p, %p, %i", sb, data, silent); 989 mlog_entry("%p, %p, %i", sb, data, silent);
980 990
@@ -984,13 +994,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
984 } 994 }
985 995
986 /* probe for superblock */ 996 /* probe for superblock */
987 status = ocfs2_sb_probe(sb, &bh, &sector_size); 997 status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
988 if (status < 0) { 998 if (status < 0) {
989 mlog(ML_ERROR, "superblock probe failed!\n"); 999 mlog(ML_ERROR, "superblock probe failed!\n");
990 goto read_super_error; 1000 goto read_super_error;
991 } 1001 }
992 1002
993 status = ocfs2_initialize_super(sb, bh, sector_size); 1003 status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
994 osb = OCFS2_SB(sb); 1004 osb = OCFS2_SB(sb);
995 if (status < 0) { 1005 if (status < 0) {
996 mlog_errno(status); 1006 mlog_errno(status);
@@ -1100,6 +1110,18 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1100 goto read_super_error; 1110 goto read_super_error;
1101 } 1111 }
1102 1112
1113 if (ocfs2_meta_ecc(osb)) {
1114 status = ocfs2_blockcheck_stats_debugfs_install(
1115 &osb->osb_ecc_stats,
1116 osb->osb_debug_root);
1117 if (status) {
1118 mlog(ML_ERROR,
1119 "Unable to create blockcheck statistics "
1120 "files\n");
1121 goto read_super_error;
1122 }
1123 }
1124
1103 status = ocfs2_mount_volume(sb); 1125 status = ocfs2_mount_volume(sb);
1104 if (osb->root_inode) 1126 if (osb->root_inode)
1105 inode = igrab(osb->root_inode); 1127 inode = igrab(osb->root_inode);
@@ -1160,6 +1182,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1160 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); 1182 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
1161 wake_up(&osb->osb_mount_event); 1183 wake_up(&osb->osb_mount_event);
1162 1184
1185 /* Start this when the mount is almost sure of being successful */
1186 ocfs2_orphan_scan_start(osb);
1187
1163 mlog_exit(status); 1188 mlog_exit(status);
1164 return status; 1189 return status;
1165 1190
@@ -1189,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
1189 mnt); 1214 mnt);
1190} 1215}
1191 1216
1217static void ocfs2_kill_sb(struct super_block *sb)
1218{
1219 struct ocfs2_super *osb = OCFS2_SB(sb);
1220
1221 /* Failed mount? */
1222 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1223 goto out;
1224
1225 /* Prevent further queueing of inode drop events */
1226 spin_lock(&dentry_list_lock);
1227 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1228 spin_unlock(&dentry_list_lock);
1229 /* Wait for work to finish and/or remove it */
1230 cancel_work_sync(&osb->dentry_lock_work);
1231out:
1232 kill_block_super(sb);
1233}
1234
1192static struct file_system_type ocfs2_fs_type = { 1235static struct file_system_type ocfs2_fs_type = {
1193 .owner = THIS_MODULE, 1236 .owner = THIS_MODULE,
1194 .name = "ocfs2", 1237 .name = "ocfs2",
1195 .get_sb = ocfs2_get_sb, /* is this called when we mount 1238 .get_sb = ocfs2_get_sb, /* is this called when we mount
1196 * the fs? */ 1239 * the fs? */
1197 .kill_sb = kill_block_super, /* set to the generic one 1240 .kill_sb = ocfs2_kill_sb,
1198 * right now, but do we 1241
1199 * need to change that? */
1200 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1242 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1201 .next = NULL 1243 .next = NULL
1202}; 1244};
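
Swapping kill_block_super for a filesystem-specific kill_sb lets ocfs2 flush its dentry-lock work before the generic teardown tears the superblock out from under it. The wrapper shape generalizes; a sketch with hypothetical names:

    /* Sketch: fs-specific quiesce, then the stock block-device teardown. */
    static void myfs_kill_sb(struct super_block *sb)
    {
            struct myfs_sb_info *sbi = sb->s_fs_info;   /* hypothetical */

            if (sbi)
                    cancel_work_sync(&sbi->cleanup_work);
            kill_block_super(sb);
    }
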
@@ -1550,9 +1592,13 @@ static void ocfs2_put_super(struct super_block *sb)
1550{ 1592{
1551 mlog_entry("(0x%p)\n", sb); 1593 mlog_entry("(0x%p)\n", sb);
1552 1594
1595 lock_kernel();
1596
1553 ocfs2_sync_blockdev(sb); 1597 ocfs2_sync_blockdev(sb);
1554 ocfs2_dismount_volume(sb, 0); 1598 ocfs2_dismount_volume(sb, 0);
1555 1599
1600 unlock_kernel();
1601
1556 mlog_exit_void(); 1602 mlog_exit_void();
1557} 1603}
1558 1604
@@ -1766,13 +1812,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
1766 } 1812 }
1767 1813
1768 status = ocfs2_truncate_log_init(osb); 1814 status = ocfs2_truncate_log_init(osb);
1769 if (status < 0) { 1815 if (status < 0)
1770 mlog_errno(status); 1816 mlog_errno(status);
1771 goto leave;
1772 }
1773
1774 if (ocfs2_mount_local(osb))
1775 goto leave;
1776 1817
1777leave: 1818leave:
1778 if (unlock_super) 1819 if (unlock_super)
@@ -1796,6 +1837,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1796 1837
1797 debugfs_remove(osb->osb_ctxt); 1838 debugfs_remove(osb->osb_ctxt);
1798 1839
1840 /*
1841 * Flush inode dropping work queue so that deletes are
1842 * performed while the filesystem is still working
1843 */
1844 ocfs2_drop_all_dl_inodes(osb);
1845
1846 /* Orphan scan should be stopped as early as possible */
1847 ocfs2_orphan_scan_stop(osb);
1848
1799 ocfs2_disable_quotas(osb); 1849 ocfs2_disable_quotas(osb);
1800 1850
1801 ocfs2_shutdown_local_alloc(osb); 1851 ocfs2_shutdown_local_alloc(osb);
@@ -1839,6 +1889,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1839 if (osb->cconn) 1889 if (osb->cconn)
1840 ocfs2_dlm_shutdown(osb, hangup_needed); 1890 ocfs2_dlm_shutdown(osb, hangup_needed);
1841 1891
1892 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
1842 debugfs_remove(osb->osb_debug_root); 1893 debugfs_remove(osb->osb_debug_root);
1843 1894
1844 if (hangup_needed) 1895 if (hangup_needed)
@@ -1886,7 +1937,8 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1886 1937
1887static int ocfs2_initialize_super(struct super_block *sb, 1938static int ocfs2_initialize_super(struct super_block *sb,
1888 struct buffer_head *bh, 1939 struct buffer_head *bh,
1889 int sector_size) 1940 int sector_size,
1941 struct ocfs2_blockcheck_stats *stats)
1890{ 1942{
1891 int status; 1943 int status;
1892 int i, cbits, bbits; 1944 int i, cbits, bbits;
@@ -1945,11 +1997,16 @@ static int ocfs2_initialize_super(struct super_block *sb,
1945 atomic_set(&osb->alloc_stats.bg_allocs, 0); 1997 atomic_set(&osb->alloc_stats.bg_allocs, 0);
1946 atomic_set(&osb->alloc_stats.bg_extends, 0); 1998 atomic_set(&osb->alloc_stats.bg_extends, 0);
1947 1999
2000 /* Copy the blockcheck stats from the superblock probe */
2001 osb->osb_ecc_stats = *stats;
2002
1948 ocfs2_init_node_maps(osb); 2003 ocfs2_init_node_maps(osb);
1949 2004
1950 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2005 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1951 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2006 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1952 2007
2008 ocfs2_orphan_scan_init(osb);
2009
1953 status = ocfs2_recovery_init(osb); 2010 status = ocfs2_recovery_init(osb);
1954 if (status) { 2011 if (status) {
1955 mlog(ML_ERROR, "Unable to initialize recovery state\n"); 2012 mlog(ML_ERROR, "Unable to initialize recovery state\n");
@@ -2175,7 +2232,8 @@ bail:
2175 */ 2232 */
2176static int ocfs2_verify_volume(struct ocfs2_dinode *di, 2233static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2177 struct buffer_head *bh, 2234 struct buffer_head *bh,
2178 u32 blksz) 2235 u32 blksz,
2236 struct ocfs2_blockcheck_stats *stats)
2179{ 2237{
2180 int status = -EAGAIN; 2238 int status = -EAGAIN;
2181 2239
@@ -2188,7 +2246,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2188 OCFS2_FEATURE_INCOMPAT_META_ECC) { 2246 OCFS2_FEATURE_INCOMPAT_META_ECC) {
2189 status = ocfs2_block_check_validate(bh->b_data, 2247 status = ocfs2_block_check_validate(bh->b_data,
2190 bh->b_size, 2248 bh->b_size,
2191 &di->i_check); 2249 &di->i_check,
2250 stats);
2192 if (status) 2251 if (status)
2193 goto out; 2252 goto out;
2194 } 2253 }
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index ab713ebdd546..40e53702948c 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -50,6 +50,10 @@ static inline int is_in_system_inode_array(struct ocfs2_super *osb,
50 int type, 50 int type,
51 u32 slot); 51 u32 slot);
52 52
53#ifdef CONFIG_DEBUG_LOCK_ALLOC
54static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
55#endif
56
53static inline int is_global_system_inode(int type) 57static inline int is_global_system_inode(int type)
54{ 58{
55 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && 59 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
@@ -118,6 +122,21 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
118 inode = NULL; 122 inode = NULL;
119 goto bail; 123 goto bail;
120 } 124 }
125#ifdef CONFIG_DEBUG_LOCK_ALLOC
126 if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
127 type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
128 type == JOURNAL_SYSTEM_INODE) {
129 /* Ignore inode lock on these inodes as the lock does not
130 * really belong to any process and lockdep cannot handle
131 * that */
132 OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
133 } else {
134 lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.
135 l_lockdep_map,
136 ocfs2_system_inodes[type].si_name,
137 &ocfs2_sysfile_cluster_lock_key[type], 0);
138 }
139#endif
121bail: 140bail:
122 141
123 return inode; 142 return inode;
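
The sysfile.c hunk gives every system-inode type its own static lock_class_key, so lockdep can distinguish, say, the per-slot local alloc inode locks, while quota and journal inodes opt out because their cluster locks are held across process lifetimes, which lockdep cannot model. The per-type re-keying pattern in isolation (all names here are illustrative):

    #ifdef CONFIG_DEBUG_LOCK_ALLOC
    static struct lock_class_key my_keys[NUM_TYPES];    /* hypothetical */

    /* Re-key the lockdep map per object class rather than per object. */
    lockdep_init_map(&obj->lockres.l_lockdep_map,
                     type_names[type], &my_keys[type], 0);
    #endif
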
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 15631019dc63..d1a27cda984f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1052 struct ocfs2_xattr_block *xb; 1052 struct ocfs2_xattr_block *xb;
1053 struct ocfs2_xattr_value_root *xv; 1053 struct ocfs2_xattr_value_root *xv;
1054 size_t size; 1054 size_t size;
1055 int ret = -ENODATA, name_offset, name_len, block_off, i; 1055 int ret = -ENODATA, name_offset, name_len, i;
1056 int uninitialized_var(block_off);
1056 1057
1057 xs->bucket = ocfs2_xattr_bucket_new(inode); 1058 xs->bucket = ocfs2_xattr_bucket_new(inode);
1058 if (!xs->bucket) { 1059 if (!xs->bucket) {
@@ -3154,7 +3155,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
3154 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); 3155 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
3155 if (func) { 3156 if (func) {
3156 ret = func(inode, bucket, para); 3157 ret = func(inode, bucket, para);
3157 if (ret) 3158 if (ret && ret != -ERANGE)
3158 mlog_errno(ret); 3159 mlog_errno(ret);
3159 /* Fall through to bucket_relse() */ 3160 /* Fall through to bucket_relse() */
3160 } 3161 }
@@ -3261,7 +3262,8 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3261 ocfs2_list_xattr_bucket, 3262 ocfs2_list_xattr_bucket,
3262 &xl); 3263 &xl);
3263 if (ret) { 3264 if (ret) {
3264 mlog_errno(ret); 3265 if (ret != -ERANGE)
3266 mlog_errno(ret);
3265 goto out; 3267 goto out;
3266 } 3268 }
3267 3269
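
The xattr hunks stop logging -ERANGE because it is an expected answer rather than a fault: listxattr()-style callers routinely probe with a buffer that is too small. The standard userspace two-call pattern that produces it:

    #include <sys/xattr.h>
    #include <stdlib.h>

    /* Probe, allocate, retry: -1/ERANGE on a short buffer is normal. */
    ssize_t need = listxattr(path, NULL, 0);    /* size query */
    if (need < 0)
            return -1;
    char *names = malloc(need);
    if (!names)
            return -1;
    need = listxattr(path, names, need);        /* can still race to ERANGE */
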
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 834b2331f6b3..d17e774eaf45 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -11,21 +11,6 @@
11#include <linux/mpage.h> 11#include <linux/mpage.h>
12#include "omfs.h" 12#include "omfs.h"
13 13
14static int omfs_sync_file(struct file *file, struct dentry *dentry,
15 int datasync)
16{
17 struct inode *inode = dentry->d_inode;
18 int err;
19
20 err = sync_mapping_buffers(inode->i_mapping);
21 if (!(inode->i_state & I_DIRTY))
22 return err;
23 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
24 return err;
25 err |= omfs_sync_inode(inode);
26 return err ? -EIO : 0;
27}
28
29static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) 14static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
30{ 15{
31 return (sbi->s_sys_blocksize - offset - 16 return (sbi->s_sys_blocksize - offset -
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = {
344 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
345 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
346 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
347 .fsync = omfs_sync_file, 332 .fsync = simple_fsync,
348 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
349}; 334};
350 335
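
omfs_sync_file() was an open-coded copy of what the VFS now provides as simple_fsync(), so the file_operations simply point at the shared helper. From the fs/libfs.c of this era (reproduced from memory, so treat as approximate), the helper does the same buffer flush plus an inode writeback:

    int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
    {
            struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL };
            struct inode *inode = dentry->d_inode;
            int err, ret;

            ret = sync_mapping_buffers(inode->i_mapping);
            if (!(inode->i_state & I_DIRTY))
                    return ret;
            if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                    return ret;
            err = sync_inode(inode, &wbc);
            return ret ? ret : err;
    }
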
diff --git a/fs/open.c b/fs/open.c
index bdfbf03615a4..dd98e8076024 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -378,63 +378,63 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
378#endif 378#endif
379#endif /* BITS_PER_LONG == 32 */ 379#endif /* BITS_PER_LONG == 32 */
380 380
381SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 381
382int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
382{ 383{
383 struct file *file; 384 struct inode *inode = file->f_path.dentry->d_inode;
384 struct inode *inode; 385 long ret;
385 long ret = -EINVAL;
386 386
387 if (offset < 0 || len <= 0) 387 if (offset < 0 || len <= 0)
388 goto out; 388 return -EINVAL;
389 389
390 /* Return error if mode is not supported */ 390 /* Return error if mode is not supported */
391 ret = -EOPNOTSUPP;
392 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 391 if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
393 goto out; 392 return -EOPNOTSUPP;
394 393
395 ret = -EBADF;
396 file = fget(fd);
397 if (!file)
398 goto out;
399 if (!(file->f_mode & FMODE_WRITE)) 394 if (!(file->f_mode & FMODE_WRITE))
400 goto out_fput; 395 return -EBADF;
401 /* 396 /*
402 * Revalidate the write permissions, in case security policy has 397 * Revalidate the write permissions, in case security policy has
403 * changed since the files were opened. 398 * changed since the files were opened.
404 */ 399 */
405 ret = security_file_permission(file, MAY_WRITE); 400 ret = security_file_permission(file, MAY_WRITE);
406 if (ret) 401 if (ret)
407 goto out_fput; 402 return ret;
408 403
409 inode = file->f_path.dentry->d_inode;
410
411 ret = -ESPIPE;
412 if (S_ISFIFO(inode->i_mode)) 404 if (S_ISFIFO(inode->i_mode))
413 goto out_fput; 405 return -ESPIPE;
414 406
415 ret = -ENODEV;
416 /* 407 /*
417 * Let individual file system decide if it supports preallocation 408 * Let individual file system decide if it supports preallocation
418 * for directories or not. 409 * for directories or not.
419 */ 410 */
420 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 411 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
421 goto out_fput; 412 return -ENODEV;
422 413
423 ret = -EFBIG;
424 /* Check for wrap through zero too */ 414 /* Check for wrap through zero too */
425 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 415 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
426 goto out_fput; 416 return -EFBIG;
427 417
428 if (inode->i_op->fallocate) 418 if (!inode->i_op->fallocate)
429 ret = inode->i_op->fallocate(inode, mode, offset, len); 419 return -EOPNOTSUPP;
430 else
431 ret = -EOPNOTSUPP;
432 420
433out_fput: 421 return inode->i_op->fallocate(inode, mode, offset, len);
434 fput(file);
435out:
436 return ret;
437} 422}
423
424SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
425{
426 struct file *file;
427 int error = -EBADF;
428
429 file = fget(fd);
430 if (file) {
431 error = do_fallocate(file, mode, offset, len);
432 fput(file);
433 }
434
435 return error;
436}
437
438#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 438#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
439asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) 439asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
440{ 440{
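
Splitting the syscall body out as do_fallocate() gives in-kernel code a struct-file-based entry point that keeps every permission and range check of the fallocate(2) path. A hedged sketch of an in-kernel caller (illustrative; the real consumers arrived in later patches):

    /* Sketch: preallocate from kernel code once a struct file is in hand. */
    long ret = do_fallocate(filp, FALLOC_FL_KEEP_SIZE, pos, count);
    if (ret)
            printk(KERN_WARNING "preallocation failed: %ld\n", ret);
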
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
612 612
613 audit_inode(NULL, dentry); 613 audit_inode(NULL, dentry);
614 614
615 err = mnt_want_write(file->f_path.mnt); 615 err = mnt_want_write_file(file);
616 if (err) 616 if (err)
617 goto out_putf; 617 goto out_putf;
618 mutex_lock(&inode->i_mutex); 618 mutex_lock(&inode->i_mutex);
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
761 if (!file) 761 if (!file)
762 goto out; 762 goto out;
763 763
764 error = mnt_want_write(file->f_path.mnt); 764 error = mnt_want_write_file(file);
765 if (error) 765 if (error)
766 goto out_fput; 766 goto out_fput;
767 dentry = file->f_path.dentry; 767 dentry = file->f_path.dentry;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 99e33ef40be4..ea4e6cb29e13 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
219 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 219 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
220} 220}
221 221
222ssize_t part_alignment_offset_show(struct device *dev,
223 struct device_attribute *attr, char *buf)
224{
225 struct hd_struct *p = dev_to_part(dev);
226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
227}
228
222ssize_t part_stat_show(struct device *dev, 229ssize_t part_stat_show(struct device *dev,
223 struct device_attribute *attr, char *buf) 230 struct device_attribute *attr, char *buf)
224{ 231{
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
272static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 279static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
273static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 280static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
274static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
275static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
276#ifdef CONFIG_FAIL_MAKE_REQUEST 284#ifdef CONFIG_FAIL_MAKE_REQUEST
277static struct device_attribute dev_attr_fail = 285static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
282 &dev_attr_partition.attr, 290 &dev_attr_partition.attr,
283 &dev_attr_start.attr, 291 &dev_attr_start.attr,
284 &dev_attr_size.attr, 292 &dev_attr_size.attr,
293 &dev_attr_alignment_offset.attr,
285 &dev_attr_stat.attr, 294 &dev_attr_stat.attr,
286#ifdef CONFIG_FAIL_MAKE_REQUEST 295#ifdef CONFIG_FAIL_MAKE_REQUEST
287 &dev_attr_fail.attr, 296 &dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
383 pdev = part_to_dev(p); 392 pdev = part_to_dev(p);
384 393
385 p->start_sect = start; 394 p->start_sect = start;
395 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
386 p->nr_sects = len; 396 p->nr_sects = len;
387 p->partno = partno; 397 p->partno = partno;
388 p->policy = get_disk_ro(disk); 398 p->policy = get_disk_ro(disk);
@@ -426,7 +436,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
426 rcu_assign_pointer(ptbl->part[partno], p); 436 rcu_assign_pointer(ptbl->part[partno], p);
427 437
428 /* suppress uevent if the disk suppresses it */ 438 /* suppress uevent if the disk suppresses it */
429 if (!dev_get_uevent_suppress(pdev)) 439 if (!dev_get_uevent_suppress(ddev))
430 kobject_uevent(&pdev->kobj, KOBJ_ADD); 440 kobject_uevent(&pdev->kobj, KOBJ_ADD);
431 441
432 return p; 442 return p;
@@ -546,27 +556,49 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
546 556
547 /* add partitions */ 557 /* add partitions */
548 for (p = 1; p < state->limit; p++) { 558 for (p = 1; p < state->limit; p++) {
549 sector_t size = state->parts[p].size; 559 sector_t size, from;
550 sector_t from = state->parts[p].from; 560try_scan:
561 size = state->parts[p].size;
551 if (!size) 562 if (!size)
552 continue; 563 continue;
564
565 from = state->parts[p].from;
553 if (from >= get_capacity(disk)) { 566 if (from >= get_capacity(disk)) {
554 printk(KERN_WARNING 567 printk(KERN_WARNING
555 "%s: p%d ignored, start %llu is behind the end of the disk\n", 568 "%s: p%d ignored, start %llu is behind the end of the disk\n",
556 disk->disk_name, p, (unsigned long long) from); 569 disk->disk_name, p, (unsigned long long) from);
557 continue; 570 continue;
558 } 571 }
572
559 if (from + size > get_capacity(disk)) { 573 if (from + size > get_capacity(disk)) {
560 /* 574 struct block_device_operations *bdops = disk->fops;
561 * we can not ignore partitions of broken tables 575 unsigned long long capacity;
562 * created by for example camera firmware, but we 576
563 * limit them to the end of the disk to avoid
564 * creating invalid block devices
565 */
566 printk(KERN_WARNING 577 printk(KERN_WARNING
567 "%s: p%d size %llu limited to end of disk\n", 578 "%s: p%d size %llu exceeds device capacity, ",
568 disk->disk_name, p, (unsigned long long) size); 579 disk->disk_name, p, (unsigned long long) size);
569 size = get_capacity(disk) - from; 580
581 if (bdops->set_capacity &&
582 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
583 printk(KERN_CONT "enabling native capacity\n");
584 capacity = bdops->set_capacity(disk, ~0ULL);
585 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
586 if (capacity > get_capacity(disk)) {
587 set_capacity(disk, capacity);
588 check_disk_size_change(disk, bdev);
589 bdev->bd_invalidated = 0;
590 }
591 goto try_scan;
592 } else {
593 /*
594 * we can not ignore partitions of broken tables
595 * created by for example camera firmware, but
596 * we limit them to the end of the disk to avoid
597 * creating invalid block devices
598 */
599 printk(KERN_CONT "limited to end of disk\n");
600 size = get_capacity(disk) - from;
601 }
570 } 602 }
571 part = add_partition(disk, p, from, size, 603 part = add_partition(disk, p, from, size,
572 state->parts[p].flags); 604 state->parts[p].flags);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 46297683cd34..fc71aab08460 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
76 Sector sect; 76 Sector sect;
77 77
78 res = 0; 78 res = 0;
79 blocksize = bdev_hardsect_size(bdev); 79 blocksize = bdev_logical_block_size(bdev);
80 if (blocksize <= 0) 80 if (blocksize <= 0)
81 goto out_exit; 81 goto out_exit;
82 i_size = i_size_read(bdev->bd_inode); 82 i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 796511886f28..0028d2ef0662 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
110 Sector sect; 110 Sector sect;
111 unsigned char *data; 111 unsigned char *data;
112 u32 this_sector, this_size; 112 u32 this_sector, this_size;
113 int sector_size = bdev_hardsect_size(bdev) / 512; 113 int sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 114 int loopct = 0; /* number of links followed
115 without finding a data partition */ 115 without finding a data partition */
116 int i; 116 int i;
@@ -415,7 +415,7 @@ static struct {
415 415
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 417{
418 int sector_size = bdev_hardsect_size(bdev) / 512; 418 int sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 419 Sector sect;
420 unsigned char *data; 420 unsigned char *data;
421 struct partition *p; 421 struct partition *p;
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec45b8d..52c415114838 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else { 70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 71 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 72 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
73 } 73 }
74} 74}
75 75
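
The pipe_double_lock() change is a classic ABBA repair: the first lock taken must always carry the I_MUTEX_PARENT class and the second I_MUTEX_CHILD, otherwise lockdep sees the same two classes acquired in both orders. The same idea in its generic, address-ordered form:

    /* Sketch: impose one global order (here, by address) on a lock pair. */
    static void double_lock(struct mutex *a, struct mutex *b)
    {
            if (a < b) {
                    mutex_lock(a);
                    mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
            } else {
                    mutex_lock(b);
                    mutex_lock_nested(a, SINGLE_DEPTH_NESTING);
            }
    }
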
@@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
302 return 0; 302 return 0;
303} 303}
304 304
305/**
306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
307 * @pipe: the pipe that the buffer belongs to
308 * @buf: the buffer to put a reference to
309 *
310 * Description:
311 * This function releases a reference to @buf.
312 */
313void generic_pipe_buf_release(struct pipe_inode_info *pipe,
314 struct pipe_buffer *buf)
315{
316 page_cache_release(buf->page);
317}
318
305static const struct pipe_buf_operations anon_pipe_buf_ops = { 319static const struct pipe_buf_operations anon_pipe_buf_ops = {
306 .can_merge = 1, 320 .can_merge = 1,
307 .map = generic_pipe_buf_map, 321 .map = generic_pipe_buf_map,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 63d965193b22..11a7b5c68153 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -18,6 +18,7 @@ proc-y += meminfo.o
18proc-y += stat.o 18proc-y += stat.o
19proc-y += uptime.o 19proc-y += uptime.o
20proc-y += version.o 20proc-y += version.o
21proc-y += softirqs.o
21proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 22proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
22proc-$(CONFIG_NET) += proc_net.o 23proc-$(CONFIG_NET) += proc_net.o
23proc-$(CONFIG_PROC_KCORE) += kcore.o 24proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3326bbf9ab95..6f742f6658a9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task)
234 234
235struct mm_struct *mm_for_maps(struct task_struct *task) 235struct mm_struct *mm_for_maps(struct task_struct *task)
236{ 236{
237 struct mm_struct *mm = get_task_mm(task); 237 struct mm_struct *mm;
238 if (!mm) 238
239 if (mutex_lock_killable(&task->cred_guard_mutex))
239 return NULL; 240 return NULL;
240 down_read(&mm->mmap_sem); 241
241 task_lock(task); 242 mm = get_task_mm(task);
242 if (task->mm != mm) 243 if (mm && mm != current->mm &&
243 goto out; 244 !ptrace_may_access(task, PTRACE_MODE_READ)) {
244 if (task->mm != current->mm && 245 mmput(mm);
245 __ptrace_may_access(task, PTRACE_MODE_READ) < 0) 246 mm = NULL;
246 goto out; 247 }
247 task_unlock(task); 248 mutex_unlock(&task->cred_guard_mutex);
249
248 return mm; 250 return mm;
249out:
250 task_unlock(task);
251 up_read(&mm->mmap_sem);
252 mmput(mm);
253 return NULL;
254} 251}
255 252
256static int proc_pid_cmdline(struct task_struct *task, char * buffer) 253static int proc_pid_cmdline(struct task_struct *task, char * buffer)
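
The rewritten mm_for_maps() serializes against exec through cred_guard_mutex and performs the ptrace check without task_lock or mmap_sem, so it now hands back a bare mm reference; callers take mmap_sem themselves. A hedged sketch of the resulting calling convention:

    /* Sketch: consuming mm_for_maps() after this change. */
    struct mm_struct *mm = mm_for_maps(task);
    if (!mm)
            return -ESRCH;          /* no mm, or permission denied */
    down_read(&mm->mmap_sem);
    /* ... walk mm->mmap ... */
    up_read(&mm->mmap_sem);
    mmput(mm);
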
@@ -2128,9 +2125,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2128 if (copy_from_user(page, buf, count)) 2125 if (copy_from_user(page, buf, count))
2129 goto out_free; 2126 goto out_free;
2130 2127
2128 /* Guard against adverse ptrace interaction */
2129 length = mutex_lock_interruptible(&task->cred_guard_mutex);
2130 if (length < 0)
2131 goto out_free;
2132
2131 length = security_setprocattr(task, 2133 length = security_setprocattr(task,
2132 (char*)file->f_path.dentry->d_name.name, 2134 (char*)file->f_path.dentry->d_name.name,
2133 (void*)page, count); 2135 (void*)page, count);
2136 mutex_unlock(&task->cred_guard_mutex);
2134out_free: 2137out_free:
2135 free_page((unsigned long) page); 2138 free_page((unsigned long) page);
2136out: 2139out:
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f6db9618a888..753ca37002c8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,3 +92,28 @@ struct pde_opener {
92 struct list_head lh; 92 struct list_head lh;
93}; 93};
94void pde_users_dec(struct proc_dir_entry *pde); 94void pde_users_dec(struct proc_dir_entry *pde);
95
96extern spinlock_t proc_subdir_lock;
97
98struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
99int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
100unsigned long task_vsize(struct mm_struct *);
101int task_statm(struct mm_struct *, int *, int *, int *, int *);
102void task_mem(struct seq_file *, struct mm_struct *);
103
104struct proc_dir_entry *de_get(struct proc_dir_entry *de);
105void de_put(struct proc_dir_entry *de);
106
107extern struct vfsmount *proc_mnt;
108int proc_fill_super(struct super_block *);
109struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
110
111/*
112 * These are generic /proc routines that use the internal
113 * "struct proc_dir_entry" tree to traverse the filesystem.
114 *
115 * The /proc root directory has extended versions to take care
116 * of the /proc/<pid> subdirectories.
117 */
118int proc_readdir(struct file *, void *, filldir_t);
119struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99ee..1afa4dd4cae2 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
12 12
13static int loadavg_proc_show(struct seq_file *m, void *v) 13static int loadavg_proc_show(struct seq_file *m, void *v)
14{ 14{
15 int a, b, c; 15 unsigned long avnrun[3];
16 unsigned long seq;
17 16
18 do { 17 get_avenrun(avnrun, FIXED_1/200, 0);
19 seq = read_seqbegin(&xtime_lock);
20 a = avenrun[0] + (FIXED_1/200);
21 b = avenrun[1] + (FIXED_1/200);
22 c = avenrun[2] + (FIXED_1/200);
23 } while (read_seqretry(&xtime_lock, seq));
24 18
25 seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", 19 seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
26 LOAD_INT(a), LOAD_FRAC(a), 20 LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
27 LOAD_INT(b), LOAD_FRAC(b), 21 LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
28 LOAD_INT(c), LOAD_FRAC(c), 22 LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
29 nr_running(), nr_threads, 23 nr_running(), nr_threads,
30 task_active_pid_ns(current)->last_pid); 24 task_active_pid_ns(current)->last_pid);
31 return 0; 25 return 0;
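
The avenrun samples are 11-bit fixed point (FIXED_1 == 1 << 11 == 2048), and get_avenrun() is passed FIXED_1/200, half of one displayed hundredth, as a rounding offset. A standalone check of the arithmetic:

    #include <stdio.h>

    #define FSHIFT   11
    #define FIXED_1  (1UL << FSHIFT)
    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    int main(void)
    {
            unsigned long avnrun = 4915;    /* about 2.3999 in fixed point */

            avnrun += FIXED_1 / 200;        /* the offset get_avenrun() applies */
            printf("%lu.%02lu\n", LOAD_INT(avnrun), LOAD_FRAC(avnrun)); /* 2.40 */
            return 0;
    }
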
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c6b0302af4c4..d5c410d47fae 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
64 "Inactive(anon): %8lu kB\n" 64 "Inactive(anon): %8lu kB\n"
65 "Active(file): %8lu kB\n" 65 "Active(file): %8lu kB\n"
66 "Inactive(file): %8lu kB\n" 66 "Inactive(file): %8lu kB\n"
67#ifdef CONFIG_UNEVICTABLE_LRU
68 "Unevictable: %8lu kB\n" 67 "Unevictable: %8lu kB\n"
69 "Mlocked: %8lu kB\n" 68 "Mlocked: %8lu kB\n"
70#endif
71#ifdef CONFIG_HIGHMEM 69#ifdef CONFIG_HIGHMEM
72 "HighTotal: %8lu kB\n" 70 "HighTotal: %8lu kB\n"
73 "HighFree: %8lu kB\n" 71 "HighFree: %8lu kB\n"
@@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
109 K(pages[LRU_INACTIVE_ANON]), 107 K(pages[LRU_INACTIVE_ANON]),
110 K(pages[LRU_ACTIVE_FILE]), 108 K(pages[LRU_ACTIVE_FILE]),
111 K(pages[LRU_INACTIVE_FILE]), 109 K(pages[LRU_INACTIVE_FILE]),
112#ifdef CONFIG_UNEVICTABLE_LRU
113 K(pages[LRU_UNEVICTABLE]), 110 K(pages[LRU_UNEVICTABLE]),
114 K(global_page_state(NR_MLOCK)), 111 K(global_page_state(NR_MLOCK)),
115#endif
116#ifdef CONFIG_HIGHMEM 112#ifdef CONFIG_HIGHMEM
117 K(i.totalhigh), 113 K(i.totalhigh),
118 K(i.freehigh), 114 K(i.freehigh),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e9983837d08d..2707c6c7a20f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -6,11 +6,13 @@
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/proc_fs.h> 7#include <linux/proc_fs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/hugetlb.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10#include "internal.h" 11#include "internal.h"
11 12
12#define KPMSIZE sizeof(u64) 13#define KPMSIZE sizeof(u64)
13#define KPMMASK (KPMSIZE - 1) 14#define KPMMASK (KPMSIZE - 1)
15
14/* /proc/kpagecount - an array exposing page counts 16/* /proc/kpagecount - an array exposing page counts
15 * 17 *
16 * Each entry is a u64 representing the corresponding 18 * Each entry is a u64 representing the corresponding
@@ -32,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
32 return -EINVAL; 34 return -EINVAL;
33 35
34 while (count > 0) { 36 while (count > 0) {
35 ppage = NULL;
36 if (pfn_valid(pfn)) 37 if (pfn_valid(pfn))
37 ppage = pfn_to_page(pfn); 38 ppage = pfn_to_page(pfn);
38 pfn++; 39 else
40 ppage = NULL;
39 if (!ppage) 41 if (!ppage)
40 pcount = 0; 42 pcount = 0;
41 else 43 else
42 pcount = page_mapcount(ppage); 44 pcount = page_mapcount(ppage);
43 45
44 if (put_user(pcount, out++)) { 46 if (put_user(pcount, out)) {
45 ret = -EFAULT; 47 ret = -EFAULT;
46 break; 48 break;
47 } 49 }
48 50
51 pfn++;
52 out++;
49 count -= KPMSIZE; 53 count -= KPMSIZE;
50 } 54 }
51 55
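
The reordering above is behavioral, not cosmetic: previously pfn and out advanced even when put_user() faulted, so the position accounting done after the loop (outside the quoted context) could over-report. A minimal model of the restored invariant, with emit() as a hypothetical stand-in for put_user():

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* hypothetical stand-in for put_user(): "faults" on record 3 */
static int emit(uint64_t v, size_t i)
{
    (void)v;
    return i == 3 ? -1 : 0;
}

int main(void)
{
    uint64_t records[8] = { 0 };
    size_t pos = 0;

    while (pos < 8) {
        if (emit(records[pos], pos))
            break;              /* the failed record is not counted */
        pos++;                  /* advance only after success */
    }
    printf("delivered %zu records\n", pos);     /* prints 3 */
    return 0;
}
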
@@ -68,19 +72,122 @@ static const struct file_operations proc_kpagecount_operations = {
68 72
69/* These macros are used to decouple internal flags from exported ones */ 73/* These macros are used to decouple internal flags from exported ones */
70 74
71#define KPF_LOCKED 0 75#define KPF_LOCKED 0
72#define KPF_ERROR 1 76#define KPF_ERROR 1
73#define KPF_REFERENCED 2 77#define KPF_REFERENCED 2
74#define KPF_UPTODATE 3 78#define KPF_UPTODATE 3
75#define KPF_DIRTY 4 79#define KPF_DIRTY 4
76#define KPF_LRU 5 80#define KPF_LRU 5
77#define KPF_ACTIVE 6 81#define KPF_ACTIVE 6
78#define KPF_SLAB 7 82#define KPF_SLAB 7
79#define KPF_WRITEBACK 8 83#define KPF_WRITEBACK 8
80#define KPF_RECLAIM 9 84#define KPF_RECLAIM 9
81#define KPF_BUDDY 10 85#define KPF_BUDDY 10
86
87/* 11-20: new additions in 2.6.31 */
88#define KPF_MMAP 11
89#define KPF_ANON 12
90#define KPF_SWAPCACHE 13
91#define KPF_SWAPBACKED 14
92#define KPF_COMPOUND_HEAD 15
93#define KPF_COMPOUND_TAIL 16
94#define KPF_HUGE 17
95#define KPF_UNEVICTABLE 18
96#define KPF_NOPAGE 20
97
 98/* kernel hacking assistance
99 * WARNING: subject to change, never rely on them!
100 */
101#define KPF_RESERVED 32
102#define KPF_MLOCKED 33
103#define KPF_MAPPEDTODISK 34
104#define KPF_PRIVATE 35
105#define KPF_PRIVATE_2 36
106#define KPF_OWNER_PRIVATE 37
107#define KPF_ARCH 38
108#define KPF_UNCACHED 39
109
110static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
111{
112 return ((kflags >> kbit) & 1) << ubit;
113}
82 114
83#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) 115static u64 get_uflags(struct page *page)
116{
117 u64 k;
118 u64 u;
119
120 /*
121 * pseudo flag: KPF_NOPAGE
122 * it differentiates a memory hole from a page with no flags
123 */
124 if (!page)
125 return 1 << KPF_NOPAGE;
126
127 k = page->flags;
128 u = 0;
129
130 /*
131 * pseudo flags for the well known (anonymous) memory mapped pages
132 *
133 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
134 * simple test in page_mapped() is not enough.
135 */
136 if (!PageSlab(page) && page_mapped(page))
137 u |= 1 << KPF_MMAP;
138 if (PageAnon(page))
139 u |= 1 << KPF_ANON;
140
141 /*
142 * compound pages: export both head/tail info
143 * they together define a compound page's start/end pos and order
144 */
145 if (PageHead(page))
146 u |= 1 << KPF_COMPOUND_HEAD;
147 if (PageTail(page))
148 u |= 1 << KPF_COMPOUND_TAIL;
149 if (PageHuge(page))
150 u |= 1 << KPF_HUGE;
151
152 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
153
154 /*
155 * Caveats on high order pages:
156 * PG_buddy will only be set on the head page; SLUB/SLQB do the same
157 * for PG_slab; SLOB won't set PG_slab at all on compound pages.
158 */
159 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
160 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
161
162 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
163 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
164 u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
165 u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
166
167 u |= kpf_copy_bit(k, KPF_LRU, PG_lru);
168 u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced);
169 u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
170 u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
171
172 u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache);
173 u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
174
175 u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
176 u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
177
178#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
179 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
180#endif
181
182 u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
183 u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
184 u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
185 u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
186 u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
187 u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
188
189 return u;
190};
84 191
85static ssize_t kpageflags_read(struct file *file, char __user *buf, 192static ssize_t kpageflags_read(struct file *file, char __user *buf,
86 size_t count, loff_t *ppos) 193 size_t count, loff_t *ppos)
@@ -90,7 +197,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
90 unsigned long src = *ppos; 197 unsigned long src = *ppos;
91 unsigned long pfn; 198 unsigned long pfn;
92 ssize_t ret = 0; 199 ssize_t ret = 0;
93 u64 kflags, uflags;
94 200
95 pfn = src / KPMSIZE; 201 pfn = src / KPMSIZE;
96 count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); 202 count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
@@ -98,32 +204,18 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
98 return -EINVAL; 204 return -EINVAL;
99 205
100 while (count > 0) { 206 while (count > 0) {
101 ppage = NULL;
102 if (pfn_valid(pfn)) 207 if (pfn_valid(pfn))
103 ppage = pfn_to_page(pfn); 208 ppage = pfn_to_page(pfn);
104 pfn++;
105 if (!ppage)
106 kflags = 0;
107 else 209 else
108 kflags = ppage->flags; 210 ppage = NULL;
109 211
110 uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | 212 if (put_user(get_uflags(ppage), out)) {
111 kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
112 kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
113 kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
114 kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
115 kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
116 kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
117 kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
118 kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
119 kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
120 kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
121
122 if (put_user(uflags, out++)) {
123 ret = -EFAULT; 213 ret = -EFAULT;
124 break; 214 break;
125 } 215 }
126 216
217 pfn++;
218 out++;
127 count -= KPMSIZE; 219 count -= KPMSIZE;
128 } 220 }
129 221
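
kpageflags_read() now funnels everything through get_uflags(), including the KPF_NOPAGE pseudo flag for memory holes. A userspace sketch that decodes a few of the exported bits, assuming the KPF_* values listed above and the one-u64-per-pfn layout of /proc/kpageflags (needs root; pfn 0 is an arbitrary example):

#include <stdio.h>
#include <stdint.h>

#define KPF_LOCKED  0
#define KPF_LRU     5
#define KPF_SLAB    7
#define KPF_BUDDY   10
#define KPF_NOPAGE  20

int main(void)
{
    FILE *f = fopen("/proc/kpageflags", "rb");
    uint64_t flags;
    unsigned long pfn = 0;      /* arbitrary example pfn */

    if (!f) {
        perror("/proc/kpageflags");
        return 1;
    }
    /* one u64 of flags per pfn, so seek to pfn * 8 */
    if (fseek(f, pfn * sizeof(flags), SEEK_SET) ||
        fread(&flags, sizeof(flags), 1, f) != 1) {
        perror("read");
        fclose(f);
        return 1;
    }
    printf("pfn %lu: locked=%d lru=%d slab=%d buddy=%d nopage=%d\n", pfn,
           (int)((flags >> KPF_LOCKED) & 1), (int)((flags >> KPF_LRU) & 1),
           (int)((flags >> KPF_SLAB) & 1), (int)((flags >> KPF_BUDDY) & 1),
           (int)((flags >> KPF_NOPAGE) & 1));
    fclose(f);
    return 0;
}
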
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de2bba5a3440..7ba79a54948c 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <asm/prom.h> 12#include <asm/prom.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include "internal.h"
14 15
15#ifndef HAVE_ARCH_DEVTREE_FIXUPS 16#ifndef HAVE_ARCH_DEVTREE_FIXUPS
16static inline void set_node_proc_entry(struct device_node *np, 17static inline void set_node_proc_entry(struct device_node *np,
@@ -194,20 +195,20 @@ void proc_device_tree_add_node(struct device_node *np,
194 p = fixup_name(np, de, p); 195 p = fixup_name(np, de, p);
195 196
196 ent = proc_mkdir(p, de); 197 ent = proc_mkdir(p, de);
197 if (ent == 0) 198 if (ent == NULL)
198 break; 199 break;
199 proc_device_tree_add_node(child, ent); 200 proc_device_tree_add_node(child, ent);
200 } 201 }
201 of_node_put(child); 202 of_node_put(child);
202 203
203 for (pp = np->properties; pp != 0; pp = pp->next) { 204 for (pp = np->properties; pp != NULL; pp = pp->next) {
204 p = pp->name; 205 p = pp->name;
205 206
206 if (duplicate_name(de, p)) 207 if (duplicate_name(de, p))
207 p = fixup_name(np, de, p); 208 p = fixup_name(np, de, p);
208 209
209 ent = __proc_device_tree_add_prop(de, pp, p); 210 ent = __proc_device_tree_add_prop(de, pp, p);
210 if (ent == 0) 211 if (ent == NULL)
211 break; 212 break;
212 } 213 }
213} 214}
@@ -220,10 +221,10 @@ void __init proc_device_tree_init(void)
220 struct device_node *root; 221 struct device_node *root;
221 222
222 proc_device_tree = proc_mkdir("device-tree", NULL); 223 proc_device_tree = proc_mkdir("device-tree", NULL);
223 if (proc_device_tree == 0) 224 if (proc_device_tree == NULL)
224 return; 225 return;
225 root = of_find_node_by_path("/"); 226 root = of_find_node_by_path("/");
226 if (root == 0) { 227 if (root == NULL) {
227 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 228 printk(KERN_ERR "/proc/device-tree: can't find root\n");
228 return; 229 return;
229 } 230 }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
new file mode 100644
index 000000000000..1807c2419f17
--- /dev/null
+++ b/fs/proc/softirqs.c
@@ -0,0 +1,44 @@
1#include <linux/init.h>
2#include <linux/kernel_stat.h>
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5
6/*
 7 * /proc/softirqs ... display the number of softirqs serviced on each CPU
8 */
9static int show_softirqs(struct seq_file *p, void *v)
10{
11 int i, j;
12
13 seq_printf(p, " ");
14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n");
17
18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%8s:", softirq_to_name[i]);
20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n");
23 }
24 return 0;
25}
26
27static int softirqs_open(struct inode *inode, struct file *file)
28{
29 return single_open(file, show_softirqs, NULL);
30}
31
32static const struct file_operations proc_softirqs_operations = {
33 .open = softirqs_open,
34 .read = seq_read,
35 .llseek = seq_lseek,
36 .release = single_release,
37};
38
39static int __init proc_softirqs_init(void)
40{
41 proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
42 return 0;
43}
44module_init(proc_softirqs_init);
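
A quick way to exercise the new file is to sum the per-CPU columns back into per-softirq totals. This reader is a sketch that assumes the exact format show_softirqs() emits above (a header row of CPUn labels, then one "NAME: counts..." row per softirq):

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/softirqs", "r");
    char name[16];
    unsigned long long n, total;
    int c;

    if (!f) {
        perror("/proc/softirqs");
        return 1;
    }
    while ((c = fgetc(f)) != '\n' && c != EOF)  /* skip the CPU header */
        ;
    while (fscanf(f, " %15[^:]:", name) == 1) {
        total = 0;
        while (fscanf(f, " %llu", &n) == 1)     /* sum this row's columns */
            total += n;
        printf("%8s: %llu\n", name, total);
    }
    fclose(f);
    return 0;
}
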
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81e4eb60972e..7cc726c6d70a 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -29,6 +29,8 @@ static int show_stat(struct seq_file *p, void *v)
29 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 29 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
30 cputime64_t guest; 30 cputime64_t guest;
31 u64 sum = 0; 31 u64 sum = 0;
32 u64 sum_softirq = 0;
33 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
32 struct timespec boottime; 34 struct timespec boottime;
33 unsigned int per_irq_sum; 35 unsigned int per_irq_sum;
34 36
@@ -53,6 +55,13 @@ static int show_stat(struct seq_file *p, void *v)
53 sum += kstat_irqs_cpu(j, i); 55 sum += kstat_irqs_cpu(j, i);
54 } 56 }
55 sum += arch_irq_stat_cpu(i); 57 sum += arch_irq_stat_cpu(i);
58
59 for (j = 0; j < NR_SOFTIRQS; j++) {
60 unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
61
62 per_softirq_sums[j] += softirq_stat;
63 sum_softirq += softirq_stat;
64 }
56 } 65 }
57 sum += arch_irq_stat(); 66 sum += arch_irq_stat();
58 67
@@ -115,6 +124,12 @@ static int show_stat(struct seq_file *p, void *v)
115 nr_running(), 124 nr_running(),
116 nr_iowait()); 125 nr_iowait());
117 126
127 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
128
129 for (i = 0; i < NR_SOFTIRQS; i++)
130 seq_printf(p, " %u", per_softirq_sums[i]);
131 seq_printf(p, "\n");
132
118 return 0; 133 return 0;
119} 134}
120 135
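
The new line appended to /proc/stat has the shape "softirq <total> <per-softirq counts>". A minimal reader for the total, assuming that format:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/stat", "r");
    char line[512];
    unsigned long long total;

    if (!f) {
        perror("/proc/stat");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "softirq %llu", &total) == 1) {
            printf("softirq total: %llu\n", total); /* sum_softirq above */
            break;
        }
    }
    fclose(f);
    return 0;
}
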
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
119 mm = mm_for_maps(priv->task); 119 mm = mm_for_maps(priv->task);
120 if (!mm) 120 if (!mm)
121 return NULL; 121 return NULL;
122 down_read(&mm->mmap_sem);
122 123
123 tail_vma = get_gate_vma(priv->task); 124 tail_vma = get_gate_vma(priv->task);
124 priv->tail_vma = tail_vma; 125 priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
189 priv->task = NULL; 189 priv->task = NULL;
190 return NULL; 190 return NULL;
191 } 191 }
192 down_read(&mm->mmap_sem);
192 193
193 /* start from the Nth VMA */ 194 /* start from the Nth VMA */
194 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) 195 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
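
Both m_start() hunks take mmap_sem for reading before the VMA walk begins; the matching up_read() lives in m_stop(), outside the quoted context. A userspace analogue of the reader side, using a pthread rwlock and a hypothetical walk_maps():

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t maps_lock = PTHREAD_RWLOCK_INITIALIZER;

/* hypothetical walk: visit each entry under the read lock */
static void walk_maps(void (*visit)(int), const int *items, int n)
{
    int i;

    pthread_rwlock_rdlock(&maps_lock); /* like down_read(&mm->mmap_sem) */
    for (i = 0; i < n; i++)
        visit(items[i]);
    pthread_rwlock_unlock(&maps_lock); /* like up_read() in m_stop() */
}

static void print_one(int v)
{
    printf("%d\n", v);
}

int main(void)
{
    int items[3] = { 1, 2, 3 };

    walk_maps(print_one, items, 3);
    return 0;
}
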
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 5edcc3f92ba7..0872afa58d39 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -166,12 +166,7 @@ static const struct file_operations proc_vmcore_operations = {
166 166
167static struct vmcore* __init get_new_element(void) 167static struct vmcore* __init get_new_element(void)
168{ 168{
169 struct vmcore *p; 169 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
170
171 p = kmalloc(sizeof(*p), GFP_KERNEL);
172 if (p)
173 memset(p, 0, sizeof(*p));
174 return p;
175} 170}
176 171
177static u64 __init get_vmcore_size_elf64(char *elfptr) 172static u64 __init get_vmcore_size_elf64(char *elfptr)
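
The vmcore hunk is a pure simplification: kzalloc(size, flags) is exactly kmalloc() followed by a zero fill. The same cleanup in userspace terms, with a hypothetical vmcore_like stand-in type:

#include <stdlib.h>
#include <string.h>

struct vmcore_like { unsigned long start, size; }; /* hypothetical stand-in */

struct vmcore_like *element_old(void)   /* kmalloc + memset pattern */
{
    struct vmcore_like *p = malloc(sizeof(*p));

    if (p)
        memset(p, 0, sizeof(*p));
    return p;
}

struct vmcore_like *element_new(void)   /* kzalloc pattern */
{
    return calloc(1, sizeof(struct vmcore_like));
}

int main(void)
{
    free(element_old());
    free(element_new());
    return 0;
}
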
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 502d7fe98bab..e4d408cc5473 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_QNX4FS_FS) += qnx4.o 5obj-$(CONFIG_QNX4FS_FS) += qnx4.o
6 6
7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o 7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 8425cf6e9624..e1cd061a25f7 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,14 +13,9 @@
13 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 13 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
14 */ 14 */
15 15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/qnx4_fs.h>
19#include <linux/stat.h>
20#include <linux/kernel.h>
21#include <linux/string.h>
22#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
23#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include "qnx4.h"
24 19
25#if 0 20#if 0
26int qnx4_new_block(struct super_block *sb) 21int qnx4_new_block(struct super_block *sb)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ea9ffefb48ad..003c68f3238b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,14 +11,9 @@
11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. 11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
12 */ 12 */
13 13
14#include <linux/string.h>
15#include <linux/errno.h>
16#include <linux/fs.h>
17#include <linux/qnx4_fs.h>
18#include <linux/stat.h>
19#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
20#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
21 16#include "qnx4.h"
22 17
23static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) 18static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
24{ 19{
@@ -84,7 +79,7 @@ const struct file_operations qnx4_dir_operations =
84{ 79{
85 .read = generic_read_dir, 80 .read = generic_read_dir,
86 .readdir = qnx4_readdir, 81 .readdir = qnx4_readdir,
87 .fsync = file_fsync, 82 .fsync = simple_fsync,
88}; 83};
89 84
90const struct inode_operations qnx4_dir_inode_operations = 85const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 867f42b02035..09b170ac936c 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -12,8 +12,7 @@
12 * 27-06-1998 by Frank Denis : file overwriting. 12 * 27-06-1998 by Frank Denis : file overwriting.
13 */ 13 */
14 14
15#include <linux/fs.h> 15#include "qnx4.h"
16#include <linux/qnx4_fs.h>
17 16
18/* 17/*
19 * We have mostly NULL's here: the current defaults are ok for 18 * We have mostly NULL's here: the current defaults are ok for
@@ -29,7 +28,7 @@ const struct file_operations qnx4_file_operations =
29#ifdef CONFIG_QNX4FS_RW 28#ifdef CONFIG_QNX4FS_RW
30 .write = do_sync_write, 29 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 30 .aio_write = generic_file_aio_write,
32 .fsync = qnx4_sync_file, 31 .fsync = simple_fsync,
33#endif 32#endif
34}; 33};
35 34
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
deleted file mode 100644
index aa3b19544bee..000000000000
--- a/fs/qnx4/fsync.c
+++ /dev/null
@@ -1,169 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 24-03-1998 by Richard Frowijn : first release.
11 */
12
13#include <linux/errno.h>
14#include <linux/time.h>
15#include <linux/stat.h>
16#include <linux/fcntl.h>
17#include <linux/smp_lock.h>
18#include <linux/buffer_head.h>
19
20#include <linux/fs.h>
21#include <linux/qnx4_fs.h>
22
23#include <asm/system.h>
24
25/*
26 * The functions for qnx4 fs file synchronization.
27 */
28
29#ifdef CONFIG_QNX4FS_RW
30
31static int sync_block(struct inode *inode, unsigned short *block, int wait)
32{
33 struct buffer_head *bh;
34 unsigned short tmp;
35
36 if (!*block)
37 return 0;
38 tmp = *block;
39 bh = sb_find_get_block(inode->i_sb, *block);
40 if (!bh)
41 return 0;
42 if (*block != tmp) {
43 brelse(bh);
44 return 1;
45 }
46 if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
47 brelse(bh);
48 return -1;
49 }
50 if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
51 brelse(bh);
52 return 0;
53 }
54 ll_rw_block(WRITE, 1, &bh);
55 atomic_dec(&bh->b_count);
56 return 0;
57}
58
59#ifdef WTF
60static int sync_iblock(struct inode *inode, unsigned short *iblock,
61 struct buffer_head **bh, int wait)
62{
63 int rc;
64 unsigned short tmp;
65
66 *bh = NULL;
67 tmp = *iblock;
68 if (!tmp)
69 return 0;
70 rc = sync_block(inode, iblock, wait);
71 if (rc)
72 return rc;
73 *bh = sb_bread(inode->i_sb, tmp);
74 if (tmp != *iblock) {
75 brelse(*bh);
76 *bh = NULL;
77 return 1;
78 }
79 if (!*bh)
80 return -1;
81 return 0;
82}
83#endif
84
85static int sync_direct(struct inode *inode, int wait)
86{
87 int i;
88 int rc, err = 0;
89
90 for (i = 0; i < 7; i++) {
91 rc = sync_block(inode,
92 (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait);
93 if (rc > 0)
94 break;
95 if (rc)
96 err = rc;
97 }
98 return err;
99}
100
101#ifdef WTF
102static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait)
103{
104 int i;
105 struct buffer_head *ind_bh;
106 int rc, err = 0;
107
108 rc = sync_iblock(inode, iblock, &ind_bh, wait);
109 if (rc || !ind_bh)
110 return rc;
111
112 for (i = 0; i < 512; i++) {
113 rc = sync_block(inode,
114 ((unsigned short *) ind_bh->b_data) + i,
115 wait);
116 if (rc > 0)
117 break;
118 if (rc)
119 err = rc;
120 }
121 brelse(ind_bh);
122 return err;
123}
124
125static int sync_dindirect(struct inode *inode, unsigned short *diblock,
126 int wait)
127{
128 int i;
129 struct buffer_head *dind_bh;
130 int rc, err = 0;
131
132 rc = sync_iblock(inode, diblock, &dind_bh, wait);
133 if (rc || !dind_bh)
134 return rc;
135
136 for (i = 0; i < 512; i++) {
137 rc = sync_indirect(inode,
138 ((unsigned short *) dind_bh->b_data) + i,
139 wait);
140 if (rc > 0)
141 break;
142 if (rc)
143 err = rc;
144 }
145 brelse(dind_bh);
146 return err;
147}
148#endif
149
150int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused)
151{
152 struct inode *inode = dentry->d_inode;
153 int wait, err = 0;
154
155 (void) file;
156 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
157 S_ISLNK(inode->i_mode)))
158 return -EINVAL;
159
160 lock_kernel();
161 for (wait = 0; wait <= 1; wait++) {
162 err |= sync_direct(inode, wait);
163 }
164 err |= qnx4_sync_inode(inode);
165 unlock_kernel();
166 return (err < 0) ? -EIO : 0;
167}
168
169#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fe1f0f31d11c..681df5fcd161 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -13,19 +13,15 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/slab.h>
20#include <linux/fs.h>
21#include <linux/qnx4_fs.h>
22#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h>
23#include <linux/highuid.h> 18#include <linux/highuid.h>
24#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
25#include <linux/pagemap.h> 20#include <linux/pagemap.h>
26#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
27#include <linux/vfs.h> 22#include <linux/writeback.h>
28#include <asm/uaccess.h> 23#include <linux/statfs.h>
24#include "qnx4.h"
29 25
30#define QNX4_VERSION 4 26#define QNX4_VERSION 4
31#define QNX4_BMNAME ".bitmap" 27#define QNX4_BMNAME ".bitmap"
@@ -34,31 +30,6 @@ static const struct super_operations qnx4_sops;
34 30
35#ifdef CONFIG_QNX4FS_RW 31#ifdef CONFIG_QNX4FS_RW
36 32
37int qnx4_sync_inode(struct inode *inode)
38{
39 int err = 0;
40# if 0
41 struct buffer_head *bh;
42
43 bh = qnx4_update_inode(inode);
44 if (bh && buffer_dirty(bh))
45 {
46 sync_dirty_buffer(bh);
47 if (buffer_req(bh) && !buffer_uptodate(bh))
48 {
49 printk ("IO error syncing qnx4 inode [%s:%08lx]\n",
50 inode->i_sb->s_id, inode->i_ino);
51 err = -1;
52 }
53 brelse (bh);
54 } else if (!bh) {
55 err = -1;
56 }
57# endif
58
59 return err;
60}
61
62static void qnx4_delete_inode(struct inode *inode) 33static void qnx4_delete_inode(struct inode *inode)
63{ 34{
64 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); 35 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
@@ -70,15 +41,7 @@ static void qnx4_delete_inode(struct inode *inode)
70 unlock_kernel(); 41 unlock_kernel();
71} 42}
72 43
73static void qnx4_write_super(struct super_block *sb) 44static int qnx4_write_inode(struct inode *inode, int do_sync)
74{
75 lock_kernel();
76 QNX4DEBUG(("qnx4: write_super\n"));
77 sb->s_dirt = 0;
78 unlock_kernel();
79}
80
81static int qnx4_write_inode(struct inode *inode, int unused)
82{ 45{
83 struct qnx4_inode_entry *raw_inode; 46 struct qnx4_inode_entry *raw_inode;
84 int block, ino; 47 int block, ino;
@@ -115,6 +78,16 @@ static int qnx4_write_inode(struct inode *inode, int unused)
115 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 78 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
116 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); 79 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
117 mark_buffer_dirty(bh); 80 mark_buffer_dirty(bh);
81 if (do_sync) {
82 sync_dirty_buffer(bh);
83 if (buffer_req(bh) && !buffer_uptodate(bh)) {
84 printk("qnx4: IO error syncing inode [%s:%08x]\n",
85 inode->i_sb->s_id, ino);
86 brelse(bh);
87 unlock_kernel();
88 return -EIO;
89 }
90 }
118 brelse(bh); 91 brelse(bh);
119 unlock_kernel(); 92 unlock_kernel();
120 return 0; 93 return 0;
@@ -138,7 +111,6 @@ static const struct super_operations qnx4_sops =
138#ifdef CONFIG_QNX4FS_RW 111#ifdef CONFIG_QNX4FS_RW
139 .write_inode = qnx4_write_inode, 112 .write_inode = qnx4_write_inode,
140 .delete_inode = qnx4_delete_inode, 113 .delete_inode = qnx4_delete_inode,
141 .write_super = qnx4_write_super,
142#endif 114#endif
143}; 115};
144 116
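
qnx4_write_inode() now honors its do_sync argument: the buffer is always dirtied, but only the synchronous path pays for sync_dirty_buffer() and surfaces -EIO. A userspace analogue of that shape, with write_record() as a hypothetical helper (write always buffered, flush only when asked):

#include <unistd.h>
#include <errno.h>

int write_record(int fd, const void *buf, size_t len, int do_sync)
{
    if (write(fd, buf, len) != (ssize_t)len)
        return -EIO;
    /* only the synchronous caller pays for the flush and sees I/O errors */
    if (do_sync && fsync(fd))
        return -EIO;
    return 0;
}

int main(void)
{
    /* async variant on stdout; pass 1 for the flush-on-write behavior */
    return write_record(1, "buffered\n", 9, 0) ? 1 : 0;
}
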
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 775eed3a4085..5972ed214937 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,16 +12,9 @@
12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
13 */ 13 */
14 14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/qnx4_fs.h>
18#include <linux/kernel.h>
19#include <linux/string.h>
20#include <linux/stat.h>
21#include <linux/fcntl.h>
22#include <linux/errno.h>
23#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
24#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include "qnx4.h"
25 18
26 19
27/* 20/*
@@ -187,7 +180,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
187 de->di_status = 0; 180 de->di_status = 0;
188 memset(de->di_fname, 0, sizeof de->di_fname); 181 memset(de->di_fname, 0, sizeof de->di_fname);
189 de->di_mode = 0; 182 de->di_mode = 0;
190 mark_buffer_dirty(bh); 183 mark_buffer_dirty_inode(bh, dir);
191 clear_nlink(inode); 184 clear_nlink(inode);
192 mark_inode_dirty(inode); 185 mark_inode_dirty(inode);
193 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 186 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -232,7 +225,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
232 de->di_status = 0; 225 de->di_status = 0;
233 memset(de->di_fname, 0, sizeof de->di_fname); 226 memset(de->di_fname, 0, sizeof de->di_fname);
234 de->di_mode = 0; 227 de->di_mode = 0;
235 mark_buffer_dirty(bh); 228 mark_buffer_dirty_inode(bh, dir);
236 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 229 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
237 mark_inode_dirty(dir); 230 mark_inode_dirty(dir);
238 inode->i_ctime = dir->i_ctime; 231 inode->i_ctime = dir->i_ctime;
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
new file mode 100644
index 000000000000..9efc089454f6
--- /dev/null
+++ b/fs/qnx4/qnx4.h
@@ -0,0 +1,57 @@
1#include <linux/fs.h>
2#include <linux/qnx4_fs.h>
3
4#define QNX4_DEBUG 0
5
6#if QNX4_DEBUG
7#define QNX4DEBUG(X) printk X
8#else
9#define QNX4DEBUG(X) (void) 0
10#endif
11
12struct qnx4_sb_info {
13 struct buffer_head *sb_buf; /* superblock buffer */
14 struct qnx4_super_block *sb; /* our superblock */
15 unsigned int Version; /* may be useful */
16 struct qnx4_inode_entry *BitMap; /* useful */
17};
18
19struct qnx4_inode_info {
20 struct qnx4_inode_entry raw;
21 loff_t mmu_private;
22 struct inode vfs_inode;
23};
24
25extern struct inode *qnx4_iget(struct super_block *, unsigned long);
26extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
27extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
28extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
29
30extern struct buffer_head *qnx4_bread(struct inode *, int, int);
31
32extern const struct inode_operations qnx4_file_inode_operations;
33extern const struct inode_operations qnx4_dir_inode_operations;
34extern const struct file_operations qnx4_file_operations;
35extern const struct file_operations qnx4_dir_operations;
36extern int qnx4_is_free(struct super_block *sb, long block);
37extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
38extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
39extern void qnx4_truncate(struct inode *inode);
40extern void qnx4_free_inode(struct inode *inode);
41extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
42extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
43
44static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
45{
46 return sb->s_fs_info;
47}
48
49static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
50{
51 return container_of(inode, struct qnx4_inode_info, vfs_inode);
52}
53
54static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
55{
56 return &qnx4_i(inode)->raw;
57}
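
qnx4_i() above is the standard container_of() idiom: given a pointer to the embedded vfs_inode, recover the enclosing qnx4_inode_info. A self-contained userspace demo of the same idiom, with a made-up struct outer:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct outer { int a; int inner; };

int main(void)
{
    struct outer o = { 1, 2 };
    int *ip = &o.inner;
    struct outer *back = container_of(ip, struct outer, inner);

    printf("a=%d (round trip %s)\n", back->a,
           back == &o ? "ok" : "broken");
    return 0;
}
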
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 6437c1c3d1dd..d94d9ee241fe 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,12 +10,8 @@
10 * 30-06-1998 by Frank DENIS : ugly filler. 10 * 30-06-1998 by Frank DENIS : ugly filler.
11 */ 11 */
12 12
13#include <linux/types.h>
14#include <linux/errno.h>
15#include <linux/fs.h>
16#include <linux/qnx4_fs.h>
17#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
18#include <asm/uaccess.h> 14#include "qnx4.h"
19 15
20#ifdef CONFIG_QNX4FS_RW 16#ifdef CONFIG_QNX4FS_RW
21 17
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 607c579e5eca..38f7bd559f35 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2042,7 +2042,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2042 * changes */ 2042 * changes */
2043 invalidate_bdev(sb->s_bdev); 2043 invalidate_bdev(sb->s_bdev);
2044 } 2044 }
2045 mutex_lock(&inode->i_mutex);
2046 mutex_lock(&dqopt->dqonoff_mutex); 2045 mutex_lock(&dqopt->dqonoff_mutex);
2047 if (sb_has_quota_loaded(sb, type)) { 2046 if (sb_has_quota_loaded(sb, type)) {
2048 error = -EBUSY; 2047 error = -EBUSY;
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2054 * possible) Also nobody should write to the file - we use 2053 * possible) Also nobody should write to the file - we use
2055 * special IO operations which ignore the immutable bit. */ 2054 * special IO operations which ignore the immutable bit. */
2056 down_write(&dqopt->dqptr_sem); 2055 down_write(&dqopt->dqptr_sem);
2056 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | 2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2058 S_NOQUOTA); 2058 S_NOQUOTA);
2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
2060 mutex_unlock(&inode->i_mutex);
2060 up_write(&dqopt->dqptr_sem); 2061 up_write(&dqopt->dqptr_sem);
2061 sb->dq_op->drop(inode); 2062 sb->dq_op->drop(inode);
2062 } 2063 }
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2080 goto out_file_init; 2081 goto out_file_init;
2081 } 2082 }
2082 mutex_unlock(&dqopt->dqio_mutex); 2083 mutex_unlock(&dqopt->dqio_mutex);
2083 mutex_unlock(&inode->i_mutex);
2084 spin_lock(&dq_state_lock); 2084 spin_lock(&dq_state_lock);
2085 dqopt->flags |= dquot_state_flag(flags, type); 2085 dqopt->flags |= dquot_state_flag(flags, type);
2086 spin_unlock(&dq_state_lock); 2086 spin_unlock(&dq_state_lock);
@@ -2094,16 +2094,17 @@ out_file_init:
2094 dqopt->files[type] = NULL; 2094 dqopt->files[type] = NULL;
2095 iput(inode); 2095 iput(inode);
2096out_lock: 2096out_lock:
2097 mutex_unlock(&dqopt->dqonoff_mutex);
2098 if (oldflags != -1) { 2097 if (oldflags != -1) {
2099 down_write(&dqopt->dqptr_sem); 2098 down_write(&dqopt->dqptr_sem);
2099 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2100 /* Set the flags back (in the case of accidental quotaon() 2100 /* Set the flags back (in the case of accidental quotaon()
2101 * on a wrong file we don't want to mess up the flags) */ 2101 * on a wrong file we don't want to mess up the flags) */
2102 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); 2102 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
2103 inode->i_flags |= oldflags; 2103 inode->i_flags |= oldflags;
2104 mutex_unlock(&inode->i_mutex);
2104 up_write(&dqopt->dqptr_sem); 2105 up_write(&dqopt->dqptr_sem);
2105 } 2106 }
2106 mutex_unlock(&inode->i_mutex); 2107 mutex_unlock(&dqopt->dqonoff_mutex);
2107out_fmt: 2108out_fmt:
2108 put_quota_format(fmt); 2109 put_quota_format(fmt);
2109 2110
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b7f5a468f076..95c5b42384b2 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
159 return error; 159 return error;
160} 160}
161 161
162static void quota_sync_sb(struct super_block *sb, int type) 162#ifdef CONFIG_QUOTA
163void sync_quota_sb(struct super_block *sb, int type)
163{ 164{
164 int cnt; 165 int cnt;
165 166
167 if (!sb->s_qcop->quota_sync)
168 return;
169
166 sb->s_qcop->quota_sync(sb, type); 170 sb->s_qcop->quota_sync(sb, type);
167 171
168 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 172 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type)
191 } 195 }
192 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 196 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
193} 197}
198#endif
194 199
195void sync_dquots(struct super_block *sb, int type) 200static void sync_dquots(int type)
196{ 201{
202 struct super_block *sb;
197 int cnt; 203 int cnt;
198 204
199 if (sb) {
200 if (sb->s_qcop->quota_sync)
201 quota_sync_sb(sb, type);
202 return;
203 }
204
205 spin_lock(&sb_lock); 205 spin_lock(&sb_lock);
206restart: 206restart:
207 list_for_each_entry(sb, &super_blocks, s_list) { 207 list_for_each_entry(sb, &super_blocks, s_list) {
@@ -222,8 +222,8 @@ restart:
222 sb->s_count++; 222 sb->s_count++;
223 spin_unlock(&sb_lock); 223 spin_unlock(&sb_lock);
224 down_read(&sb->s_umount); 224 down_read(&sb->s_umount);
225 if (sb->s_root && sb->s_qcop->quota_sync) 225 if (sb->s_root)
226 quota_sync_sb(sb, type); 226 sync_quota_sb(sb, type);
227 up_read(&sb->s_umount); 227 up_read(&sb->s_umount);
228 spin_lock(&sb_lock); 228 spin_lock(&sb_lock);
229 if (__put_super_and_need_restart(sb)) 229 if (__put_super_and_need_restart(sb))
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
301 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 301 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
302 } 302 }
303 case Q_SYNC: 303 case Q_SYNC:
304 sync_dquots(sb, type); 304 if (sb)
305 sync_quota_sb(sb, type);
306 else
307 sync_dquots(type);
305 return 0; 308 return 0;
306 309
307 case Q_XQUOTAON: 310 case Q_XQUOTAON:
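
The split is visible from userspace through quotactl(2): Q_SYNC with a device argument lands in sync_quota_sb() for that superblock, while a NULL device takes the sync_dquots() walk over all superblocks, assuming the usual sys_quotactl() dispatch. A sketch (may require privilege):

#include <stdio.h>
#include <sys/quota.h>

int main(void)
{
    /* NULL device: sync quota files on all filesystems (sync_dquots path) */
    if (quotactl(QCMD(Q_SYNC, USRQUOTA), NULL, 0, NULL))
        perror("quotactl(Q_SYNC)");
    return 0;
}
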
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ebb2c417912c..11f0c06316de 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -20,6 +20,7 @@
20#include <linux/ramfs.h> 20#include <linux/ramfs.h>
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include "internal.h" 26#include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3a6b193d8444..0ff7566c767c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -202,9 +202,12 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
202 return -EINVAL; 202 return -EINVAL;
203 opts->mode = option & S_IALLUGO; 203 opts->mode = option & S_IALLUGO;
204 break; 204 break;
205 default: 205 /*
206 printk(KERN_ERR "ramfs: bad mount option: %s\n", p); 206 * We might like to report bad mount options here;
207 return -EINVAL; 207 * but traditionally ramfs has ignored all mount options,
208 * and as it is used as a !CONFIG_SHMEM simple substitute
209 * for tmpfs, better continue to ignore other mount options.
210 */
208 } 211 }
209 } 212 }
210 213
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76bb9ee1..6c8c55dec2bc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
805 goto out; 805 goto out;
806 if (!(in_file->f_mode & FMODE_READ)) 806 if (!(in_file->f_mode & FMODE_READ))
807 goto fput_in; 807 goto fput_in;
808 retval = -EINVAL;
809 in_inode = in_file->f_path.dentry->d_inode;
810 if (!in_inode)
811 goto fput_in;
812 if (!in_file->f_op || !in_file->f_op->splice_read)
813 goto fput_in;
814 retval = -ESPIPE; 808 retval = -ESPIPE;
815 if (!ppos) 809 if (!ppos)
816 ppos = &in_file->f_pos; 810 ppos = &in_file->f_pos;
@@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
834 retval = -EINVAL; 828 retval = -EINVAL;
835 if (!out_file->f_op || !out_file->f_op->sendpage) 829 if (!out_file->f_op || !out_file->f_op->sendpage)
836 goto fput_out; 830 goto fput_out;
831 in_inode = in_file->f_path.dentry->d_inode;
837 out_inode = out_file->f_path.dentry->d_inode; 832 out_inode = out_file->f_path.dentry->d_inode;
838 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
839 if (retval < 0) 834 if (retval < 0)
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45ee3d357c70..6d2668fdc384 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
44static inline bool is_privroot_deh(struct dentry *dir, 44static inline bool is_privroot_deh(struct dentry *dir,
45 struct reiserfs_de_head *deh) 45 struct reiserfs_de_head *deh)
46{ 46{
47 int ret = 0;
48#ifdef CONFIG_REISERFS_FS_XATTR
49 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; 47 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
50 ret = (dir == dir->d_parent && privroot->d_inode && 48 if (reiserfs_expose_privroot(dir->d_sb))
51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); 49 return 0;
52#endif 50 return (dir == dir->d_parent && privroot->d_inode &&
53 return ret; 51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
54} 52}
55 53
56int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, 54int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4beb964a2a3e..128d3f7c8aa5 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1270 1270
1271 RFALSE(ih, "PAP-12210: ih must be 0"); 1271 RFALSE(ih, "PAP-12210: ih must be 0");
1272 1272
1273 if (is_direntry_le_ih 1273 aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
1274 (aux_ih = 1274 if (is_direntry_le_ih(aux_ih)) {
1275 B_N_PITEM_HEAD(tbS0, item_pos))) {
1276 /* we append to directory item */ 1275 /* we append to directory item */
1277 1276
1278 int entry_count; 1277 int entry_count;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6fd0f47e45db..a14d6cd9eeda 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1131,8 +1131,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
1131 REISERFS_I(inode)->i_trans_id = 0; 1131 REISERFS_I(inode)->i_trans_id = 0;
1132 REISERFS_I(inode)->i_jl = NULL; 1132 REISERFS_I(inode)->i_jl = NULL;
1133 mutex_init(&(REISERFS_I(inode)->i_mmap)); 1133 mutex_init(&(REISERFS_I(inode)->i_mmap));
1134 reiserfs_init_acl_access(inode);
1135 reiserfs_init_acl_default(inode);
1136 reiserfs_init_xattr_rwsem(inode); 1134 reiserfs_init_xattr_rwsem(inode);
1137 1135
1138 if (stat_data_v1(ih)) { 1136 if (stat_data_v1(ih)) {
@@ -1834,8 +1832,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1834 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1832 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1835 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1833 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1836 mutex_init(&(REISERFS_I(inode)->i_mmap)); 1834 mutex_init(&(REISERFS_I(inode)->i_mmap));
1837 reiserfs_init_acl_access(inode);
1838 reiserfs_init_acl_default(inode);
1839 reiserfs_init_xattr_rwsem(inode); 1835 reiserfs_init_xattr_rwsem(inode);
1840 1836
1841 /* key to search for correct place for new stat data */ 1837 /* key to search for correct place for new stat data */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 77f5bb746bf0..90622200b39c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s)
997 DEFINE_WAIT(wait); 997 DEFINE_WAIT(wait);
998 struct reiserfs_journal *j = SB_JOURNAL(s); 998 struct reiserfs_journal *j = SB_JOURNAL(s);
999 if (atomic_read(&j->j_async_throttle)) 999 if (atomic_read(&j->j_async_throttle))
1000 congestion_wait(WRITE, HZ / 10); 1000 congestion_wait(BLK_RW_ASYNC, HZ / 10);
1001 return 0; 1001 return 0;
1002} 1002}
1003 1003
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 381750a155f6..03d85cbf90bf 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
390 390
391 if (last_first == FIRST_TO_LAST) { 391 if (last_first == FIRST_TO_LAST) {
392 /* if ( if item in position item_num in buffer SOURCE is directory item ) */ 392 /* if ( if item in position item_num in buffer SOURCE is directory item ) */
393 if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) 393 ih = B_N_PITEM_HEAD(src, item_num);
394 if (is_direntry_le_ih(ih))
394 leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 395 leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
395 item_num, 0, cpy_bytes); 396 item_num, 0, cpy_bytes);
396 else { 397 else {
@@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
418 } 419 }
419 } else { 420 } else {
420 /* if ( if item in position item_num in buffer SOURCE is directory item ) */ 421 /* if ( if item in position item_num in buffer SOURCE is directory item ) */
421 if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) 422 ih = B_N_PITEM_HEAD(src, item_num);
423 if (is_direntry_le_ih(ih))
422 leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, 424 leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
423 item_num, 425 item_num,
424 I_ENTRY_COUNT(ih) - cpy_bytes, 426 I_ENTRY_COUNT(ih) - cpy_bytes,
@@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
774 leaf_delete_items_entirely(cur_bi, first + 1, 776 leaf_delete_items_entirely(cur_bi, first + 1,
775 del_num - 1); 777 del_num - 1);
776 778
777 if (is_direntry_le_ih 779 ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
778 (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1))) 780 if (is_direntry_le_ih(ih))
779 /* the last item is directory */ 781 /* the last item is directory */
780 /* len = numbers of directory entries in this item */ 782 /* len = numbers of directory entries in this item */
781 len = ih_entry_count(ih); 783 len = ih_entry_count(ih);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 238e9d9b31e0..18b315d3d104 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -82,7 +82,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
82 if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { 82 if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
83 printk 83 printk
84 ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); 84 ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
85 unlock_super(s);
86 return -ENOMEM; 85 return -ENOMEM;
87 } 86 }
88 /* the new journal bitmaps are zero filled, now we copy in the bitmap 87 /* the new journal bitmaps are zero filled, now we copy in the bitmap
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3567fb9e3fb1..7adea74d6a8a 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -24,10 +24,10 @@
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/quotaops.h> 25#include <linux/quotaops.h>
26#include <linux/vfs.h> 26#include <linux/vfs.h>
27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
29#include <linux/namei.h> 28#include <linux/namei.h>
30#include <linux/crc32.h> 29#include <linux/crc32.h>
30#include <linux/smp_lock.h>
31 31
32struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
33 33
@@ -64,18 +64,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
64 64
65static int reiserfs_sync_fs(struct super_block *s, int wait) 65static int reiserfs_sync_fs(struct super_block *s, int wait)
66{ 66{
67 if (!(s->s_flags & MS_RDONLY)) { 67 struct reiserfs_transaction_handle th;
68 struct reiserfs_transaction_handle th; 68
69 reiserfs_write_lock(s); 69 reiserfs_write_lock(s);
70 if (!journal_begin(&th, s, 1)) 70 if (!journal_begin(&th, s, 1))
71 if (!journal_end_sync(&th, s, 1)) 71 if (!journal_end_sync(&th, s, 1))
72 reiserfs_flush_old_commits(s); 72 reiserfs_flush_old_commits(s);
73 s->s_dirt = 0; /* Even if it's not true. 73 s->s_dirt = 0; /* Even if it's not true.
74 * We'll loop forever in sync_supers otherwise */ 74 * We'll loop forever in sync_supers otherwise */
75 reiserfs_write_unlock(s); 75 reiserfs_write_unlock(s);
76 } else {
77 s->s_dirt = 0;
78 }
79 return 0; 76 return 0;
80} 77}
81 78
@@ -468,6 +465,11 @@ static void reiserfs_put_super(struct super_block *s)
468 struct reiserfs_transaction_handle th; 465 struct reiserfs_transaction_handle th;
469 th.t_trans_id = 0; 466 th.t_trans_id = 0;
470 467
468 lock_kernel();
469
470 if (s->s_dirt)
471 reiserfs_write_super(s);
472
471 /* change file system state to current state if it was mounted with read-write permissions */ 473 /* change file system state to current state if it was mounted with read-write permissions */
472 if (!(s->s_flags & MS_RDONLY)) { 474 if (!(s->s_flags & MS_RDONLY)) {
473 if (!journal_begin(&th, s, 10)) { 475 if (!journal_begin(&th, s, 10)) {
@@ -500,7 +502,7 @@ static void reiserfs_put_super(struct super_block *s)
500 kfree(s->s_fs_info); 502 kfree(s->s_fs_info);
501 s->s_fs_info = NULL; 503 s->s_fs_info = NULL;
502 504
503 return; 505 unlock_kernel();
504} 506}
505 507
506static struct kmem_cache *reiserfs_inode_cachep; 508static struct kmem_cache *reiserfs_inode_cachep;
@@ -526,10 +528,6 @@ static void init_once(void *foo)
526 528
527 INIT_LIST_HEAD(&ei->i_prealloc_list); 529 INIT_LIST_HEAD(&ei->i_prealloc_list);
528 inode_init_once(&ei->vfs_inode); 530 inode_init_once(&ei->vfs_inode);
529#ifdef CONFIG_REISERFS_FS_POSIX_ACL
530 ei->i_acl_access = NULL;
531 ei->i_acl_default = NULL;
532#endif
533} 531}
534 532
535static int init_inodecache(void) 533static int init_inodecache(void)
@@ -577,25 +575,6 @@ static void reiserfs_dirty_inode(struct inode *inode)
577 reiserfs_write_unlock(inode->i_sb); 575 reiserfs_write_unlock(inode->i_sb);
578} 576}
579 577
580#ifdef CONFIG_REISERFS_FS_POSIX_ACL
581static void reiserfs_clear_inode(struct inode *inode)
582{
583 struct posix_acl *acl;
584
585 acl = REISERFS_I(inode)->i_acl_access;
586 if (acl && !IS_ERR(acl))
587 posix_acl_release(acl);
588 REISERFS_I(inode)->i_acl_access = NULL;
589
590 acl = REISERFS_I(inode)->i_acl_default;
591 if (acl && !IS_ERR(acl))
592 posix_acl_release(acl);
593 REISERFS_I(inode)->i_acl_default = NULL;
594}
595#else
596#define reiserfs_clear_inode NULL
597#endif
598
599#ifdef CONFIG_QUOTA 578#ifdef CONFIG_QUOTA
600static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 579static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
601 size_t, loff_t); 580 size_t, loff_t);
@@ -609,7 +588,6 @@ static const struct super_operations reiserfs_sops = {
609 .write_inode = reiserfs_write_inode, 588 .write_inode = reiserfs_write_inode,
610 .dirty_inode = reiserfs_dirty_inode, 589 .dirty_inode = reiserfs_dirty_inode,
611 .delete_inode = reiserfs_delete_inode, 590 .delete_inode = reiserfs_delete_inode,
612 .clear_inode = reiserfs_clear_inode,
613 .put_super = reiserfs_put_super, 591 .put_super = reiserfs_put_super,
614 .write_super = reiserfs_write_super, 592 .write_super = reiserfs_write_super,
615 .sync_fs = reiserfs_sync_fs, 593 .sync_fs = reiserfs_sync_fs,
@@ -898,6 +876,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
898 {"conv",.setmask = 1 << REISERFS_CONVERT}, 876 {"conv",.setmask = 1 << REISERFS_CONVERT},
899 {"attrs",.setmask = 1 << REISERFS_ATTRS}, 877 {"attrs",.setmask = 1 << REISERFS_ATTRS},
900 {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, 878 {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
879 {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
901#ifdef CONFIG_REISERFS_FS_XATTR 880#ifdef CONFIG_REISERFS_FS_XATTR
902 {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, 881 {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
903 {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, 882 {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
@@ -1193,6 +1172,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1193 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); 1172 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
1194#endif 1173#endif
1195 1174
1175 lock_kernel();
1196 rs = SB_DISK_SUPER_BLOCK(s); 1176 rs = SB_DISK_SUPER_BLOCK(s);
1197 1177
1198 if (!reiserfs_parse_options 1178 if (!reiserfs_parse_options
@@ -1315,10 +1295,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1315 1295
1316out_ok: 1296out_ok:
1317 replace_mount_options(s, new_opts); 1297 replace_mount_options(s, new_opts);
1298 unlock_kernel();
1318 return 0; 1299 return 0;
1319 1300
1320out_err: 1301out_err:
1321 kfree(new_opts); 1302 kfree(new_opts);
1303 unlock_kernel();
1322 return err; 1304 return err;
1323} 1305}
1324 1306
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8e7deb0e6964..6925b835a43b 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -46,7 +46,6 @@
46#include <linux/reiserfs_acl.h> 46#include <linux/reiserfs_acl.h>
47#include <asm/uaccess.h> 47#include <asm/uaccess.h>
48#include <net/checksum.h> 48#include <net/checksum.h>
49#include <linux/smp_lock.h>
50#include <linux/stat.h> 49#include <linux/stat.h>
51#include <linux/quotaops.h> 50#include <linux/quotaops.h>
52 51
@@ -981,7 +980,8 @@ int reiserfs_lookup_privroot(struct super_block *s)
981 strlen(PRIVROOT_NAME)); 980 strlen(PRIVROOT_NAME));
982 if (!IS_ERR(dentry)) { 981 if (!IS_ERR(dentry)) {
983 REISERFS_SB(s)->priv_root = dentry; 982 REISERFS_SB(s)->priv_root = dentry;
984 s->s_root->d_op = &xattr_lookup_poison_ops; 983 if (!reiserfs_expose_privroot(s))
984 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode) 985 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE; 986 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else 987 } else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index c303c426fe2b..35d6e672a279 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -188,29 +188,6 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
188 return ERR_PTR(-EINVAL); 188 return ERR_PTR(-EINVAL);
189} 189}
190 190
191static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
192 struct posix_acl *acl)
193{
194 spin_lock(&inode->i_lock);
195 if (*i_acl != ERR_PTR(-ENODATA))
196 posix_acl_release(*i_acl);
197 *i_acl = posix_acl_dup(acl);
198 spin_unlock(&inode->i_lock);
199}
200
201static inline struct posix_acl *iget_acl(struct inode *inode,
202 struct posix_acl **i_acl)
203{
204 struct posix_acl *acl = ERR_PTR(-ENODATA);
205
206 spin_lock(&inode->i_lock);
207 if (*i_acl != ERR_PTR(-ENODATA))
208 acl = posix_acl_dup(*i_acl);
209 spin_unlock(&inode->i_lock);
210
211 return acl;
212}
213
214/* 191/*
215 * Inode operation get_posix_acl(). 192 * Inode operation get_posix_acl().
216 * 193 *
@@ -220,34 +197,29 @@ static inline struct posix_acl *iget_acl(struct inode *inode,
220struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) 197struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
221{ 198{
222 char *name, *value; 199 char *name, *value;
223 struct posix_acl *acl, **p_acl; 200 struct posix_acl *acl;
224 int size; 201 int size;
225 int retval; 202 int retval;
226 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); 203
204 acl = get_cached_acl(inode, type);
205 if (acl != ACL_NOT_CACHED)
206 return acl;
227 207
228 switch (type) { 208 switch (type) {
229 case ACL_TYPE_ACCESS: 209 case ACL_TYPE_ACCESS:
230 name = POSIX_ACL_XATTR_ACCESS; 210 name = POSIX_ACL_XATTR_ACCESS;
231 p_acl = &reiserfs_i->i_acl_access;
232 break; 211 break;
233 case ACL_TYPE_DEFAULT: 212 case ACL_TYPE_DEFAULT:
234 name = POSIX_ACL_XATTR_DEFAULT; 213 name = POSIX_ACL_XATTR_DEFAULT;
235 p_acl = &reiserfs_i->i_acl_default;
236 break; 214 break;
237 default: 215 default:
238 return ERR_PTR(-EINVAL); 216 BUG();
239 } 217 }
240 218
241 acl = iget_acl(inode, p_acl);
242 if (acl && !IS_ERR(acl))
243 return acl;
244 else if (PTR_ERR(acl) == -ENODATA)
245 return NULL;
246
247 size = reiserfs_xattr_get(inode, name, NULL, 0); 219 size = reiserfs_xattr_get(inode, name, NULL, 0);
248 if (size < 0) { 220 if (size < 0) {
249 if (size == -ENODATA || size == -ENOSYS) { 221 if (size == -ENODATA || size == -ENOSYS) {
250 *p_acl = ERR_PTR(-ENODATA); 222 set_cached_acl(inode, type, NULL);
251 return NULL; 223 return NULL;
252 } 224 }
253 return ERR_PTR(size); 225 return ERR_PTR(size);
@@ -262,14 +234,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
262 /* This shouldn't actually happen as it should have 234 /* This shouldn't actually happen as it should have
263 been caught above.. but just in case */ 235 been caught above.. but just in case */
264 acl = NULL; 236 acl = NULL;
265 *p_acl = ERR_PTR(-ENODATA);
266 } else if (retval < 0) { 237 } else if (retval < 0) {
267 acl = ERR_PTR(retval); 238 acl = ERR_PTR(retval);
268 } else { 239 } else {
269 acl = posix_acl_from_disk(value, retval); 240 acl = posix_acl_from_disk(value, retval);
270 if (!IS_ERR(acl))
271 iset_acl(inode, p_acl, acl);
272 } 241 }
242 if (!IS_ERR(acl))
243 set_cached_acl(inode, type, acl);
273 244
274 kfree(value); 245 kfree(value);
275 return acl; 246 return acl;
@@ -287,10 +258,8 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
287{ 258{
288 char *name; 259 char *name;
289 void *value = NULL; 260 void *value = NULL;
290 struct posix_acl **p_acl;
291 size_t size = 0; 261 size_t size = 0;
292 int error; 262 int error;
293 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
294 263
295 if (S_ISLNK(inode->i_mode)) 264 if (S_ISLNK(inode->i_mode))
296 return -EOPNOTSUPP; 265 return -EOPNOTSUPP;
@@ -298,7 +267,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
298 switch (type) { 267 switch (type) {
299 case ACL_TYPE_ACCESS: 268 case ACL_TYPE_ACCESS:
300 name = POSIX_ACL_XATTR_ACCESS; 269 name = POSIX_ACL_XATTR_ACCESS;
301 p_acl = &reiserfs_i->i_acl_access;
302 if (acl) { 270 if (acl) {
303 mode_t mode = inode->i_mode; 271 mode_t mode = inode->i_mode;
304 error = posix_acl_equiv_mode(acl, &mode); 272 error = posix_acl_equiv_mode(acl, &mode);
@@ -313,7 +281,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
313 break; 281 break;
314 case ACL_TYPE_DEFAULT: 282 case ACL_TYPE_DEFAULT:
315 name = POSIX_ACL_XATTR_DEFAULT; 283 name = POSIX_ACL_XATTR_DEFAULT;
316 p_acl = &reiserfs_i->i_acl_default;
317 if (!S_ISDIR(inode->i_mode)) 284 if (!S_ISDIR(inode->i_mode))
318 return acl ? -EACCES : 0; 285 return acl ? -EACCES : 0;
319 break; 286 break;
@@ -346,7 +313,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
346 kfree(value); 313 kfree(value);
347 314
348 if (!error) 315 if (!error)
349 iset_acl(inode, p_acl, acl); 316 set_cached_acl(inode, type, acl);
350 317
351 return error; 318 return error;
352} 319}
@@ -379,11 +346,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
379 } 346 }
380 347
381 acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); 348 acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT);
382 if (IS_ERR(acl)) { 349 if (IS_ERR(acl))
383 if (PTR_ERR(acl) == -ENODATA)
384 goto apply_umask;
385 return PTR_ERR(acl); 350 return PTR_ERR(acl);
386 }
387 351
388 if (acl) { 352 if (acl) {
389 struct posix_acl *acl_copy; 353 struct posix_acl *acl_copy;
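
The reiserfs-private i_acl_access/i_acl_default fields and their iget_acl()/iset_acl() helpers give way to the generic VFS cache via get_cached_acl()/set_cached_acl(). The general shape is cache-aside; the model below is a hypothetical single-threaded userspace illustration (the real helpers serialize with i_lock), with NOT_CACHED playing the role of ACL_NOT_CACHED:

#include <stdio.h>

#define NOT_CACHED ((void *)-1)         /* plays the role of ACL_NOT_CACHED */

static void *cache_slot = NOT_CACHED;   /* the per-inode pointer, modeled */

static void *load_from_store(void)      /* stand-in for the xattr read */
{
    static int stored = 42;
    return &stored;
}

static void *get_cached_or_load(void)
{
    void *v = cache_slot;

    if (v != NOT_CACHED)                /* fast path: cache hit */
        return v;
    v = load_from_store();              /* slow path: fetch from store... */
    cache_slot = v;                     /* ...then populate the cache */
    return v;
}

int main(void)
{
    printf("%d %d\n", *(int *)get_cached_or_load(),
           *(int *)get_cached_or_load());       /* second call hits cache */
    return 0;
}
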
diff --git a/fs/select.c b/fs/select.c
index 0fe0e1469df3..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
 	pwq->polling_task = current;
+	pwq->triggered = 0;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
@@ -168,7 +169,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 	return table->entry++;
 }
 
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
 	struct poll_wqueues *pwq = wait->private;
 	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -194,6 +195,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	return default_wake_function(&dummy_wait, mode, sync, key);
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_table_entry *entry;
+
+	entry = container_of(wait, struct poll_table_entry, wait);
+	if (key && !((unsigned long)key & entry->key))
+		return 0;
+	return __pollwake(wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
@@ -205,6 +216,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
+	entry->key = p->key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
 	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
@@ -362,6 +374,18 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
+static inline void wait_key_set(poll_table *wait, unsigned long in,
+				unsigned long out, unsigned long bit)
+{
+	if (wait) {
+		wait->key = POLLEX_SET;
+		if (in & bit)
+			wait->key |= POLLIN_SET;
+		if (out & bit)
+			wait->key |= POLLOUT_SET;
+	}
+}
+
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
 	ktime_t expire, *to = NULL;
@@ -418,20 +442,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				if (file) {
 					f_op = file->f_op;
 					mask = DEFAULT_POLLMASK;
-					if (f_op && f_op->poll)
-						mask = (*f_op->poll)(file, retval ? NULL : wait);
+					if (f_op && f_op->poll) {
+						wait_key_set(wait, in, out, bit);
+						mask = (*f_op->poll)(file, wait);
+					}
 					fput_light(file, fput_needed);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
 						retval++;
+						wait = NULL;
 					}
 					if ((mask & POLLOUT_SET) && (out & bit)) {
 						res_out |= bit;
 						retval++;
+						wait = NULL;
 					}
 					if ((mask & POLLEX_SET) && (ex & bit)) {
 						res_ex |= bit;
 						retval++;
+						wait = NULL;
 					}
 				}
 			}
@@ -685,8 +714,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 		mask = POLLNVAL;
 		if (file != NULL) {
 			mask = DEFAULT_POLLMASK;
-			if (file->f_op && file->f_op->poll)
+			if (file->f_op && file->f_op->poll) {
+				if (pwait)
+					pwait->key = pollfd->events |
+							POLLERR | POLLHUP;
 				mask = file->f_op->poll(file, pwait);
+			}
			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
 			fput_light(file, fput_needed);
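
Taken together, the select.c hunks implement keyed wakeups for poll/select: __pollwait() records the event mask the waiter cares about in entry->key, and the new pollwake() wrapper silently drops any wakeup whose key has no bit in common with it, so a task selecting only for readability is no longer woken by writability traffic. A waker opts in by passing an event mask as the wake-up key. A hedged driver-side sketch (the mydev structure is illustrative, and wake_up_interruptible_poll() is assumed to be the keyed-wakeup helper from the same series):

	#include <linux/poll.h>
	#include <linux/wait.h>

	struct mydev {
		wait_queue_head_t read_wq;
		int have_data;
	};

	/* Deliver a keyed wakeup: only waiters whose poll key intersects
	 * POLLIN | POLLRDNORM run their callback; the rest skip pollwake(). */
	static void mydev_data_arrived(struct mydev *d)
	{
		d->have_data = 1;
		wake_up_interruptible_poll(&d->read_wq, POLLIN | POLLRDNORM);
	}
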
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7f40f30c55c5..6c959275f2d0 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s)
 }
 EXPORT_SYMBOL(seq_puts);
 
+/**
+ * seq_write - write arbitrary data to buffer
+ * @seq: seq_file identifying the buffer to which data should be written
+ * @data: data address
+ * @len: number of bytes
+ *
+ * Return 0 on success, non-zero otherwise.
+ */
+int seq_write(struct seq_file *seq, const void *data, size_t len)
+{
+	if (seq->count + len < seq->size) {
+		memcpy(seq->buf + seq->count, data, len);
+		seq->count += len;
+		return 0;
+	}
+	seq->count = seq->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_write);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
 	struct list_head *lh;
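
seq_write() rounds out the seq_file output helpers: seq_printf() and seq_puts() both require NUL-terminated format/string data, while seq_write() copies an arbitrary byte range. A hedged usage sketch of a show() callback (my_blob and my_blob_len are hypothetical):

	/* Sketch: emit a raw, possibly non-NUL-terminated blob. */
	static int my_show(struct seq_file *m, void *v)
	{
		seq_puts(m, "blob: ");
		seq_write(m, my_blob, my_blob_len);	/* no format parsing */
		seq_putc(m, '\n');
		return 0;
	}
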
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fc27fbfc5397..1402d2d54f52 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb)
 {
 	struct smb_sb_info *server = SMB_SB(sb);
 
+	lock_kernel();
+
 	smb_lock_server(server);
 	server->state = CONN_INVALID;
 	smbiod_unregister_server(server);
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb)
 	smb_unlock_server(server);
 	put_pid(server->conn_pid);
 	kfree(server);
+
+	unlock_kernel();
 }
 
 static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
diff --git a/fs/splice.c b/fs/splice.c
index 666953d59a35..73766d24f97b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -507,9 +507,131 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_splice_read);
 
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t offset)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+			    loff_t pos)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_write(file, (const char __user *)buf, count, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	unsigned int nr_pages;
+	unsigned int nr_freed;
+	size_t offset;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct iovec vec[PIPE_BUFFERS];
+	pgoff_t index;
+	ssize_t res;
+	size_t this_len;
+	int error;
+	int i;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &default_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+		struct page *page;
+
+		page = alloc_page(GFP_USER);
+		error = -ENOMEM;
+		if (!page)
+			goto err;
+
+		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+		vec[i].iov_base = (void __user *) page_address(page);
+		vec[i].iov_len = this_len;
+		pages[i] = page;
+		spd.nr_pages++;
+		len -= this_len;
+		offset = 0;
+	}
+
+	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+	if (res < 0) {
+		error = res;
+		goto err;
+	}
+
+	error = 0;
+	if (!res)
+		goto err;
+
+	nr_freed = 0;
+	for (i = 0; i < spd.nr_pages; i++) {
+		this_len = min_t(size_t, vec[i].iov_len, res);
+		partial[i].offset = 0;
+		partial[i].len = this_len;
+		if (!this_len) {
+			__free_page(pages[i]);
+			pages[i] = NULL;
+			nr_freed++;
+		}
+		res -= this_len;
+	}
+	spd.nr_pages -= nr_freed;
+
+	res = splice_to_pipe(pipe, &spd);
+	if (res > 0)
+		*ppos += res;
+
+	return res;
+
+err:
+	for (i = 0; i < spd.nr_pages; i++)
+		__free_page(pages[i]);
+
+	return error;
+}
+EXPORT_SYMBOL(default_file_splice_read);
+
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
  * using sendpage(). Return the number of bytes sent.
@@ -881,6 +1003,36 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			  struct splice_desc *sd)
+{
+	int ret;
+	void *data;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (ret)
+		return ret;
+
+	data = buf->ops->map(pipe, buf, 0);
+	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+	buf->ops->unmap(pipe, buf, data);
+
+	return ret;
+}
+
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t *ppos,
+					 size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
+	if (ret > 0)
+		*ppos += ret;
+
+	return ret;
+}
+
 /**
  * generic_splice_sendpage - splice data from a pipe to a socket
  * @pipe: pipe to splice from
@@ -908,11 +1060,10 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 			   loff_t *ppos, size_t len, unsigned int flags)
 {
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+				loff_t *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!out->f_op || !out->f_op->splice_write))
-		return -EINVAL;
-
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
@@ -923,7 +1074,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return out->f_op->splice_write(pipe, out, ppos, len, flags);
+	splice_write = out->f_op->splice_write;
+	if (!splice_write)
+		splice_write = default_file_splice_write;
+
+	return splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
@@ -933,11 +1088,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe, size_t len,
 			 unsigned int flags)
 {
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!in->f_op || !in->f_op->splice_read))
-		return -EINVAL;
-
 	if (unlikely(!(in->f_mode & FMODE_READ)))
 		return -EBADF;
 
@@ -945,7 +1099,11 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return in->f_op->splice_read(in, ppos, pipe, len, flags);
+	splice_read = in->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
+
+	return splice_read(in, ppos, pipe, len, flags);
 }
 
 /**
@@ -1112,6 +1270,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	return ret;
 }
 
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags);
 /*
  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
  * location, so checking ->i_pipe is not enough to verify that this is a
@@ -1132,12 +1293,32 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		      struct file *out, loff_t __user *off_out,
 		      size_t len, unsigned int flags)
 {
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *ipipe;
+	struct pipe_inode_info *opipe;
 	loff_t offset, *off;
 	long ret;
 
-	pipe = pipe_info(in->f_path.dentry->d_inode);
-	if (pipe) {
+	ipipe = pipe_info(in->f_path.dentry->d_inode);
+	opipe = pipe_info(out->f_path.dentry->d_inode);
+
+	if (ipipe && opipe) {
+		if (off_in || off_out)
+			return -ESPIPE;
+
+		if (!(in->f_mode & FMODE_READ))
+			return -EBADF;
+
+		if (!(out->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		/* Splicing to self would be fun, but... */
+		if (ipipe == opipe)
+			return -EINVAL;
+
+		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+	}
+
+	if (ipipe) {
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
@@ -1149,7 +1330,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &out->f_pos;
 
-		ret = do_splice_from(pipe, out, off, len, flags);
+		ret = do_splice_from(ipipe, out, off, len, flags);
 
 		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1157,8 +1338,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		return ret;
 	}
 
-	pipe = pipe_info(out->f_path.dentry->d_inode);
-	if (pipe) {
+	if (opipe) {
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
@@ -1170,7 +1350,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &in->f_pos;
 
-		ret = do_splice_to(in, off, pipe, len, flags);
+		ret = do_splice_to(in, off, opipe, len, flags);
 
 		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1511,7 +1691,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
-static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
@@ -1549,7 +1729,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
-static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
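
The hunk that follows adds the pipe-to-pipe copy path. Its ABBA comment deserves unpacking: two tasks splicing in opposite directions (A to B and B to A) would each hold one pipe mutex while waiting for the other, so pipe_double_lock() has to impose a global acquisition order. A generic, hedged sketch of the order-by-address idiom it relies on (double_lock() here is illustrative, not the kernel's implementation; the a == b case is excluded because the caller rejects ipipe == opipe):

	/* Sketch: take two mutexes in a globally consistent (address) order
	 * so concurrent A->B and B->A users cannot deadlock. */
	static void double_lock(struct mutex *a, struct mutex *b)
	{
		if (a < b) {
			mutex_lock(a);
			mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
		} else {
			mutex_lock(b);
			mutex_lock_nested(a, SINGLE_DEPTH_NESTING);
		}
	}
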
@@ -1587,6 +1767,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 }
 
 /*
+ * Splice contents of ipipe to opipe.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, nbuf;
+	bool input_wakeup = false;
+
+
+retry:
+	ret = ipipe_prep(ipipe, flags);
+	if (ret)
+		return ret;
+
+	ret = opipe_prep(opipe, flags);
+	if (ret)
+		return ret;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by pipe info address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	pipe_double_lock(ipipe, opipe);
+
+	do {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		if (!ipipe->nrbufs && !ipipe->writers)
+			break;
+
+		/*
+		 * Cannot make any progress, because either the input
+		 * pipe is empty or the output pipe is full.
+		 */
+		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+			/* Already processed some buffers, break */
+			if (ret)
+				break;
+
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			/*
+			 * We raced with another reader/writer and haven't
+			 * managed to process any buffers. A zero return
+			 * value means EOF, so retry instead.
+			 */
+			pipe_unlock(ipipe);
+			pipe_unlock(opipe);
+			goto retry;
+		}
+
+		ibuf = ipipe->bufs + ipipe->curbuf;
+		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		obuf = opipe->bufs + nbuf;
+
+		if (len >= ibuf->len) {
+			/*
+			 * Simply move the whole buffer from ipipe to opipe
+			 */
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			opipe->nrbufs++;
+			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->nrbufs--;
+			input_wakeup = true;
+		} else {
+			/*
+			 * Get a reference to this pipe buffer,
+			 * so we can copy the contents over.
+			 */
+			ibuf->ops->get(ipipe, ibuf);
+			*obuf = *ibuf;
+
+			/*
+			 * Don't inherit the gift flag, we need to
+			 * prevent multiple steals of this page.
+			 */
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+			obuf->len = len;
+			opipe->nrbufs++;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		ret += obuf->len;
+		len -= obuf->len;
+	} while (len);
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential readers.
+	 */
+	if (ret > 0) {
+		smp_mb();
+		if (waitqueue_active(&opipe->wait))
+			wake_up_interruptible(&opipe->wait);
+		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+	}
+	if (input_wakeup)
+		wakeup_pipe_writers(ipipe);
+
+	return ret;
+}
+
+/*
 * Link contents of ipipe to opipe.
 */
 static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1690,9 +1988,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
	 * Keep going, unless we encounter an error. The ipipe/opipe
	 * ordering doesn't really matter.
	 */
-	ret = link_ipipe_prep(ipipe, flags);
+	ret = ipipe_prep(ipipe, flags);
 	if (!ret) {
-		ret = link_opipe_prep(opipe, flags);
+		ret = opipe_prep(opipe, flags);
 		if (!ret)
 			ret = link_pipe(ipipe, opipe, len, flags);
 	}
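
Two user-visible changes fall out of the splice.c hunks: files that lack ->splice_read/->splice_write no longer fail with -EINVAL (they fall back to the new default_file_splice_read()/default_file_splice_write() paths), and splice(2) between two pipes now works without an intermediate file. A user-space sketch of the latter, which needs a kernel with these hunks applied:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int a[2], b[2];
		char buf[8];
		ssize_t n, m;

		if (pipe(a) || pipe(b))
			return 1;
		write(a[1], "hello", 5);
		/* Data moves pipe->pipe inside the kernel, no user copy. */
		n = splice(a[0], NULL, b[1], NULL, 5, 0);
		if (n < 0) {
			perror("splice");	/* EINVAL on older kernels */
			return 1;
		}
		m = read(b[0], buf, sizeof(buf));
		if (m > 0)
			printf("spliced %zd, read back %.*s\n", n, (int)m, buf);
		return 0;
	}
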
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0adc624c956f..cb5fc57e370b 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -338,6 +339,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -350,6 +353,8 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
+
+	unlock_kernel();
 }
 
 
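
The smbfs and squashfs hunks above are two instances of the same series-wide pattern: the VFS (see the generic_shutdown_super() hunk in fs/super.c below) stops taking the big kernel lock around ->put_super, so the few filesystems that still depend on the BKL must now take it themselves. The per-filesystem boilerplate reduces to a sketch like this (the teardown body is elided):

	#include <linux/smp_lock.h>

	/* Sketch: a ->put_super that still wants BKL protection must say so
	 * itself, now that the caller no longer holds it. */
	static void example_put_super(struct super_block *sb)
	{
		lock_kernel();
		/* ... free sb->s_fs_info and other private state ... */
		unlock_kernel();
	}
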
diff --git a/fs/super.c b/fs/super.c
index 1943fdf655fa..2761d3e22ed9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -28,7 +28,6 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>		/* for fsync_super() */
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -38,7 +37,6 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
-#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -72,7 +70,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 	INIT_HLIST_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
 	INIT_LIST_HEAD(&s->s_dentry_lru);
-	INIT_LIST_HEAD(&s->s_async_list);
 	init_rwsem(&s->s_umount);
 	mutex_init(&s->s_lock);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -285,38 +282,6 @@ void unlock_super(struct super_block * sb)
 EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
 
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock. Filesystem data as well as the underlying block
- * device. Takes the superblock lock. Requires a second blkdev
- * flush by the caller to complete the operation.
- */
-void __fsync_super(struct super_block *sb)
-{
-	sync_inodes_sb(sb, 0);
-	vfs_dq_sync(sb);
-	lock_super(sb);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
-	sync_inodes_sb(sb, 1);
-}
-
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock. Filesystem data as well as the underlying block
- * device. Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-	__fsync_super(sb);
-	return sync_blockdev(sb->s_bdev);
-}
-EXPORT_SYMBOL_GPL(fsync_super);
-
 /**
 * generic_shutdown_super - common helper for ->kill_sb()
 * @sb: superblock to kill
@@ -338,21 +303,13 @@ void generic_shutdown_super(struct super_block *sb)
 
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
-		fsync_super(sb);
-		lock_super(sb);
+		sync_filesystem(sb);
+		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/*
-		 * wait for asynchronous fs operations to finish before going further
-		 */
-		async_synchronize_full_domain(&sb->s_async_list);
-
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
-		lock_kernel();
 
-		if (sop->write_super && sb->s_dirt)
-			sop->write_super(sb);
 		if (sop->put_super)
 			sop->put_super(sb);
 
@@ -362,9 +319,7 @@ void generic_shutdown_super(struct super_block *sb)
			   "Self-destruct in 5 seconds. Have a nice day...\n",
			   sb->s_id);
 		}
-
-		unlock_kernel();
-		unlock_super(sb);
+		put_fs_excl();
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
@@ -441,16 +396,14 @@ void drop_super(struct super_block *sb)
 
 EXPORT_SYMBOL(drop_super);
 
-static inline void write_super(struct super_block *sb)
-{
-	lock_super(sb);
-	if (sb->s_root && sb->s_dirt)
-		if (sb->s_op->write_super)
-			sb->s_op->write_super(sb);
-	unlock_super(sb);
-}
-
-/*
+/**
+ * sync_supers - helper for periodic superblock writeback
+ *
+ * Call the write_super method if present on all dirty superblocks in
+ * the system. This is for the periodic writeback used by most older
+ * filesystems. For data integrity superblock writeback use
+ * sync_filesystems() instead.
+ *
 * Note: check the dirty flag before waiting, so we don't
 * hold up the sync while mounting a device. (The newly
 * mounted device won't need syncing.)
@@ -462,12 +415,15 @@ void sync_supers(void)
 	spin_lock(&sb_lock);
restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_dirt) {
+		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
+
 			down_read(&sb->s_umount);
-			write_super(sb);
+			if (sb->s_root && sb->s_dirt)
+				sb->s_op->write_super(sb);
 			up_read(&sb->s_umount);
+
 			spin_lock(&sb_lock);
 			if (__put_super_and_need_restart(sb))
 				goto restart;
@@ -476,60 +432,6 @@ restart:
 	spin_unlock(&sb_lock);
 }
 
-/*
- * Call the ->sync_fs super_op against all filesystems which are r/w and
- * which implement it.
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied. s_need_sync_fs
- * is used only here. We set it against all filesystems and then clear it as
- * we sync them. So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
- * flags again, which will cause process A to resync everything. Fix that with
- * a local mutex.
- *
- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
- */
-void sync_filesystems(int wait)
-{
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_op->sync_fs)
-			continue;
-		if (sb->s_flags & MS_RDONLY)
-			continue;
-		sb->s_need_sync_fs = 1;
-	}
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync_fs)
-			continue;
-		sb->s_need_sync_fs = 0;
-		if (sb->s_flags & MS_RDONLY)
-			continue;	/* hm. Was remounted r/o meanwhile */
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		async_synchronize_full_domain(&sb->s_async_list);
-		if (sb->s_root && (wait || sb->s_dirt))
-			sb->s_op->sync_fs(sb, wait);
-		up_read(&sb->s_umount);
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
-}
-
 /**
 * get_super - get the superblock of a device
 * @bdev: device to get the superblock for
@@ -616,45 +518,6 @@ out:
 }
 
 /**
- * mark_files_ro - mark all files read-only
- * @sb: superblock in question
- *
- * All files are marked read-only. We don't care about pending
- * delete files so this should be used in 'force' mode only.
- */
-
-static void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-			continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		f->f_mode &= ~FMODE_WRITE;
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
-	}
-	file_list_unlock();
-}
-
-/**
 * do_remount_sb - asks filesystem to change mount options.
 * @sb: superblock in question
 * @flags: numeric part of options
@@ -675,7 +538,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
-	fsync_super(sb);
+	sync_filesystem(sb);
 
 	/* If we are remounting RDONLY and current sb is read/write,
	   make sure there are no rw files opened */
@@ -691,9 +554,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
-		lock_super(sb);
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		unlock_super(sb);
 		if (retval)
 			return retval;
 	}
@@ -711,18 +572,17 @@ static void do_emergency_remount(struct work_struct *work)
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		sb->s_count++;
 		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
+		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
			/*
			 * ->remount_fs needs lock_kernel().
			 *
			 * What lock protects sb->s_flags??
			 */
-			lock_kernel();
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
-			unlock_kernel();
 		}
-		drop_super(sb);
+		up_write(&sb->s_umount);
+		put_super(sb);
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
@@ -748,6 +608,7 @@ void emergency_remount(void)
 
 static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
+static int unnamed_dev_start = 0; /* don't bother trying below it */
 
 int set_anon_super(struct super_block *s, void *data)
 {
@@ -758,7 +619,9 @@ int set_anon_super(struct super_block *s, void *data)
 	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
 		return -ENOMEM;
 	spin_lock(&unnamed_dev_lock);
-	error = ida_get_new(&unnamed_dev_ida, &dev);
+	error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
+	if (!error)
+		unnamed_dev_start = dev + 1;
 	spin_unlock(&unnamed_dev_lock);
 	if (error == -EAGAIN)
		/* We raced and lost with another CPU. */
@@ -769,6 +632,8 @@ int set_anon_super(struct super_block *s, void *data)
 	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
 		ida_remove(&unnamed_dev_ida, dev);
+		if (unnamed_dev_start > dev)
+			unnamed_dev_start = dev;
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
@@ -785,6 +650,8 @@ void kill_anon_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
 	ida_remove(&unnamed_dev_ida, slot);
+	if (slot < unnamed_dev_start)
+		unnamed_dev_start = slot;
 	spin_unlock(&unnamed_dev_lock);
 }
 
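
After the super.c hunks, the split of responsibilities is: sync_filesystem() (its new home is fs/sync.c, below) handles data-integrity writeback, while sync_supers() only services the periodic ->write_super callbacks of older filesystems, and it now skips superblocks without that method entirely. A hedged sketch of what the periodic contract leaves for a filesystem to do (example_flush_sb() is hypothetical):

	/* Sketch: ->write_super is now purely the periodic-writeback hook;
	 * it must clear s_dirt itself or sync_supers() keeps calling it. */
	static void example_write_super(struct super_block *sb)
	{
		lock_super(sb);
		example_flush_sb(sb);	/* hypothetical: dirty the metadata bhs */
		sb->s_dirt = 0;
		unlock_super(sb);
	}
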
diff --git a/fs/sync.c b/fs/sync.c
index 7abc65fbf21d..3422ba61d86d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -13,38 +13,128 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
			SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * sync everything. Start out by waking pdflush, because that writes back
- * all queues in parallel.
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
 */
-static void do_sync(unsigned long wait)
+static int __sync_filesystem(struct super_block *sb, int wait)
 {
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	vfs_dq_sync(NULL);
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait)
-		printk("Emergency Sync complete\n");
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
+		writeout_quota_sb(sb, -1);
+	else
+		sync_quota_sb(sb, -1);
+	sync_inodes_sb(sb, wait);
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock. Filesystem data as well as the underlying block
+ * device. Takes the superblock lock.
+ */
+int sync_filesystem(struct super_block *sb)
+{
+	int ret;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	/*
+	 * No point in syncing out anything if the filesystem is read-only.
+	 */
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	ret = __sync_filesystem(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __sync_filesystem(sb, 1);
+}
+EXPORT_SYMBOL_GPL(sync_filesystem);
+
+/*
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
+ *
+ * This operation is careful to avoid the livelock which could easily happen
+ * if two or more filesystems are being continuously dirtied. s_need_sync
+ * is used only here. We set it against all filesystems and then clear it as
+ * we sync them. So redirtied filesystems are skipped.
+ *
+ * But if process A is currently running sync_filesystems and then process B
+ * calls sync_filesystems as well, process B will set all the s_need_sync
+ * flags again, which will cause process A to resync everything. Fix that with
+ * a local mutex.
+ */
+static void sync_filesystems(int wait)
+{
+	struct super_block *sb;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);		/* Could be down_interruptible */
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list)
+		sb->s_need_sync = 1;
+
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_need_sync)
+			continue;
+		sb->s_need_sync = 0;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+			__sync_filesystem(sb, wait);
+		up_read(&sb->s_umount);
+
+		/* restart only when sb is no longer on the list */
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	mutex_unlock(&mutex);
 }
 
+/*
+ * sync everything. Start out by waking pdflush, because that writes back
+ * all queues in parallel.
+ */
 SYSCALL_DEFINE0(sync)
 {
-	do_sync(1);
+	wakeup_pdflush(0);
+	sync_filesystems(0);
+	sync_filesystems(1);
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
 	return 0;
 }
 
 static void do_sync_work(struct work_struct *work)
 {
-	do_sync(0);
+	/*
+	 * Sync twice to reduce the possibility we skipped some inodes / pages
+	 * because they were temporarily locked
+	 */
+	sync_filesystems(0);
+	sync_filesystems(0);
+	printk("Emergency Sync complete\n");
 	kfree(work);
 }
 
@@ -75,10 +165,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 
 	/* sync the superblock to buffers */
 	sb = inode->i_sb;
-	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
-	unlock_super(sb);
 
 	/* .. finally sync the buffers to disk */
 	err = sync_blockdev(sb->s_bdev);
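
The rewritten sync path funnels everything through __sync_filesystem(), and sync_filesystem() invokes it twice: wait == 0 to start writeback, then wait == 1 to wait on it, so ->sync_fs sees the same two-pass pattern. A hedged sketch of the contract an implementation should honour (the example_* helpers are hypothetical):

	/* Sketch: the two-pass ->sync_fs contract used by sync_filesystem(). */
	static int example_sync_fs(struct super_block *sb, int wait)
	{
		example_kick_writeback(sb);		/* hypothetical: queue IO */
		if (wait)
			example_wait_writeback(sb);	/* hypothetical: block */
		return 0;
	}
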
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 9345806c8853..2524714bece1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -171,6 +171,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	if (count > 0)
 		*off = offs + count;
 
+	kfree(temp);
 	return count;
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d88d0fac9fa5..14f2d71ea3ce 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -939,8 +939,10 @@ again:
	/* Remove from old parent's list and insert into new parent's list. */
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
+	drop_nlink(old_parent->d_inode);
 	sysfs_put(sd->s_parent);
 	sd->s_parent = new_parent_sd;
+	inc_nlink(new_parent->d_inode);
 	sysfs_link_sibling(sd);
 
 out_unlock:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index a3ba217fbe74..1d897ad808e0 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -192,8 +192,11 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page)
+	if (page) {
 		error = sysfs_getlink(dentry, (char *) page);
+		if (error < 0)
+			free_page((unsigned long)page);
+	}
 	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
 	return NULL;
 }
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 56f655254bfe..4e50286a4cc3 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -15,16 +15,16 @@
 
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include "sysv.h"
 
 static int sysv_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations sysv_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
@@ -74,8 +74,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 
-	lock_kernel();
-
 	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
 		goto done;
@@ -113,7 +111,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 
 done:
 	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
-	unlock_kernel();
 	return 0;
 }
 
119 116
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 589be21d884e..96340c01f4a7 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = {
 	.truncate	= sysv_truncate,
 	.getattr	= sysv_getattr,
 };
-
-int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-
-	err |= sysv_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index da20b48d350f..9824743832a7 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -21,7 +21,6 @@
 * the superblock.
 */
 
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -31,15 +30,12 @@
 #include <asm/byteorder.h>
 #include "sysv.h"
 
-/* This is only called on sync() and umount(), when s_dirt=1. */
-static void sysv_write_super(struct super_block *sb)
+static int sysv_sync_fs(struct super_block *sb, int wait)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 	unsigned long time = get_seconds(), old_time;
 
-	lock_kernel();
-	if (sb->s_flags & MS_RDONLY)
-		goto clean;
+	lock_super(sb);
 
	/*
	 * If we are going to write out the super block,
@@ -53,18 +49,29 @@ static void sysv_write_super(struct super_block *sb)
 		*sbi->s_sb_time = cpu_to_fs32(sbi, time);
 		mark_buffer_dirty(sbi->s_bh2);
 	}
-clean:
-	sb->s_dirt = 0;
-	unlock_kernel();
+
+	unlock_super(sb);
+
+	return 0;
+}
+
+static void sysv_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		sysv_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	lock_super(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
 	if (!(*flags & MS_RDONLY))
 		sb->s_dirt = 1;
+	unlock_super(sb);
 	return 0;
 }
 
@@ -72,6 +79,9 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	if (sb->s_dirt)
+		sysv_write_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
		/* XXX ext2 also updates the state here */
 		mark_buffer_dirty(sbi->s_bh1);
@@ -236,7 +246,7 @@ bad_inode:
 	return ERR_PTR(-EIO);
 }
 
-static struct buffer_head * sysv_update_inode(struct inode * inode)
+int sysv_write_inode(struct inode *inode, int wait)
 {
 	struct super_block * sb = inode->i_sb;
 	struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -244,17 +254,18 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 	struct sysv_inode * raw_inode;
 	struct sysv_inode_info * si;
 	unsigned int ino, block;
+	int err = 0;
 
 	ino = inode->i_ino;
 	if (!ino || ino > sbi->s_ninodes) {
 		printk("Bad inode number on dev %s: %d is out of range\n",
		       inode->i_sb->s_id, ino);
-		return NULL;
+		return -EIO;
 	}
 	raw_inode = sysv_raw_inode(sb, ino, &bh);
 	if (!raw_inode) {
 		printk("unable to read i-node block\n");
-		return NULL;
+		return -EIO;
 	}
 
 	raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
@@ -273,37 +284,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 		write3byte(sbi, (u8 *)&si->i_data[block],
			&raw_inode->i_data[3*block]);
 	mark_buffer_dirty(bh);
-	return bh;
-}
-
-int sysv_write_inode(struct inode * inode, int wait)
-{
-	struct buffer_head *bh;
-	lock_kernel();
-	bh = sysv_update_inode(inode);
+	if (wait) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk ("IO error syncing sysv inode [%s:%08x]\n",
+				sb->s_id, ino);
+			err = -EIO;
+		}
+	}
 	brelse(bh);
-	unlock_kernel();
 	return 0;
 }
 
-int sysv_sync_inode(struct inode * inode)
+int sysv_sync_inode(struct inode *inode)
 {
-	int err = 0;
-	struct buffer_head *bh;
-
-	bh = sysv_update_inode(inode);
-	if (bh && buffer_dirty(bh)) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
-			printk ("IO error syncing sysv inode [%s:%08lx]\n",
-				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
-		}
-	}
-	else if (!bh)
-		err = -1;
-	brelse (bh);
-	return err;
+	return sysv_write_inode(inode, 1);
 }
 
 static void sysv_delete_inode(struct inode *inode)
@@ -311,9 +306,7 @@ static void sysv_delete_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	sysv_truncate(inode);
-	lock_kernel();
 	sysv_free_inode(inode);
-	unlock_kernel();
 }
 
 static struct kmem_cache *sysv_inode_cachep;
@@ -347,6 +340,7 @@ const struct super_operations sysv_sops = {
 	.delete_inode	= sysv_delete_inode,
 	.put_super	= sysv_put_super,
 	.write_super	= sysv_write_super,
+	.sync_fs	= sysv_sync_fs,
 	.remount_fs	= sysv_remount,
 	.statfs		= sysv_statfs,
 };
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 5784a318c883..53786eb5cf60 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, int);
 extern int sysv_sync_inode(struct inode *);
-extern int sysv_sync_file(struct file *, struct dentry *, int);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int sysv_init_icache(void);
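
The sysv hunks are a template for the series' fsync cleanup: once ->write_inode honours its wait argument, the hand-rolled sysv_sync_file() can be deleted and both the file and directory operations pointed at the generic helper. A hedged sketch of the resulting wiring (example_file_operations is illustrative):

	/* Sketch: with a wait-capable ->write_inode, a simple blockdev-backed
	 * filesystem can rely on the generic simple_fsync() helper. */
	const struct file_operations example_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.aio_read	= generic_file_aio_read,
		.write		= do_sync_write,
		.aio_write	= generic_file_aio_write,
		.mmap		= generic_file_mmap,
		.fsync		= simple_fsync,	/* mapping buffers + ->write_inode */
	};
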
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index af1914462f02..eaf6d891d46f 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -91,7 +91,6 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write)
 	return nr_written;
 }
 
-
 /**
 * run_gc - run garbage collector.
 * @c: UBIFS file-system description object
@@ -628,7 +627,7 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 *
 * This function releases budget corresponding to a dirty inode. It is usually
 * called when after the inode has been written to the media and marked as
- * clean.
+ * clean. It also causes the "no space" flags to be cleared.
 */
 void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
				      struct ubifs_inode *ui)
@@ -636,6 +635,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 	struct ubifs_budget_req req;
 
 	memset(&req, 0, sizeof(struct ubifs_budget_req));
+	/* The "no space" flags will be cleared because dd_growth is > 0 */
 	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
 	ubifs_release_budget(c, &req);
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f55d523c52bb..552fb0111fff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -528,6 +528,25 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
		inode->i_nlink, dir->i_ino);
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+
+	/*
+	 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
+	 * otherwise has the potential to corrupt the orphan inode list.
+	 *
+	 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
+	 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
+	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
+	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
+	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
+	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
+	 * to the list of orphans. After this, 'vfs_link()' will link
+	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
+	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
+	 * to the list of orphans.
+	 */
+	if (inode->i_nlink == 0)
+		return -ENOENT;
+
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e8e632a1dcdf..762a7d6cec73 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -293,13 +293,15 @@
 *
 * This function is called when the write-buffer timer expires.
 */
-static void wbuf_timer_callback_nolock(unsigned long data)
+static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
 {
-	struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
+	struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
 
+	dbg_io("jhead %d", wbuf->jhead);
 	wbuf->need_sync = 1;
 	wbuf->c->need_wbuf_sync = 1;
 	ubifs_wake_up_bgt(wbuf->c);
+	return HRTIMER_NORESTART;
 }
 
 /**
@@ -308,13 +310,16 @@ static void wbuf_timer_callback_nolock(unsigned long data)
 */
 static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-	ubifs_assert(!timer_pending(&wbuf->timer));
+	ubifs_assert(!hrtimer_active(&wbuf->timer));
 
-	if (!wbuf->timeout)
+	if (wbuf->no_timer)
 		return;
-
-	wbuf->timer.expires = jiffies + wbuf->timeout;
-	add_timer(&wbuf->timer);
+	dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
+	       div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
+	       div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
+		       USEC_PER_SEC));
+	hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+			       HRTIMER_MODE_REL);
 }
 
 /**
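
The io.c hunks convert the write-buffer flush timer from a jiffies-based timer_list to a range hrtimer: instead of a single wbuf->timeout, expiry may land anywhere in [softlimit, softlimit + delta], which lets the kernel batch it with other timers. The matching field setup happens outside the hunks shown here; a hedged sketch of what the init side presumably looks like (the concrete values are assumptions, not taken from this diff):

	/* Sketch of the init side of this conversion (not in this diff). */
	hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	wbuf->timer.function = wbuf_timer_callback_nolock;
	wbuf->softlimit = ktime_set(3, 0);	/* assumed: ~3 s soft limit */
	wbuf->delta = 1000000000UL;		/* assumed: 1 s of slack, in ns */
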
@@ -323,13 +328,10 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 */
 static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-	/*
-	 * If the syncer is waiting for the lock (from the background thread's
-	 * context) and another task is changing write-buffer then the syncing
-	 * should be canceled.
-	 */
+	if (wbuf->no_timer)
+		return;
 	wbuf->need_sync = 0;
-	del_timer(&wbuf->timer);
+	hrtimer_cancel(&wbuf->timer);
 }
 
 /**
@@ -349,8 +351,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
		/* Write-buffer is empty or not seeked */
 		return 0;
 
-	dbg_io("LEB %d:%d, %d bytes",
-	       wbuf->lnum, wbuf->offs, wbuf->used);
+	dbg_io("LEB %d:%d, %d bytes, jhead %d",
+	       wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
 	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 	ubifs_assert(!(wbuf->avail & 7));
 	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -390,7 +392,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 * @offs: logical eraseblock offset to seek to
 * @dtype: data type
 *
- * This function targets the write buffer to logical eraseblock @lnum:@offs.
+ * This function targets the write-buffer to logical eraseblock @lnum:@offs.
 * The write-buffer is synchronized if it is not empty. Returns zero in case of
 * success and a negative error code in case of failure.
 */
@@ -399,7 +401,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 {
 	const struct ubifs_info *c = wbuf->c;
 
-	dbg_io("LEB %d:%d", lnum, offs);
+	dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
 	ubifs_assert(offs >= 0 && offs <= c->leb_size);
 	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -506,9 +508,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	struct ubifs_info *c = wbuf->c;
 	int err, written, n, aligned_len = ALIGN(len, 8), offs;
 
-	dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
-	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
-	       wbuf->offs + wbuf->used);
+	dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
+	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
+	       wbuf->lnum, wbuf->offs + wbuf->used);
 	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
 	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -533,8 +535,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	memcpy(wbuf->buf + wbuf->used, buf, len);
 
 	if (aligned_len == wbuf->avail) {
-		dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
+		dbg_io("flush jhead %d wbuf to LEB %d:%d",
537 wbuf->offs); 539 wbuf->jhead, wbuf->lnum, wbuf->offs);
538 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 540 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
539 wbuf->offs, c->min_io_size, 541 wbuf->offs, c->min_io_size,
540 wbuf->dtype); 542 wbuf->dtype);
@@ -562,7 +564,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
562 * minimal I/O unit. We have to fill and flush write-buffer and switch 564 * minimal I/O unit. We have to fill and flush write-buffer and switch
563 * to the next min. I/O unit. 565 * to the next min. I/O unit.
564 */ 566 */
565 dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs); 567 dbg_io("flush jhead %d wbuf to LEB %d:%d",
568 wbuf->jhead, wbuf->lnum, wbuf->offs);
566 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); 569 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
567 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 570 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
568 c->min_io_size, wbuf->dtype); 571 c->min_io_size, wbuf->dtype);
@@ -695,7 +698,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
695 int err, rlen, overlap; 698 int err, rlen, overlap;
696 struct ubifs_ch *ch = buf; 699 struct ubifs_ch *ch = buf;
697 700
698 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); 701 dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
702 dbg_ntype(type), len, wbuf->jhead);
699 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 703 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
700 ubifs_assert(!(offs & 7) && offs < c->leb_size); 704 ubifs_assert(!(offs & 7) && offs < c->leb_size);
701 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); 705 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
@@ -819,7 +823,7 @@ out:
819 * @c: UBIFS file-system description object 823 * @c: UBIFS file-system description object
820 * @wbuf: write-buffer to initialize 824 * @wbuf: write-buffer to initialize
821 * 825 *
822 * This function initializes write buffer. Returns zero in case of success 826 * This function initializes write-buffer. Returns zero in case of success
823 * %-ENOMEM in case of failure. 827 * %-ENOMEM in case of failure.
824 */ 828 */
825int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) 829int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
@@ -845,20 +849,21 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
845 wbuf->sync_callback = NULL; 849 wbuf->sync_callback = NULL;
846 mutex_init(&wbuf->io_mutex); 850 mutex_init(&wbuf->io_mutex);
847 spin_lock_init(&wbuf->lock); 851 spin_lock_init(&wbuf->lock);
848
849 wbuf->c = c; 852 wbuf->c = c;
850 init_timer(&wbuf->timer);
851 wbuf->timer.function = wbuf_timer_callback_nolock;
852 wbuf->timer.data = (unsigned long)wbuf;
853 wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
854 wbuf->next_ino = 0; 853 wbuf->next_ino = 0;
855 854
855 hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
856 wbuf->timer.function = wbuf_timer_callback_nolock;
857 wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0);
858 wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT;
859 wbuf->delta *= 1000000000ULL;
860 ubifs_assert(wbuf->delta <= ULONG_MAX);
856 return 0; 861 return 0;
857} 862}
858 863
859/** 864/**
860 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. 865 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
861 * @wbuf: the write-buffer whereto add 866 * @wbuf: the write-buffer where to add
862 * @inum: the inode number 867 * @inum: the inode number
863 * 868 *
864 * This function adds an inode number to the inode array of the write-buffer. 869 * This function adds an inode number to the inode array of the write-buffer.
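The timer rework above is the standard timer_list-to-hrtimer recipe: the
callback now returns 'enum hrtimer_restart' and recovers its context via
'container_of()' instead of the old 'unsigned long data' cookie, and the
timer is armed with 'hrtimer_start_range_ns()' so expiry may land
anywhere in [@softlimit, @softlimit + @delta], letting the kernel
coalesce wakeups. A condensed sketch (illustrative 'example_' names, not
from the patch):

	static enum hrtimer_restart example_timer_cb(struct hrtimer *t)
	{
		struct ubifs_wbuf *wbuf = container_of(t, struct ubifs_wbuf,
						       timer);

		wbuf->need_sync = 1;		/* poke the background thread */
		return HRTIMER_NORESTART;	/* one-shot; re-armed on next write */
	}

	static void example_setup_timer(struct ubifs_wbuf *wbuf)
	{
		hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		wbuf->timer.function = example_timer_cb;
		/* fire no earlier than 3 s and no later than 5 s from now */
		hrtimer_start_range_ns(&wbuf->timer, ktime_set(3, 0),
				       2ULL * NSEC_PER_SEC, HRTIMER_MODE_REL);
	}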
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 6db7a6be6c97..8aacd64957a2 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -25,7 +25,6 @@
25/* This file implements EXT2-compatible extended attribute ioctl() calls */ 25/* This file implements EXT2-compatible extended attribute ioctl() calls */
26 26
27#include <linux/compat.h> 27#include <linux/compat.h>
28#include <linux/smp_lock.h>
29#include <linux/mount.h> 28#include <linux/mount.h>
30#include "ubifs.h" 29#include "ubifs.h"
31 30
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 10662975d2ef..e5f6cf8a1155 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -53,6 +53,25 @@ static int is_empty(void *buf, int len)
53} 53}
54 54
55/** 55/**
56 * first_non_ff - find offset of the first non-0xff byte.
57 * @buf: buffer to search in
58 * @len: length of buffer
59 *
60 * This function returns offset of the first non-0xff byte in @buf or %-1 if
61 * the buffer contains only 0xff bytes.
62 */
63static int first_non_ff(void *buf, int len)
64{
65 uint8_t *p = buf;
66 int i;
67
68 for (i = 0; i < len; i++)
69 if (*p++ != 0xff)
70 return i;
71 return -1;
72}
73
74/**
56 * get_master_node - get the last valid master node allowing for corruption. 75 * get_master_node - get the last valid master node allowing for corruption.
57 * @c: UBIFS file-system description object 76 * @c: UBIFS file-system description object
58 * @lnum: LEB number 77 * @lnum: LEB number
@@ -343,43 +362,21 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
343 * 362 *
344 * This function returns %1 if @offs was in the last write to the LEB whose data 363 * This function returns %1 if @offs was in the last write to the LEB whose data
345 * is in @buf, otherwise %0 is returned. The determination is made by checking 364 * is in @buf, otherwise %0 is returned. The determination is made by checking
346 * for subsequent empty space starting from the next min_io_size boundary (or a 365 * for subsequent empty space starting from the next @c->min_io_size boundary.
347 * bit less than the common header size if min_io_size is one).
348 */ 366 */
349static int is_last_write(const struct ubifs_info *c, void *buf, int offs) 367static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
350{ 368{
351 int empty_offs; 369 int empty_offs, check_len;
352 int check_len;
353 uint8_t *p; 370 uint8_t *p;
354 371
355 if (c->min_io_size == 1) {
356 check_len = c->leb_size - offs;
357 p = buf + check_len;
358 for (; check_len > 0; check_len--)
359 if (*--p != 0xff)
360 break;
361 /*
362 * 'check_len' is the size of the corruption which cannot be
363 * more than the size of 1 node if it was caused by an unclean
364 * unmount.
365 */
366 if (check_len > UBIFS_MAX_NODE_SZ)
367 return 0;
368 return 1;
369 }
370
371 /* 372 /*
372 * Round up to the next c->min_io_size boundary i.e. 'offs' is in the 373 * Round up to the next @c->min_io_size boundary i.e. @offs is in the
373 * last wbuf written. After that should be empty space. 374 * last wbuf written. After that should be empty space.
374 */ 375 */
375 empty_offs = ALIGN(offs + 1, c->min_io_size); 376 empty_offs = ALIGN(offs + 1, c->min_io_size);
376 check_len = c->leb_size - empty_offs; 377 check_len = c->leb_size - empty_offs;
377 p = buf + empty_offs - offs; 378 p = buf + empty_offs - offs;
378 379 return is_empty(p, check_len);
379 for (; check_len > 0; check_len--)
380 if (*p++ != 0xff)
381 return 0;
382 return 1;
383} 380}
384 381
385/** 382/**
@@ -392,7 +389,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
392 * 389 *
393 * This function pads up to the next min_io_size boundary (if there is one) and 390 * This function pads up to the next min_io_size boundary (if there is one) and
394 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next 391 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
395 * min_io_size boundary (if there is one). 392 * @c->min_io_size boundary.
396 */ 393 */
397static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, 394static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
398 int *offs, int *len) 395 int *offs, int *len)
@@ -402,11 +399,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
402 lnum = lnum; 399 lnum = lnum;
403 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); 400 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
404 401
405 if (c->min_io_size == 1) {
406 memset(*buf, 0xff, c->leb_size - *offs);
407 return;
408 }
409
410 ubifs_assert(!(*offs & 7)); 402 ubifs_assert(!(*offs & 7));
411 empty_offs = ALIGN(*offs, c->min_io_size); 403 empty_offs = ALIGN(*offs, c->min_io_size);
412 pad_len = empty_offs - *offs; 404 pad_len = empty_offs - *offs;
@@ -566,8 +558,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
566 * 558 *
567 * This function does a scan of a LEB, but caters for errors that might have 559 * This function does a scan of a LEB, but caters for errors that might have
568 * been caused by the unclean unmount from which we are attempting to recover. 560 * been caused by the unclean unmount from which we are attempting to recover.
569 * 561 * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is
570 * This function returns %0 on success and a negative error code on failure. 562 * found, and a negative error code in case of failure.
571 */ 563 */
572struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 564struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
573 int offs, void *sbuf, int grouped) 565 int offs, void *sbuf, int grouped)
@@ -666,7 +658,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
666 goto corrupted; 658 goto corrupted;
667 default: 659 default:
668 dbg_err("unknown"); 660 dbg_err("unknown");
669 goto corrupted; 661 err = -EINVAL;
662 goto error;
670 } 663 }
671 } 664 }
672 665
@@ -675,8 +668,13 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
675 clean_buf(c, &buf, lnum, &offs, &len); 668 clean_buf(c, &buf, lnum, &offs, &len);
676 need_clean = 1; 669 need_clean = 1;
677 } else { 670 } else {
678 ubifs_err("corrupt empty space at LEB %d:%d", 671 int corruption = first_non_ff(buf, len);
679 lnum, offs); 672
673 ubifs_err("corrupt empty space LEB %d:%d, corruption "
674 "starts at %d", lnum, offs, corruption);
675 /* Make sure we dump interesting non-0xFF data */
676 offs = corruption;
677 buf += corruption;
680 goto corrupted; 678 goto corrupted;
681 } 679 }
682 } 680 }
@@ -836,7 +834,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
836static int recover_head(const struct ubifs_info *c, int lnum, int offs, 834static int recover_head(const struct ubifs_info *c, int lnum, int offs,
837 void *sbuf) 835 void *sbuf)
838{ 836{
839 int len, err, need_clean = 0; 837 int len, err;
840 838
841 if (c->min_io_size > 1) 839 if (c->min_io_size > 1)
842 len = c->min_io_size; 840 len = c->min_io_size;
@@ -850,19 +848,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs,
850 848
851 /* Read at the head location and check it is empty flash */ 849 /* Read at the head location and check it is empty flash */
852 err = ubi_read(c->ubi, lnum, sbuf, offs, len); 850 err = ubi_read(c->ubi, lnum, sbuf, offs, len);
853 if (err) 851 if (err || !is_empty(sbuf, len)) {
854 need_clean = 1;
855 else {
856 uint8_t *p = sbuf;
857
858 while (len--)
859 if (*p++ != 0xff) {
860 need_clean = 1;
861 break;
862 }
863 }
864
865 if (need_clean) {
866 dbg_rcvry("cleaning head at %d:%d", lnum, offs); 852 dbg_rcvry("cleaning head at %d:%d", lnum, offs);
867 if (offs == 0) 853 if (offs == 0)
868 return ubifs_leb_unmap(c, lnum); 854 return ubifs_leb_unmap(c, lnum);
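Both recovery.c simplifications above route the former open-coded 0xFF
loops through the file's pre-existing 'is_empty()' helper (defined just
before the new 'first_non_ff()', outside this hunk), which is roughly
the dual of 'first_non_ff()':

	/* Erased flash reads back as all 0xff; any other byte is data */
	static int is_empty(void *buf, int len)
	{
		uint8_t *p = buf;
		int i;

		for (i = 0; i < len; i++)
			if (*p++ != 0xff)
				return 0;
		return 1;
	}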
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 11cc80125a49..2970500f32df 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -837,9 +837,10 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
837 837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs); 838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf); 839 sleb = ubifs_scan(c, lnum, offs, sbuf);
840 if (IS_ERR(sleb)) { 840 if (IS_ERR(sleb)) {
841 if (c->need_recovery) 841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 842 return PTR_ERR(sleb);
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
843 if (IS_ERR(sleb)) 844 if (IS_ERR(sleb))
844 return PTR_ERR(sleb); 845 return PTR_ERR(sleb);
845 } 846 }
@@ -957,7 +958,7 @@ out:
957 return err; 958 return err;
958 959
959out_dump: 960out_dump:
960 ubifs_err("log error detected while replying the log at LEB %d:%d", 961 ubifs_err("log error detected while replaying the log at LEB %d:%d",
961 lnum, offs + snod->offs); 962 lnum, offs + snod->offs);
962 dbg_dump_node(c, snod->node); 963 dbg_dump_node(c, snod->node);
963 ubifs_scan_destroy(sleb); 964 ubifs_scan_destroy(sleb);
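The reworked fallback relies on the kernel's ERR_PTR convention:
'ubifs_scan()' hands back either a valid pointer or a small negative
errno encoded in the pointer value, and only a -EUCLEAN failure on a
filesystem already marked for recovery justifies trying
'ubifs_recover_log_leb()'. Annotated restatement of the new dispatch
(same logic as the hunk above):

	sleb = ubifs_scan(c, lnum, offs, sbuf);
	if (IS_ERR(sleb)) {			/* errno encoded in pointer */
		if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
			return PTR_ERR(sleb);	/* not recoverable, bail out */
		sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
		if (IS_ERR(sleb))		/* recovery itself failed */
			return PTR_ERR(sleb);
	}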
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 0ed82479b44b..892ebfee4fe5 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -238,12 +238,12 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
238{ 238{
239 int len; 239 int len;
240 240
241 ubifs_err("corrupted data at LEB %d:%d", lnum, offs); 241 ubifs_err("corruption at LEB %d:%d", lnum, offs);
242 if (dbg_failure_mode) 242 if (dbg_failure_mode)
243 return; 243 return;
244 len = c->leb_size - offs; 244 len = c->leb_size - offs;
245 if (len > 4096) 245 if (len > 8192)
246 len = 4096; 246 len = 8192;
247 dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); 247 dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
248 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); 248 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
249} 249}
@@ -256,7 +256,9 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
256 * @sbuf: scan buffer (must be c->leb_size) 256 * @sbuf: scan buffer (must be c->leb_size)
257 * 257 *
258 * This function scans LEB number @lnum and returns complete information about 258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns an error code in case of failure. 259 * its contents. Returns the scaned information in case of success and,
260 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
261 * of failure.
260 */ 262 */
261struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 263struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
262 int offs, void *sbuf) 264 int offs, void *sbuf)
@@ -279,7 +281,6 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
279 cond_resched(); 281 cond_resched();
280 282
281 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); 283 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
282
283 if (ret > 0) { 284 if (ret > 0) {
284 /* Padding bytes or a valid padding node */ 285 /* Padding bytes or a valid padding node */
285 offs += ret; 286 offs += ret;
@@ -304,7 +305,8 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
304 goto corrupted; 305 goto corrupted;
305 default: 306 default:
306 dbg_err("unknown"); 307 dbg_err("unknown");
307 goto corrupted; 308 err = -EINVAL;
309 goto error;
308 } 310 }
309 311
310 err = ubifs_add_snod(c, sleb, buf, offs); 312 err = ubifs_add_snod(c, sleb, buf, offs);
@@ -317,8 +319,10 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
317 len -= node_len; 319 len -= node_len;
318 } 320 }
319 321
320 if (offs % c->min_io_size) 322 if (offs % c->min_io_size) {
321 goto corrupted; 323 ubifs_err("empty space starts at non-aligned offset %d", offs);
324 goto corrupted;
325 }
322 326
323 ubifs_end_scan(c, sleb, lnum, offs); 327 ubifs_end_scan(c, sleb, lnum, offs);
324 328
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9f7a754c4f7..26d2e0d80465 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h> 37#include <linux/math64.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/smp_lock.h>
39#include "ubifs.h" 40#include "ubifs.h"
40 41
41/* 42/*
@@ -360,6 +361,11 @@ static void ubifs_delete_inode(struct inode *inode)
360out: 361out:
361 if (ui->dirty) 362 if (ui->dirty)
362 ubifs_release_dirty_inode_budget(c, ui); 363 ubifs_release_dirty_inode_budget(c, ui);
364 else {
365 /* We've deleted something - clean the "no space" flags */
366 c->nospace = c->nospace_rp = 0;
367 smp_wmb();
368 }
363 clear_inode(inode); 369 clear_inode(inode);
364} 370}
365 371
@@ -447,9 +453,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
447 if (!wait) 453 if (!wait)
448 return 0; 454 return 0;
449 455
450 if (sb->s_flags & MS_RDONLY)
451 return 0;
452
453 /* 456 /*
454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and 457 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
455 * pages, so synchronize them first, then commit the journal. Strictly 458 * pages, so synchronize them first, then commit the journal. Strictly
@@ -794,7 +797,7 @@ static int alloc_wbufs(struct ubifs_info *c)
794 * does not need to be synchronized by timer. 797 * does not need to be synchronized by timer.
795 */ 798 */
796 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; 799 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
797 c->jheads[GCHD].wbuf.timeout = 0; 800 c->jheads[GCHD].wbuf.no_timer = 1;
798 801
799 return 0; 802 return 0;
800} 803}
@@ -935,6 +938,27 @@ static const match_table_t tokens = {
935}; 938};
936 939
937/** 940/**
941 * parse_standard_option - parse a standard mount option.
942 * @option: the option to parse
943 *
944 * Normally, standard mount options like "sync" are passed to file-systems as
945 * flags. However, when a "rootflags=" kernel boot parameter is used, they may
946 * be present in the options string. This function tries to deal with this
947 * situation and parse standard options. Returns 0 if the option was not
948 * recognized, and the corresponding integer flag if it was.
949 *
950 * UBIFS is only interested in the "sync" option, so do not check for anything
951 * else.
952 */
953static int parse_standard_option(const char *option)
954{
955 ubifs_msg("parse %s", option);
956 if (!strcmp(option, "sync"))
957 return MS_SYNCHRONOUS;
958 return 0;
959}
960
961/**
938 * ubifs_parse_options - parse mount parameters. 962 * ubifs_parse_options - parse mount parameters.
939 * @c: UBIFS file-system description object 963 * @c: UBIFS file-system description object
940 * @options: parameters to parse 964 * @options: parameters to parse
@@ -962,7 +986,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
962 switch (token) { 986 switch (token) {
963 /* 987 /*
964 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. 988 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
965 * We accepte them in order to be backware-compatible. But this 989 * We accept them in order to be backward-compatible. But this
966 * should be removed at some point. 990 * should be removed at some point.
967 */ 991 */
968 case Opt_fast_unmount: 992 case Opt_fast_unmount:
@@ -1010,9 +1034,19 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
1010 break; 1034 break;
1011 } 1035 }
1012 default: 1036 default:
1013 ubifs_err("unrecognized mount option \"%s\" " 1037 {
1014 "or missing value", p); 1038 unsigned long flag;
1015 return -EINVAL; 1039 struct super_block *sb = c->vfs_sb;
1040
1041 flag = parse_standard_option(p);
1042 if (!flag) {
1043 ubifs_err("unrecognized mount option \"%s\" "
1044 "or missing value", p);
1045 return -EINVAL;
1046 }
1047 sb->s_flags |= flag;
1048 break;
1049 }
1016 } 1050 }
1017 } 1051 }
1018 1052
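Concretely, 'parse_standard_option()' covers the boot-time case where
mount flags arrive as text rather than as MS_* bits. With an UBIFS root
volume, the kernel command line might carry (illustrative values):

	ubi.mtd=0 root=ubi0:rootfs rootflags=sync

The literal token "sync" then reaches 'ubifs_parse_options()' unmatched
and is now mapped to MS_SYNCHRONOUS instead of failing the mount with
"unrecognized mount option".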
@@ -1182,6 +1216,7 @@ static int mount_ubifs(struct ubifs_info *c)
1182 if (!ubifs_compr_present(c->default_compr)) { 1216 if (!ubifs_compr_present(c->default_compr)) {
1183 ubifs_err("compressor \"%s\" is not compiled in", 1217 ubifs_err("compressor \"%s\" is not compiled in",
1184 ubifs_compr_name(c->default_compr)); 1218 ubifs_compr_name(c->default_compr));
1219 err = -ENOTSUPP;
1185 goto out_free; 1220 goto out_free;
1186 } 1221 }
1187 1222
@@ -1252,6 +1287,9 @@ static int mount_ubifs(struct ubifs_info *c)
1252 if (err) 1287 if (err)
1253 goto out_journal; 1288 goto out_journal;
1254 1289
1290 /* Calculate 'min_idx_lebs' after journal replay */
1291 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1292
1255 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1293 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
1256 if (err) 1294 if (err)
1257 goto out_orphans; 1295 goto out_orphans;
@@ -1658,7 +1696,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1658 1696
1659 for (i = 0; i < c->jhead_cnt; i++) { 1697 for (i = 0; i < c->jhead_cnt; i++) {
1660 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1698 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1661 del_timer_sync(&c->jheads[i].wbuf.timer); 1699 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1662 } 1700 }
1663 1701
1664 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1702 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
@@ -1687,6 +1725,9 @@ static void ubifs_put_super(struct super_block *sb)
1687 1725
1688 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, 1726 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1689 c->vi.vol_id); 1727 c->vi.vol_id);
1728
1729 lock_kernel();
1730
1690 /* 1731 /*
1691 * The following asserts are only valid if there has not been a failure 1732 * The following asserts are only valid if there has not been a failure
1692 * of the media. For example, there will be dirty inodes if we failed 1733 * of the media. For example, there will be dirty inodes if we failed
@@ -1716,10 +1757,8 @@ static void ubifs_put_super(struct super_block *sb)
1716 1757
1717 /* Synchronize write-buffers */ 1758 /* Synchronize write-buffers */
1718 if (c->jheads) 1759 if (c->jheads)
1719 for (i = 0; i < c->jhead_cnt; i++) { 1760 for (i = 0; i < c->jhead_cnt; i++)
1720 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1761 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1721 del_timer_sync(&c->jheads[i].wbuf.timer);
1722 }
1723 1762
1724 /* 1763 /*
1725 * On fatal errors c->ro_media is set to 1, in which case we do 1764 * On fatal errors c->ro_media is set to 1, in which case we do
@@ -1753,6 +1792,8 @@ static void ubifs_put_super(struct super_block *sb)
1753 ubi_close_volume(c->ubi); 1792 ubi_close_volume(c->ubi);
1754 mutex_unlock(&c->umount_mutex); 1793 mutex_unlock(&c->umount_mutex);
1755 kfree(c); 1794 kfree(c);
1795
1796 unlock_kernel();
1756} 1797}
1757 1798
1758static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) 1799static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1768,17 +1809,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1768 return err; 1809 return err;
1769 } 1810 }
1770 1811
1812 lock_kernel();
1771 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1813 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1772 if (c->ro_media) { 1814 if (c->ro_media) {
1773 ubifs_msg("cannot re-mount due to prior errors"); 1815 ubifs_msg("cannot re-mount due to prior errors");
1816 unlock_kernel();
1774 return -EROFS; 1817 return -EROFS;
1775 } 1818 }
1776 err = ubifs_remount_rw(c); 1819 err = ubifs_remount_rw(c);
1777 if (err) 1820 if (err) {
1821 unlock_kernel();
1778 return err; 1822 return err;
1823 }
1779 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1824 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1780 if (c->ro_media) { 1825 if (c->ro_media) {
1781 ubifs_msg("cannot re-mount due to prior errors"); 1826 ubifs_msg("cannot re-mount due to prior errors");
1827 unlock_kernel();
1782 return -EROFS; 1828 return -EROFS;
1783 } 1829 }
1784 ubifs_remount_ro(c); 1830 ubifs_remount_ro(c);
@@ -1793,6 +1839,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1793 } 1839 }
1794 1840
1795 ubifs_assert(c->lst.taken_empty_lebs > 0); 1841 ubifs_assert(c->lst.taken_empty_lebs > 0);
1842 unlock_kernel();
1796 return 0; 1843 return 0;
1797} 1844}
1798 1845
@@ -1902,6 +1949,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1902 INIT_LIST_HEAD(&c->orph_list); 1949 INIT_LIST_HEAD(&c->orph_list);
1903 INIT_LIST_HEAD(&c->orph_new); 1950 INIT_LIST_HEAD(&c->orph_new);
1904 1951
1952 c->vfs_sb = sb;
1905 c->highest_inum = UBIFS_FIRST_INO; 1953 c->highest_inum = UBIFS_FIRST_INO;
1906 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; 1954 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
1907 1955
@@ -1928,18 +1976,19 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1928 err = bdi_init(&c->bdi); 1976 err = bdi_init(&c->bdi);
1929 if (err) 1977 if (err)
1930 goto out_close; 1978 goto out_close;
1979 err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
1980 c->vi.ubi_num, c->vi.vol_id);
1981 if (err)
1982 goto out_bdi;
1931 1983
1932 err = ubifs_parse_options(c, data, 0); 1984 err = ubifs_parse_options(c, data, 0);
1933 if (err) 1985 if (err)
1934 goto out_bdi; 1986 goto out_bdi;
1935 1987
1936 c->vfs_sb = sb;
1937
1938 sb->s_fs_info = c; 1988 sb->s_fs_info = c;
1939 sb->s_magic = UBIFS_SUPER_MAGIC; 1989 sb->s_magic = UBIFS_SUPER_MAGIC;
1940 sb->s_blocksize = UBIFS_BLOCK_SIZE; 1990 sb->s_blocksize = UBIFS_BLOCK_SIZE;
1941 sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; 1991 sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
1942 sb->s_dev = c->vi.cdev;
1943 sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); 1992 sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
1944 if (c->max_inode_sz > MAX_LFS_FILESIZE) 1993 if (c->max_inode_sz > MAX_LFS_FILESIZE)
1945 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; 1994 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
@@ -1984,16 +2033,9 @@ out_free:
1984static int sb_test(struct super_block *sb, void *data) 2033static int sb_test(struct super_block *sb, void *data)
1985{ 2034{
1986 dev_t *dev = data; 2035 dev_t *dev = data;
2036 struct ubifs_info *c = sb->s_fs_info;
1987 2037
1988 return sb->s_dev == *dev; 2038 return c->vi.cdev == *dev;
1989}
1990
1991static int sb_set(struct super_block *sb, void *data)
1992{
1993 dev_t *dev = data;
1994
1995 sb->s_dev = *dev;
1996 return 0;
1997} 2039}
1998 2040
1999static int ubifs_get_sb(struct file_system_type *fs_type, int flags, 2041static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
@@ -2021,7 +2063,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2021 2063
2022 dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); 2064 dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
2023 2065
2024 sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); 2066 sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev);
2025 if (IS_ERR(sb)) { 2067 if (IS_ERR(sb)) {
2026 err = PTR_ERR(sb); 2068 err = PTR_ERR(sb);
2027 goto out_close; 2069 goto out_close;
@@ -2061,16 +2103,11 @@ out_close:
2061 return err; 2103 return err;
2062} 2104}
2063 2105
2064static void ubifs_kill_sb(struct super_block *sb)
2065{
2066 generic_shutdown_super(sb);
2067}
2068
2069static struct file_system_type ubifs_fs_type = { 2106static struct file_system_type ubifs_fs_type = {
2070 .name = "ubifs", 2107 .name = "ubifs",
2071 .owner = THIS_MODULE, 2108 .owner = THIS_MODULE,
2072 .get_sb = ubifs_get_sb, 2109 .get_sb = ubifs_get_sb,
2073 .kill_sb = ubifs_kill_sb 2110 .kill_sb = kill_anon_super,
2074}; 2111};
2075 2112
2076/* 2113/*
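The superblock plumbing change swaps the hand-rolled 'sb_set()' and
'ubifs_kill_sb()' for the stock 'set_anon_super()'/'kill_anon_super()'
helpers, so a UBIFS superblock now carries an anonymous device number
like other non-block filesystems, and the UBI volume's character device
number survives only as the 'sget()' identity key, compared against
'c->vi.cdev' instead of 'sb->s_dev'. Condensed from the hunks above:

	static int example_sb_test(struct super_block *sb, void *data)
	{
		dev_t *dev = data;
		struct ubifs_info *c = sb->s_fs_info;

		/* identity lives in the per-fs info, not in sb->s_dev */
		return c->vi.cdev == *dev;
	}

	/* in ->get_sb(): the anonymous dev_t comes from set_anon_super() */
	sb = sget(fs_type, example_sb_test, set_anon_super, &vi.cdev);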
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0a8341e14088..a29349094422 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -95,8 +95,9 @@
95 */ 95 */
96#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" 96#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
97 97
98/* Default write-buffer synchronization timeout (5 secs) */ 98/* Write-buffer synchronization timeout interval in seconds */
99#define DEFAULT_WBUF_TIMEOUT (5 * HZ) 99#define WBUF_TIMEOUT_SOFTLIMIT 3
100#define WBUF_TIMEOUT_HARDLIMIT 5
100 101
101/* Maximum possible inode number (only 32-bit inodes are supported now) */ 102/* Maximum possible inode number (only 32-bit inodes are supported now) */
102#define MAX_INUM 0xFFFFFFFF 103#define MAX_INUM 0xFFFFFFFF
@@ -650,9 +651,12 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
650 * @io_mutex: serializes write-buffer I/O 651 * @io_mutex: serializes write-buffer I/O
651 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes 652 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
652 * fields 653 * fields
654 * @softlimit: soft write-buffer timeout interval
655 * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit
656 * and @softlimit + @delta)
653 * @timer: write-buffer timer 657 * @timer: write-buffer timer
654 * @timeout: timer expire interval in jiffies 658 * @no_timer: non-zero if this write-buffer does not have a timer
655 * @need_sync: it is set if its timer expired and needs sync 659 * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing
656 * @next_ino: points to the next position of the following inode number 660 * @next_ino: points to the next position of the following inode number
657 * @inodes: stores the inode numbers of the nodes which are in wbuf 661 * @inodes: stores the inode numbers of the nodes which are in wbuf
658 * 662 *
@@ -678,9 +682,11 @@ struct ubifs_wbuf {
678 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); 682 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
679 struct mutex io_mutex; 683 struct mutex io_mutex;
680 spinlock_t lock; 684 spinlock_t lock;
681 struct timer_list timer; 685 ktime_t softlimit;
682 int timeout; 686 unsigned long long delta;
683 int need_sync; 687 struct hrtimer timer;
688 unsigned int no_timer:1;
689 unsigned int need_sync:1;
684 int next_ino; 690 int next_ino;
685 ino_t *inodes; 691 ino_t *inodes;
686}; 692};
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index cfd31e229c89..adafcf556531 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -55,9 +55,9 @@
55 * ACL support is not implemented. 55 * ACL support is not implemented.
56 */ 56 */
57 57
58#include "ubifs.h"
58#include <linux/xattr.h> 59#include <linux/xattr.h>
59#include <linux/posix_acl_xattr.h> 60#include <linux/posix_acl_xattr.h>
60#include "ubifs.h"
61 61
62/* 62/*
63 * Limit the number of extended attributes per inode so that the total size 63 * Limit the number of extended attributes per inode so that the total size
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index 0d4503f7446d..eb880f66c23a 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -5,5 +5,5 @@
5obj-$(CONFIG_UDF_FS) += udf.o 5obj-$(CONFIG_UDF_FS) += udf.o
6 6
7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ 7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
8 partition.o super.o truncate.o symlink.o fsync.o \ 8 partition.o super.o truncate.o symlink.o \
9 directory.o misc.o udftime.o unicode.o 9 directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index e48e9a3af763..1e068535b58b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -238,7 +238,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
238 238
239 mutex_lock(&sbi->s_alloc_mutex); 239 mutex_lock(&sbi->s_alloc_mutex);
240 part_len = sbi->s_partmaps[partition].s_partition_len; 240 part_len = sbi->s_partmaps[partition].s_partition_len;
241 if (first_block < 0 || first_block >= part_len) 241 if (first_block >= part_len)
242 goto out; 242 goto out;
243 243
244 if (first_block + block_count > part_len) 244 if (first_block + block_count > part_len)
@@ -297,7 +297,7 @@ static int udf_bitmap_new_block(struct super_block *sb,
297 mutex_lock(&sbi->s_alloc_mutex); 297 mutex_lock(&sbi->s_alloc_mutex);
298 298
299repeat: 299repeat:
300 if (goal < 0 || goal >= sbi->s_partmaps[partition].s_partition_len) 300 if (goal >= sbi->s_partmaps[partition].s_partition_len)
301 goal = 0; 301 goal = 0;
302 302
303 nr_groups = bitmap->s_nr_groups; 303 nr_groups = bitmap->s_nr_groups;
@@ -666,8 +666,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 int8_t etype = -1; 666 int8_t etype = -1;
667 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
668 668
669 if (first_block < 0 || 669 if (first_block >= sbi->s_partmaps[partition].s_partition_len)
670 first_block >= sbi->s_partmaps[partition].s_partition_len)
671 return 0; 670 return 0;
672 671
673 iinfo = UDF_I(table); 672 iinfo = UDF_I(table);
@@ -743,7 +742,7 @@ static int udf_table_new_block(struct super_block *sb,
743 return newblock; 742 return newblock;
744 743
745 mutex_lock(&sbi->s_alloc_mutex); 744 mutex_lock(&sbi->s_alloc_mutex);
746 if (goal < 0 || goal >= sbi->s_partmaps[partition].s_partition_len) 745 if (goal >= sbi->s_partmaps[partition].s_partition_len)
747 goal = 0; 746 goal = 0;
748 747
749 /* We search for the closest matching block to goal. If we find 748 /* We search for the closest matching block to goal. If we find
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2efd4d5291b6..61d9a76a3a69 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = {
210 .read = generic_read_dir, 210 .read = generic_read_dir,
211 .readdir = udf_readdir, 211 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 212 .ioctl = udf_ioctl,
213 .fsync = udf_fsync_file, 213 .fsync = simple_fsync,
214}; 214};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index eb91f3b70320..7464305382b5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = {
209 .write = do_sync_write, 209 .write = do_sync_write,
210 .aio_write = udf_file_aio_write, 210 .aio_write = udf_file_aio_write,
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = simple_fsync,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek, 214 .llseek = generic_file_llseek,
215}; 215};
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
deleted file mode 100644
index b2c472b733b8..000000000000
--- a/fs/udf/fsync.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * fsync.c
3 *
4 * PURPOSE
5 * Fsync handling routines for the OSTA-UDF(tm) filesystem.
6 *
7 * COPYRIGHT
8 * This file is distributed under the terms of the GNU General Public
9 * License (GPL). Copies of the GPL can be obtained from:
10 * ftp://prep.ai.mit.edu/pub/gnu/GPL
11 * Each contributing author retains all rights to their own work.
12 *
13 * (C) 1999-2001 Ben Fennema
14 * (C) 1999-2000 Stelias Computing Inc
15 *
16 * HISTORY
17 *
18 * 05/22/99 blf Created.
19 */
20
21#include "udfdecl.h"
22
23#include <linux/fs.h>
24
25static int udf_fsync_inode(struct inode *, int);
26
27/*
28 * File may be NULL when we are called. Perhaps we shouldn't
29 * even pass file to fsync ?
30 */
31
32int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync)
33{
34 struct inode *inode = dentry->d_inode;
35
36 return udf_fsync_inode(inode, datasync);
37}
38
39static int udf_fsync_inode(struct inode *inode, int datasync)
40{
41 int err;
42
43 err = sync_mapping_buffers(inode->i_mapping);
44 if (!(inode->i_state & I_DIRTY))
45 return err;
46 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
47 return err;
48
49 err |= udf_sync_inode(inode);
50
51 return err ? -EIO : 0;
52}
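The deleted 'udf_fsync_file()'/'udf_fsync_inode()' pair is essentially
the generic buffer-backed fsync, which is why it can be replaced
wholesale by the new library helper. At this point in the tree,
'simple_fsync()' in fs/libfs.c looks roughly like the following (a
paraphrase, not part of this diff; verify against the tree before
relying on details):

	int simple_fsync(struct file *file, struct dentry *dentry,
			 int datasync)
	{
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_ALL,
			.nr_to_write = 0,	/* metadata-only */
		};
		struct inode *inode = dentry->d_inode;
		int err, ret;

		ret = sync_mapping_buffers(inode->i_mapping);	/* data */
		if (!(inode->i_state & I_DIRTY))
			return ret;
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return ret;

		err = sync_inode(inode, &wbc);		/* inode metadata */
		if (ret == 0)
			ret = err;
		return ret;
	}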
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 703843f30ffd..1b88fd5df05d 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -56,7 +56,12 @@ unsigned long udf_get_last_block(struct super_block *sb)
56 struct block_device *bdev = sb->s_bdev; 56 struct block_device *bdev = sb->s_bdev;
57 unsigned long lblock = 0; 57 unsigned long lblock = 0;
58 58
59 if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock)) 59 /*
60 * ioctl failed or returned obviously bogus value?
61 * Try using the device size...
62 */
63 if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) ||
64 lblock == 0)
60 lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits; 65 lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
61 66
62 if (lblock) 67 if (lblock)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc855a4..9d1b8c2e6c45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
569 return -EINVAL; 569 return -EINVAL;
570 570
571 lock_kernel();
571 sbi->s_flags = uopt.flags; 572 sbi->s_flags = uopt.flags;
572 sbi->s_uid = uopt.uid; 573 sbi->s_uid = uopt.uid;
573 sbi->s_gid = uopt.gid; 574 sbi->s_gid = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
581 *flags |= MS_RDONLY; 582 *flags |= MS_RDONLY;
582 } 583 }
583 584
584 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
586 unlock_kernel();
585 return 0; 587 return 0;
588 }
586 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
587 udf_close_lvid(sb); 590 udf_close_lvid(sb);
588 else 591 else
589 udf_open_lvid(sb); 592 udf_open_lvid(sb);
590 593
594 unlock_kernel();
591 return 0; 595 return 0;
592} 596}
593 597
@@ -1083,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1083 struct udf_inode_info *vati; 1087 struct udf_inode_info *vati;
1084 uint32_t pos; 1088 uint32_t pos;
1085 struct virtualAllocationTable20 *vat20; 1089 struct virtualAllocationTable20 *vat20;
1090 sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
1086 1091
1087 /* VAT file entry is in the last recorded block */ 1092 /* VAT file entry is in the last recorded block */
1088 ino.partitionReferenceNum = type1_index; 1093 ino.partitionReferenceNum = type1_index;
1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1094 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1090 sbi->s_vat_inode = udf_iget(sb, &ino); 1095 sbi->s_vat_inode = udf_iget(sb, &ino);
1096 if (!sbi->s_vat_inode &&
1097 sbi->s_last_block != blocks - 1) {
1098 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
1099 " last recorded block (%lu), retrying with the last "
1100 "block of the device (%lu).\n",
1101 (unsigned long)sbi->s_last_block,
1102 (unsigned long)blocks - 1);
1103 ino.partitionReferenceNum = type1_index;
1104 ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
1105 sbi->s_vat_inode = udf_iget(sb, &ino);
1106 }
1091 if (!sbi->s_vat_inode) 1107 if (!sbi->s_vat_inode)
1092 return 1; 1108 return 1;
1093 1109
@@ -1915,7 +1931,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { 1931 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1932 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else { 1933 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev); 1934 uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1935 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { 1936 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent) 1937 if (!silent)
@@ -2062,6 +2078,9 @@ static void udf_put_super(struct super_block *sb)
2062 struct udf_sb_info *sbi; 2078 struct udf_sb_info *sbi;
2063 2079
2064 sbi = UDF_SB(sb); 2080 sbi = UDF_SB(sb);
2081
2082 lock_kernel();
2083
2065 if (sbi->s_vat_inode) 2084 if (sbi->s_vat_inode)
2066 iput(sbi->s_vat_inode); 2085 iput(sbi->s_vat_inode);
2067 if (sbi->s_partitions) 2086 if (sbi->s_partitions)
@@ -2077,6 +2096,8 @@ static void udf_put_super(struct super_block *sb)
2077 kfree(sbi->s_partmaps); 2096 kfree(sbi->s_partmaps);
2078 kfree(sb->s_fs_info); 2097 kfree(sb->s_fs_info);
2079 sb->s_fs_info = NULL; 2098 sb->s_fs_info = NULL;
2099
2100 unlock_kernel();
2080} 2101}
2081 2102
2082static int udf_sync_fs(struct super_block *sb, int wait) 2103static int udf_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cac51b77a5d1..8d46f4294ee7 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
223extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
224 uint32_t, int *); 224 uint32_t, int *);
225 225
226/* fsync.c */
227extern int udf_fsync_file(struct file *, struct dentry *, int);
228
229/* directory.c */ 226/* directory.c */
230extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, 227extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
231 struct udf_fileident_bh *, 228 struct udf_fileident_bh *,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6321b797061b..6f671f1ac271 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = ufs_sync_file, 669 .fsync = simple_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 2bd3a1615714..73655c61240a 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,31 +24,10 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
31 30
32
33int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
34{
35 struct inode *inode = dentry->d_inode;
36 int err;
37 int ret;
38
39 ret = sync_mapping_buffers(inode->i_mapping);
40 if (!(inode->i_state & I_DIRTY))
41 return ret;
42 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
43 return ret;
44
45 err = ufs_sync_inode(inode);
46 if (ret == 0)
47 ret = err;
48 return ret;
49}
50
51
52/* 31/*
53 * We have mostly NULL's here: the current defaults are ok for 32 * We have mostly NULL's here: the current defaults are ok for
54 * the ufs filesystem. 33 * the ufs filesystem.
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = {
62 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
63 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
64 .open = generic_file_open, 43 .open = generic_file_open,
65 .fsync = ufs_sync_file, 44 .fsync = simple_fsync,
66 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
67}; 46};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3d2512c21f05..7cf33379fd46 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -56,9 +56,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
56 56
57 57
58 UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); 58 UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
59 if (i_block < 0) { 59 if (i_block < direct_blocks) {
60 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
61 } else if (i_block < direct_blocks) {
62 offsets[n++] = i_block; 60 offsets[n++] = i_block;
63 } else if ((i_block -= direct_blocks) < indirect_blocks) { 61 } else if ((i_block -= direct_blocks) < indirect_blocks) {
64 offsets[n++] = UFS_IND_BLOCK; 62 offsets[n++] = UFS_IND_BLOCK;
@@ -440,8 +438,6 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
440 lock_kernel(); 438 lock_kernel();
441 439
442 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 440 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
443 if (fragment < 0)
444 goto abort_negative;
445 if (fragment > 441 if (fragment >
446 ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) 442 ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
447 << uspi->s_fpbshift)) 443 << uspi->s_fpbshift))
@@ -504,10 +500,6 @@ abort:
504 unlock_kernel(); 500 unlock_kernel();
505 return err; 501 return err;
506 502
507abort_negative:
508 ufs_warning(sb, "ufs_get_block", "block < 0");
509 goto abort;
510
511abort_too_big: 503abort_too_big:
512 ufs_warning(sb, "ufs_get_block", "block > big"); 504 ufs_warning(sb, "ufs_get_block", "block > big");
513 goto abort; 505 goto abort;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 60359291761f..5faed7954d0a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
263 struct ufs_super_block_first * usb1; 263 struct ufs_super_block_first * usb1;
264 va_list args; 264 va_list args;
265 265
266 lock_kernel();
266 uspi = UFS_SB(sb)->s_uspi; 267 uspi = UFS_SB(sb)->s_uspi;
267 usb1 = ubh_get_usb_first(uspi); 268 usb1 = ubh_get_usb_first(uspi);
268 269
@@ -594,6 +595,9 @@ static void ufs_put_super_internal(struct super_block *sb)
594 595
595 596
596 UFSD("ENTER\n"); 597 UFSD("ENTER\n");
598
599 lock_kernel();
600
597 ufs_put_cstotal(sb); 601 ufs_put_cstotal(sb);
598 size = uspi->s_cssize; 602 size = uspi->s_cssize;
599 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 603 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -621,6 +625,9 @@ static void ufs_put_super_internal(struct super_block *sb)
621 brelse (sbi->s_ucg[i]); 625 brelse (sbi->s_ucg[i]);
622 kfree (sbi->s_ucg); 626 kfree (sbi->s_ucg);
623 kfree (base); 627 kfree (base);
628
629 unlock_kernel();
630
624 UFSD("EXIT\n"); 631 UFSD("EXIT\n");
625} 632}
626 633
@@ -1118,32 +1125,45 @@ failed_nomem:
1118 return -ENOMEM; 1125 return -ENOMEM;
1119} 1126}
1120 1127
1121static void ufs_write_super(struct super_block *sb) 1128static int ufs_sync_fs(struct super_block *sb, int wait)
1122{ 1129{
1123 struct ufs_sb_private_info * uspi; 1130 struct ufs_sb_private_info * uspi;
1124 struct ufs_super_block_first * usb1; 1131 struct ufs_super_block_first * usb1;
1125 struct ufs_super_block_third * usb3; 1132 struct ufs_super_block_third * usb3;
1126 unsigned flags; 1133 unsigned flags;
1127 1134
1135 lock_super(sb);
1128 lock_kernel(); 1136 lock_kernel();
1137
1129 UFSD("ENTER\n"); 1138 UFSD("ENTER\n");
1139
1130 flags = UFS_SB(sb)->s_flags; 1140 flags = UFS_SB(sb)->s_flags;
1131 uspi = UFS_SB(sb)->s_uspi; 1141 uspi = UFS_SB(sb)->s_uspi;
1132 usb1 = ubh_get_usb_first(uspi); 1142 usb1 = ubh_get_usb_first(uspi);
1133 usb3 = ubh_get_usb_third(uspi); 1143 usb3 = ubh_get_usb_third(uspi);
1134 1144
1135 if (!(sb->s_flags & MS_RDONLY)) { 1145 usb1->fs_time = cpu_to_fs32(sb, get_seconds());
1136 usb1->fs_time = cpu_to_fs32(sb, get_seconds()); 1146 if ((flags & UFS_ST_MASK) == UFS_ST_SUN ||
1137 if ((flags & UFS_ST_MASK) == UFS_ST_SUN 1147 (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
1138 || (flags & UFS_ST_MASK) == UFS_ST_SUNOS 1148 (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1139 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1149 ufs_set_fs_state(sb, usb1, usb3,
1140 ufs_set_fs_state(sb, usb1, usb3, 1150 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1141 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1151 ufs_put_cstotal(sb);
1142 ufs_put_cstotal(sb);
1143 }
1144 sb->s_dirt = 0; 1152 sb->s_dirt = 0;
1153
1145 UFSD("EXIT\n"); 1154 UFSD("EXIT\n");
1146 unlock_kernel(); 1155 unlock_kernel();
1156 unlock_super(sb);
1157
1158 return 0;
1159}
1160
1161static void ufs_write_super(struct super_block *sb)
1162{
1163 if (!(sb->s_flags & MS_RDONLY))
1164 ufs_sync_fs(sb, 1);
1165 else
1166 sb->s_dirt = 0;
1147} 1167}
1148 1168
1149static void ufs_put_super(struct super_block *sb) 1169static void ufs_put_super(struct super_block *sb)
@@ -1152,6 +1172,9 @@ static void ufs_put_super(struct super_block *sb)
1152 1172
1153 UFSD("ENTER\n"); 1173 UFSD("ENTER\n");
1154 1174
1175 if (sb->s_dirt)
1176 ufs_write_super(sb);
1177
1155 if (!(sb->s_flags & MS_RDONLY)) 1178 if (!(sb->s_flags & MS_RDONLY))
1156 ufs_put_super_internal(sb); 1179 ufs_put_super_internal(sb);
1157 1180
@@ -1171,7 +1194,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1171 struct ufs_super_block_third * usb3; 1194 struct ufs_super_block_third * usb3;
1172 unsigned new_mount_opt, ufstype; 1195 unsigned new_mount_opt, ufstype;
1173 unsigned flags; 1196 unsigned flags;
1174 1197
1198 lock_kernel();
1199 lock_super(sb);
1175 uspi = UFS_SB(sb)->s_uspi; 1200 uspi = UFS_SB(sb)->s_uspi;
1176 flags = UFS_SB(sb)->s_flags; 1201 flags = UFS_SB(sb)->s_flags;
1177 usb1 = ubh_get_usb_first(uspi); 1202 usb1 = ubh_get_usb_first(uspi);
@@ -1184,17 +1209,24 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1184 ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; 1209 ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
1185 new_mount_opt = 0; 1210 new_mount_opt = 0;
1186 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1211 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1187 if (!ufs_parse_options (data, &new_mount_opt)) 1212 if (!ufs_parse_options (data, &new_mount_opt)) {
1213 unlock_super(sb);
1214 unlock_kernel();
1188 return -EINVAL; 1215 return -EINVAL;
1216 }
1189 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1217 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1190 new_mount_opt |= ufstype; 1218 new_mount_opt |= ufstype;
1191 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1219 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1192 printk("ufstype can't be changed during remount\n"); 1220 printk("ufstype can't be changed during remount\n");
1221 unlock_super(sb);
1222 unlock_kernel();
1193 return -EINVAL; 1223 return -EINVAL;
1194 } 1224 }
1195 1225
1196 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1226 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1197 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1227 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1228 unlock_super(sb);
1229 unlock_kernel();
1198 return 0; 1230 return 0;
1199 } 1231 }
1200 1232
@@ -1219,6 +1251,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1219#ifndef CONFIG_UFS_FS_WRITE 1251#ifndef CONFIG_UFS_FS_WRITE
1220 printk("ufs was compiled with read-only support, " 1252 printk("ufs was compiled with read-only support, "
1221 "can't be mounted as read-write\n"); 1253 "can't be mounted as read-write\n");
1254 unlock_super(sb);
1255 unlock_kernel();
1222 return -EINVAL; 1256 return -EINVAL;
1223#else 1257#else
1224 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1258 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1227,16 +1261,22 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1227 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1261 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1228 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1262 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1229 printk("this ufstype is read-only supported\n"); 1263 printk("this ufstype is read-only supported\n");
1264 unlock_super(sb);
1265 unlock_kernel();
1230 return -EINVAL; 1266 return -EINVAL;
1231 } 1267 }
1232 if (!ufs_read_cylinder_structures(sb)) { 1268 if (!ufs_read_cylinder_structures(sb)) {
1233 printk("failed during remounting\n"); 1269 printk("failed during remounting\n");
1270 unlock_super(sb);
1271 unlock_kernel();
1234 return -EPERM; 1272 return -EPERM;
1235 } 1273 }
1236 sb->s_flags &= ~MS_RDONLY; 1274 sb->s_flags &= ~MS_RDONLY;
1237#endif 1275#endif
1238 } 1276 }
1239 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1277 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1278 unlock_super(sb);
1279 unlock_kernel();
1240 return 0; 1280 return 0;
1241} 1281}
1242 1282
@@ -1352,6 +1392,7 @@ static const struct super_operations ufs_super_ops = {
1352 .delete_inode = ufs_delete_inode, 1392 .delete_inode = ufs_delete_inode,
1353 .put_super = ufs_put_super, 1393 .put_super = ufs_put_super,
1354 .write_super = ufs_write_super, 1394 .write_super = ufs_write_super,
1395 .sync_fs = ufs_sync_fs,
1355 .statfs = ufs_statfs, 1396 .statfs = ufs_statfs,
1356 .remount_fs = ufs_remount, 1397 .remount_fs = ufs_remount,
1357 .show_options = ufs_show_options, 1398 .show_options = ufs_show_options,
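The 'ufs_write_super()'/'ufs_sync_fs()' split above follows the pattern
applied to several filesystems in this series: the real flushing moves
into '->sync_fs()', which the VFS can call from its sync paths, while
'->write_super()' shrinks to a dirty-flag-driven wrapper. Restated with
comments (same code as the hunk above):

	static void ufs_write_super(struct super_block *sb)
	{
		if (!(sb->s_flags & MS_RDONLY))
			ufs_sync_fs(sb, 1);	/* flush sb + cg summaries */
		else
			sb->s_dirt = 0;		/* nothing to write when RO */
	}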
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index d0c4acd4f1f3..644e77e13599 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
99extern const struct inode_operations ufs_file_inode_operations; 99extern const struct inode_operations ufs_file_inode_operations;
100extern const struct file_operations ufs_file_operations; 100extern const struct file_operations ufs_file_operations;
101extern const struct address_space_operations ufs_aops; 101extern const struct address_space_operations ufs_aops;
102extern int ufs_sync_file(struct file *, struct dentry *, int);
103 102
104/* ialloc.c */ 103/* ialloc.c */
105extern void ufs_free_inode (struct inode *inode); 104extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index d51b8f9db921..1c3d0af59ddf 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
297 return error; 297 return error;
298 dentry = f->f_path.dentry; 298 dentry = f->f_path.dentry;
299 audit_inode(NULL, dentry); 299 audit_inode(NULL, dentry);
300 error = mnt_want_write(f->f_path.mnt); 300 error = mnt_want_write_file(f);
301 if (!error) { 301 if (!error) {
302 error = setxattr(dentry, name, value, size, flags); 302 error = setxattr(dentry, name, value, size, flags);
303 mnt_drop_write(f->f_path.mnt); 303 mnt_drop_write(f->f_path.mnt);
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
524 return error; 524 return error;
525 dentry = f->f_path.dentry; 525 dentry = f->f_path.dentry;
526 audit_inode(NULL, dentry); 526 audit_inode(NULL, dentry);
527 error = mnt_want_write(f->f_path.mnt); 527 error = mnt_want_write_file(f);
528 if (!error) { 528 if (!error) {
529 error = removexattr(dentry, name); 529 error = removexattr(dentry, name);
530 mnt_drop_write(f->f_path.mnt); 530 mnt_drop_write(f->f_path.mnt);
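Both hunks above swap mnt_want_write(f->f_path.mnt) for mnt_want_write_file(f) while keeping the same acquire/operate/drop bracket. A minimal sketch of that bracket, with the inode modification left abstract (example_write_locked_op is an illustrative name):

	static int example_write_locked_op(struct file *f)
	{
		int error = mnt_want_write_file(f);	/* pin the mount writable */

		if (!error) {
			/* ... modify dentry/inode state here ... */
			mnt_drop_write(f->f_path.mnt);	/* always pair the drop */
		}
		return error;
	}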
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 29228f5899cd..480f28127f09 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -39,6 +39,7 @@ config XFS_QUOTA
39config XFS_POSIX_ACL 39config XFS_POSIX_ACL
40 bool "XFS POSIX ACL support" 40 bool "XFS POSIX ACL support"
41 depends on XFS_FS 41 depends on XFS_FS
42 select FS_POSIX_ACL
42 help 43 help
43 POSIX Access Control Lists (ACLs) support permissions for users and 44 POSIX Access Control Lists (ACLs) support permissions for users and
44 groups beyond the owner/group/world scheme. 45 groups beyond the owner/group/world scheme.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 60f107e47fe9..7a59daed1782 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,7 @@ xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
40endif 40endif
41 41
42xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o 42xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
43xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 43xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o
44xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o 44xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
45xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o 45xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
46xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o 46xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
@@ -88,8 +88,7 @@ xfs-y += xfs_alloc.o \
88 xfs_utils.o \ 88 xfs_utils.o \
89 xfs_vnodeops.o \ 89 xfs_vnodeops.o \
90 xfs_rw.o \ 90 xfs_rw.o \
91 xfs_dmops.o \ 91 xfs_dmops.o
92 xfs_qmops.o
93 92
94xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \ 93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
95 xfs_dir2_trace.o 94 xfs_dir2_trace.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 1cd3b55ee3d2..2d3f90afe5f1 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
53 printk(KERN_ERR "XFS: possible memory allocation " 53 printk(KERN_ERR "XFS: possible memory allocation "
54 "deadlock in %s (mode:0x%x)\n", 54 "deadlock in %s (mode:0x%x)\n",
55 __func__, lflags); 55 __func__, lflags);
56 congestion_wait(WRITE, HZ/50); 56 congestion_wait(BLK_RW_ASYNC, HZ/50);
57 } while (1); 57 } while (1);
58} 58}
59 59
@@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
130 printk(KERN_ERR "XFS: possible memory allocation " 130 printk(KERN_ERR "XFS: possible memory allocation "
131 "deadlock in %s (mode:0x%x)\n", 131 "deadlock in %s (mode:0x%x)\n",
132 __func__, lflags); 132 __func__, lflags);
133 congestion_wait(WRITE, HZ/50); 133 congestion_wait(BLK_RW_ASYNC, HZ/50);
134 } while (1); 134 } while (1);
135} 135}
136 136
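Both kmem.c hunks change the congestion_wait() queue argument from WRITE to BLK_RW_ASYNC; the loops they sit in retry the allocation indefinitely rather than failing the caller. A simplified sketch of that shape (example_alloc_retry and the may_fail flag are illustrative, not the real kmem_alloc signature):

	static void *example_alloc_retry(size_t size, gfp_t lflags, int may_fail)
	{
		void *ptr;

		do {
			ptr = kmalloc(size, lflags);
			if (ptr || may_fail)
				return ptr;
			printk(KERN_ERR "possible memory allocation deadlock "
					"(mode:0x%x)\n", lflags);
			congestion_wait(BLK_RW_ASYNC, HZ/50);	/* back off, retry */
		} while (1);
	}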
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
new file mode 100644
index 000000000000..b23a54506446
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (c) 2008, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_acl.h"
20#include "xfs_attr.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_inode.h"
23#include "xfs_vnodeops.h"
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26
27
28/*
29 * Locking scheme:
30 * - all ACL updates are protected by inode->i_mutex, which is taken before
31 * calling into this file.
32 */
33
34STATIC struct posix_acl *
35xfs_acl_from_disk(struct xfs_acl *aclp)
36{
37 struct posix_acl_entry *acl_e;
38 struct posix_acl *acl;
39 struct xfs_acl_entry *ace;
40 int count, i;
41
42 count = be32_to_cpu(aclp->acl_cnt);
43
44 acl = posix_acl_alloc(count, GFP_KERNEL);
45 if (!acl)
46 return ERR_PTR(-ENOMEM);
47
48 for (i = 0; i < count; i++) {
49 acl_e = &acl->a_entries[i];
50 ace = &aclp->acl_entry[i];
51
52 /*
53 * The tag is 32 bits on disk and 16 bits in core.
54 *
55 * Because every access to it goes through the core
56 * format first this is not a problem.
57 */
58 acl_e->e_tag = be32_to_cpu(ace->ae_tag);
59 acl_e->e_perm = be16_to_cpu(ace->ae_perm);
60
61 switch (acl_e->e_tag) {
62 case ACL_USER:
63 case ACL_GROUP:
64 acl_e->e_id = be32_to_cpu(ace->ae_id);
65 break;
66 case ACL_USER_OBJ:
67 case ACL_GROUP_OBJ:
68 case ACL_MASK:
69 case ACL_OTHER:
70 acl_e->e_id = ACL_UNDEFINED_ID;
71 break;
72 default:
73 goto fail;
74 }
75 }
76 return acl;
77
78fail:
79 posix_acl_release(acl);
80 return ERR_PTR(-EINVAL);
81}
82
83STATIC void
84xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
85{
86 const struct posix_acl_entry *acl_e;
87 struct xfs_acl_entry *ace;
88 int i;
89
90 aclp->acl_cnt = cpu_to_be32(acl->a_count);
91 for (i = 0; i < acl->a_count; i++) {
92 ace = &aclp->acl_entry[i];
93 acl_e = &acl->a_entries[i];
94
95 ace->ae_tag = cpu_to_be32(acl_e->e_tag);
96 ace->ae_id = cpu_to_be32(acl_e->e_id);
97 ace->ae_perm = cpu_to_be16(acl_e->e_perm);
98 }
99}
100
101struct posix_acl *
102xfs_get_acl(struct inode *inode, int type)
103{
104 struct xfs_inode *ip = XFS_I(inode);
105 struct posix_acl *acl;
106 struct xfs_acl *xfs_acl;
107 int len = sizeof(struct xfs_acl);
108 char *ea_name;
109 int error;
110
111 acl = get_cached_acl(inode, type);
112 if (acl != ACL_NOT_CACHED)
113 return acl;
114
115 switch (type) {
116 case ACL_TYPE_ACCESS:
117 ea_name = SGI_ACL_FILE;
118 break;
119 case ACL_TYPE_DEFAULT:
120 ea_name = SGI_ACL_DEFAULT;
121 break;
122 default:
123 BUG();
124 }
125
126 /*
 127 * If we have a cached ACL value just return it; no need to
 128 * go out to the disk.
129 */
130
131 xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
132 if (!xfs_acl)
133 return ERR_PTR(-ENOMEM);
134
135 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
136 if (error) {
137 /*
 138 * If the attribute doesn't exist, make sure we have a negative
 139 * cache entry; for any other error, assume it is transient and
140 * leave the cache entry as ACL_NOT_CACHED.
141 */
142 if (error == -ENOATTR) {
143 acl = NULL;
144 goto out_update_cache;
145 }
146 goto out;
147 }
148
149 acl = xfs_acl_from_disk(xfs_acl);
150 if (IS_ERR(acl))
151 goto out;
152
153 out_update_cache:
154 set_cached_acl(inode, type, acl);
155 out:
156 kfree(xfs_acl);
157 return acl;
158}
159
160STATIC int
161xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
162{
163 struct xfs_inode *ip = XFS_I(inode);
164 char *ea_name;
165 int error;
166
167 if (S_ISLNK(inode->i_mode))
168 return -EOPNOTSUPP;
169
170 switch (type) {
171 case ACL_TYPE_ACCESS:
172 ea_name = SGI_ACL_FILE;
173 break;
174 case ACL_TYPE_DEFAULT:
175 if (!S_ISDIR(inode->i_mode))
176 return acl ? -EACCES : 0;
177 ea_name = SGI_ACL_DEFAULT;
178 break;
179 default:
180 return -EINVAL;
181 }
182
183 if (acl) {
184 struct xfs_acl *xfs_acl;
185 int len;
186
187 xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
188 if (!xfs_acl)
189 return -ENOMEM;
190
191 xfs_acl_to_disk(xfs_acl, acl);
192 len = sizeof(struct xfs_acl) -
193 (sizeof(struct xfs_acl_entry) *
194 (XFS_ACL_MAX_ENTRIES - acl->a_count));
195
196 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl,
197 len, ATTR_ROOT);
198
199 kfree(xfs_acl);
200 } else {
201 /*
202 * A NULL ACL argument means we want to remove the ACL.
203 */
204 error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
205
206 /*
 207 * If the attribute didn't exist to start with, that's fine.
208 */
209 if (error == -ENOATTR)
210 error = 0;
211 }
212
213 if (!error)
214 set_cached_acl(inode, type, acl);
215 return error;
216}
217
218int
219xfs_check_acl(struct inode *inode, int mask)
220{
221 struct xfs_inode *ip = XFS_I(inode);
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 xfs_itrace_entry(ip);
226
227 /*
 228 * If there is no attribute fork, no ACL exists on this inode and
229 * we can skip the whole exercise.
230 */
231 if (!XFS_IFORK_Q(ip))
232 return -EAGAIN;
233
234 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
235 if (IS_ERR(acl))
236 return PTR_ERR(acl);
237 if (acl) {
238 error = posix_acl_permission(inode, acl, mask);
239 posix_acl_release(acl);
240 }
241
242 return error;
243}
244
245static int
246xfs_set_mode(struct inode *inode, mode_t mode)
247{
248 int error = 0;
249
250 if (mode != inode->i_mode) {
251 struct iattr iattr;
252
253 iattr.ia_valid = ATTR_MODE;
254 iattr.ia_mode = mode;
255
256 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
257 }
258
259 return error;
260}
261
262static int
263xfs_acl_exists(struct inode *inode, char *name)
264{
265 int len = sizeof(struct xfs_acl);
266
267 return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
268 ATTR_ROOT|ATTR_KERNOVAL) == 0);
269}
270
271int
272posix_acl_access_exists(struct inode *inode)
273{
274 return xfs_acl_exists(inode, SGI_ACL_FILE);
275}
276
277int
278posix_acl_default_exists(struct inode *inode)
279{
280 if (!S_ISDIR(inode->i_mode))
281 return 0;
282 return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
283}
284
285/*
286 * No need for i_mutex because the inode is not yet exposed to the VFS.
287 */
288int
289xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl)
290{
291 struct posix_acl *clone;
292 mode_t mode;
293 int error = 0, inherit = 0;
294
295 if (S_ISDIR(inode->i_mode)) {
296 error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl);
297 if (error)
298 return error;
299 }
300
301 clone = posix_acl_clone(default_acl, GFP_KERNEL);
302 if (!clone)
303 return -ENOMEM;
304
305 mode = inode->i_mode;
306 error = posix_acl_create_masq(clone, &mode);
307 if (error < 0)
308 goto out_release_clone;
309
310 /*
 311 * If posix_acl_create_masq returns a positive value, we need to
 312 * inherit a permission that can't be represented using the Unix
 313 * mode bits, and we actually need to set an ACL.
314 */
315 if (error > 0)
316 inherit = 1;
317
318 error = xfs_set_mode(inode, mode);
319 if (error)
320 goto out_release_clone;
321
322 if (inherit)
323 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
324
325 out_release_clone:
326 posix_acl_release(clone);
327 return error;
328}
329
330int
331xfs_acl_chmod(struct inode *inode)
332{
333 struct posix_acl *acl, *clone;
334 int error;
335
336 if (S_ISLNK(inode->i_mode))
337 return -EOPNOTSUPP;
338
339 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
340 if (IS_ERR(acl) || !acl)
341 return PTR_ERR(acl);
342
343 clone = posix_acl_clone(acl, GFP_KERNEL);
344 posix_acl_release(acl);
345 if (!clone)
346 return -ENOMEM;
347
348 error = posix_acl_chmod_masq(clone, inode->i_mode);
349 if (!error)
350 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
351
352 posix_acl_release(clone);
353 return error;
354}
355
356/*
357 * System xattr handlers.
358 *
359 * Currently Posix ACLs are the only system namespace extended attribute
360 * handlers supported by XFS, so we just implement the handlers here.
361 * If we ever support other system extended attributes this will need
362 * some refactoring.
363 */
364
365static int
366xfs_decode_acl(const char *name)
367{
368 if (strcmp(name, "posix_acl_access") == 0)
369 return ACL_TYPE_ACCESS;
370 else if (strcmp(name, "posix_acl_default") == 0)
371 return ACL_TYPE_DEFAULT;
372 return -EINVAL;
373}
374
375static int
376xfs_xattr_system_get(struct inode *inode, const char *name,
377 void *value, size_t size)
378{
379 struct posix_acl *acl;
380 int type, error;
381
382 type = xfs_decode_acl(name);
383 if (type < 0)
384 return type;
385
386 acl = xfs_get_acl(inode, type);
387 if (IS_ERR(acl))
388 return PTR_ERR(acl);
389 if (acl == NULL)
390 return -ENODATA;
391
392 error = posix_acl_to_xattr(acl, value, size);
393 posix_acl_release(acl);
394
395 return error;
396}
397
398static int
399xfs_xattr_system_set(struct inode *inode, const char *name,
400 const void *value, size_t size, int flags)
401{
402 struct posix_acl *acl = NULL;
403 int error = 0, type;
404
405 type = xfs_decode_acl(name);
406 if (type < 0)
407 return type;
408 if (flags & XATTR_CREATE)
409 return -EINVAL;
410 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
411 return value ? -EACCES : 0;
412 if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
413 return -EPERM;
414
415 if (!value)
416 goto set_acl;
417
418 acl = posix_acl_from_xattr(value, size);
419 if (!acl) {
420 /*
421 * acl_set_file(3) may request that we set default ACLs with
422 * zero length -- defend (gracefully) against that here.
423 */
424 goto out;
425 }
426 if (IS_ERR(acl)) {
427 error = PTR_ERR(acl);
428 goto out;
429 }
430
431 error = posix_acl_valid(acl);
432 if (error)
433 goto out_release;
434
435 error = -EINVAL;
436 if (acl->a_count > XFS_ACL_MAX_ENTRIES)
437 goto out_release;
438
439 if (type == ACL_TYPE_ACCESS) {
440 mode_t mode = inode->i_mode;
441 error = posix_acl_equiv_mode(acl, &mode);
442
443 if (error <= 0) {
444 posix_acl_release(acl);
445 acl = NULL;
446
447 if (error < 0)
448 return error;
449 }
450
451 error = xfs_set_mode(inode, mode);
452 if (error)
453 goto out_release;
454 }
455
456 set_acl:
457 error = xfs_set_acl(inode, type, acl);
458 out_release:
459 posix_acl_release(acl);
460 out:
461 return error;
462}
463
464struct xattr_handler xfs_xattr_system_handler = {
465 .prefix = XATTR_SYSTEM_PREFIX,
466 .get = xfs_xattr_system_get,
467 .set = xfs_xattr_system_set,
468};
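One non-obvious step in xfs_set_acl() above is the on-disk length computation. Restated as a standalone helper (example_acl_disk_len is an illustrative name, not in the patch): struct xfs_acl is declared with room for XFS_ACL_MAX_ENTRIES entries, so an ACL carrying only a_count of them occupies the full struct minus the unused tail.

	static size_t example_acl_disk_len(int a_count)
	{
		/* header plus a_count entries; the rest of the array is unused */
		return sizeof(struct xfs_acl) -
		       sizeof(struct xfs_acl_entry) *
		       (XFS_ACL_MAX_ENTRIES - a_count);
	}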
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7ec89fc05b2b..aecf2519db76 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1268,6 +1268,14 @@ xfs_vm_writepage(
1268 if (!page_has_buffers(page)) 1268 if (!page_has_buffers(page))
1269 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 1269 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1270 1270
1271
1272 /*
1273 * VM calculation for nr_to_write seems off. Bump it way
 1274 * up; this gets simple streaming writes zippy again.
1275 * To be reviewed again after Jens' writeback changes.
1276 */
1277 wbc->nr_to_write *= 4;
1278
1271 /* 1279 /*
1272 * Convert delayed allocate, unwritten or unmapped space 1280 * Convert delayed allocate, unwritten or unmapped space
1273 * to real space and flush out to disk. 1281 * to real space and flush out to disk.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a9f2b5..965df1227d64 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -412,7 +412,7 @@ _xfs_buf_lookup_pages(
412 412
413 XFS_STATS_INC(xb_page_retries); 413 XFS_STATS_INC(xb_page_retries);
414 xfsbufd_wakeup(0, gfp_mask); 414 xfsbufd_wakeup(0, gfp_mask);
415 congestion_wait(WRITE, HZ/50); 415 congestion_wait(BLK_RW_ASYNC, HZ/50);
416 goto retry; 416 goto retry;
417 } 417 }
418 418
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
770 bp->b_pages = NULL; 770 bp->b_pages = NULL;
771 bp->b_addr = mem; 771 bp->b_addr = mem;
772 772
773 rval = _xfs_buf_get_pages(bp, page_count, 0); 773 rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
774 if (rval) 774 if (rval)
775 return rval; 775 return rval;
776 776
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
1501 struct block_device *bdev) 1501 struct block_device *bdev)
1502{ 1502{
1503 return xfs_setsize_buftarg_flags(btp, 1503 return xfs_setsize_buftarg_flags(btp,
1504 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1504 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
1505} 1505}
1506 1506
1507int 1507int
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f4e255441574..0542fd507649 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -41,7 +41,6 @@
41#include "xfs_ioctl.h" 41#include "xfs_ioctl.h"
42 42
43#include <linux/dcache.h> 43#include <linux/dcache.h>
44#include <linux/smp_lock.h>
45 44
46static struct vm_operations_struct xfs_file_vm_ops; 45static struct vm_operations_struct xfs_file_vm_ops;
47 46
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 34eaab608e6e..5bb523d7f37e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -41,7 +41,6 @@
41#include "xfs_itable.h" 41#include "xfs_itable.h"
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_rw.h" 43#include "xfs_rw.h"
44#include "xfs_acl.h"
45#include "xfs_attr.h" 44#include "xfs_attr.h"
46#include "xfs_bmap.h" 45#include "xfs_bmap.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -899,7 +898,8 @@ xfs_ioctl_setattr(
899 struct xfs_mount *mp = ip->i_mount; 898 struct xfs_mount *mp = ip->i_mount;
900 struct xfs_trans *tp; 899 struct xfs_trans *tp;
901 unsigned int lock_flags = 0; 900 unsigned int lock_flags = 0;
902 struct xfs_dquot *udqp = NULL, *gdqp = NULL; 901 struct xfs_dquot *udqp = NULL;
902 struct xfs_dquot *gdqp = NULL;
903 struct xfs_dquot *olddquot = NULL; 903 struct xfs_dquot *olddquot = NULL;
904 int code; 904 int code;
905 905
@@ -919,7 +919,7 @@ xfs_ioctl_setattr(
919 * because the i_*dquot fields will get updated anyway. 919 * because the i_*dquot fields will get updated anyway.
920 */ 920 */
921 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 921 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
922 code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid, 922 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
923 ip->i_d.di_gid, fa->fsx_projid, 923 ip->i_d.di_gid, fa->fsx_projid,
924 XFS_QMOPT_PQUOTA, &udqp, &gdqp); 924 XFS_QMOPT_PQUOTA, &udqp, &gdqp);
925 if (code) 925 if (code)
@@ -954,10 +954,11 @@ xfs_ioctl_setattr(
954 * Do a quota reservation only if projid is actually going to change. 954 * Do a quota reservation only if projid is actually going to change.
955 */ 955 */
956 if (mask & FSX_PROJID) { 956 if (mask & FSX_PROJID) {
957 if (XFS_IS_PQUOTA_ON(mp) && 957 if (XFS_IS_QUOTA_RUNNING(mp) &&
958 XFS_IS_PQUOTA_ON(mp) &&
958 ip->i_d.di_projid != fa->fsx_projid) { 959 ip->i_d.di_projid != fa->fsx_projid) {
959 ASSERT(tp); 960 ASSERT(tp);
960 code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, 961 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
961 capable(CAP_FOWNER) ? 962 capable(CAP_FOWNER) ?
962 XFS_QMOPT_FORCE_RES : 0); 963 XFS_QMOPT_FORCE_RES : 0);
963 if (code) /* out of quota */ 964 if (code) /* out of quota */
@@ -1059,8 +1060,8 @@ xfs_ioctl_setattr(
1059 * in the transaction. 1060 * in the transaction.
1060 */ 1061 */
1061 if (ip->i_d.di_projid != fa->fsx_projid) { 1062 if (ip->i_d.di_projid != fa->fsx_projid) {
1062 if (XFS_IS_PQUOTA_ON(mp)) { 1063 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1063 olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip, 1064 olddquot = xfs_qm_vop_chown(tp, ip,
1064 &ip->i_gdquot, gdqp); 1065 &ip->i_gdquot, gdqp);
1065 } 1066 }
1066 ip->i_d.di_projid = fa->fsx_projid; 1067 ip->i_d.di_projid = fa->fsx_projid;
@@ -1106,9 +1107,9 @@ xfs_ioctl_setattr(
1106 /* 1107 /*
1107 * Release any dquot(s) the inode had kept before chown. 1108 * Release any dquot(s) the inode had kept before chown.
1108 */ 1109 */
1109 XFS_QM_DQRELE(mp, olddquot); 1110 xfs_qm_dqrele(olddquot);
1110 XFS_QM_DQRELE(mp, udqp); 1111 xfs_qm_dqrele(udqp);
1111 XFS_QM_DQRELE(mp, gdqp); 1112 xfs_qm_dqrele(gdqp);
1112 1113
1113 if (code) 1114 if (code)
1114 return code; 1115 return code;
@@ -1122,8 +1123,8 @@ xfs_ioctl_setattr(
1122 return 0; 1123 return 0;
1123 1124
1124 error_return: 1125 error_return:
1125 XFS_QM_DQRELE(mp, udqp); 1126 xfs_qm_dqrele(udqp);
1126 XFS_QM_DQRELE(mp, gdqp); 1127 xfs_qm_dqrele(gdqp);
1127 xfs_trans_cancel(tp, 0); 1128 xfs_trans_cancel(tp, 0);
1128 if (lock_flags) 1129 if (lock_flags)
1129 xfs_iunlock(ip, lock_flags); 1130 xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 6075382336d7..8070b34cc287 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_acl.h"
20#include "xfs_bit.h" 21#include "xfs_bit.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_inum.h" 23#include "xfs_inum.h"
@@ -51,6 +52,7 @@
51#include <linux/capability.h> 52#include <linux/capability.h>
52#include <linux/xattr.h> 53#include <linux/xattr.h>
53#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/posix_acl.h>
54#include <linux/security.h> 56#include <linux/security.h>
55#include <linux/falloc.h> 57#include <linux/falloc.h>
56#include <linux/fiemap.h> 58#include <linux/fiemap.h>
@@ -202,9 +204,8 @@ xfs_vn_mknod(
202{ 204{
203 struct inode *inode; 205 struct inode *inode;
204 struct xfs_inode *ip = NULL; 206 struct xfs_inode *ip = NULL;
205 xfs_acl_t *default_acl = NULL; 207 struct posix_acl *default_acl = NULL;
206 struct xfs_name name; 208 struct xfs_name name;
207 int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS;
208 int error; 209 int error;
209 210
210 /* 211 /*
@@ -219,18 +220,14 @@ xfs_vn_mknod(
219 rdev = 0; 220 rdev = 0;
220 } 221 }
221 222
222 if (test_default_acl && test_default_acl(dir)) { 223 if (IS_POSIXACL(dir)) {
223 if (!_ACL_ALLOC(default_acl)) { 224 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
224 return -ENOMEM; 225 if (IS_ERR(default_acl))
225 } 226 return -PTR_ERR(default_acl);
226 if (!_ACL_GET_DEFAULT(dir, default_acl)) {
227 _ACL_FREE(default_acl);
228 default_acl = NULL;
229 }
230 }
231 227
232 if (IS_POSIXACL(dir) && !default_acl) 228 if (!default_acl)
233 mode &= ~current_umask(); 229 mode &= ~current_umask();
230 }
234 231
235 xfs_dentry_to_name(&name, dentry); 232 xfs_dentry_to_name(&name, dentry);
236 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 233 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
@@ -244,10 +241,10 @@ xfs_vn_mknod(
244 goto out_cleanup_inode; 241 goto out_cleanup_inode;
245 242
246 if (default_acl) { 243 if (default_acl) {
247 error = _ACL_INHERIT(inode, mode, default_acl); 244 error = -xfs_inherit_acl(inode, default_acl);
248 if (unlikely(error)) 245 if (unlikely(error))
249 goto out_cleanup_inode; 246 goto out_cleanup_inode;
250 _ACL_FREE(default_acl); 247 posix_acl_release(default_acl);
251 } 248 }
252 249
253 250
@@ -257,8 +254,7 @@ xfs_vn_mknod(
257 out_cleanup_inode: 254 out_cleanup_inode:
258 xfs_cleanup_inode(dir, inode, dentry); 255 xfs_cleanup_inode(dir, inode, dentry);
259 out_free_acl: 256 out_free_acl:
260 if (default_acl) 257 posix_acl_release(default_acl);
261 _ACL_FREE(default_acl);
262 return -error; 258 return -error;
263} 259}
264 260
@@ -488,26 +484,6 @@ xfs_vn_put_link(
488 kfree(s); 484 kfree(s);
489} 485}
490 486
491#ifdef CONFIG_XFS_POSIX_ACL
492STATIC int
493xfs_check_acl(
494 struct inode *inode,
495 int mask)
496{
497 struct xfs_inode *ip = XFS_I(inode);
498 int error;
499
500 xfs_itrace_entry(ip);
501
502 if (XFS_IFORK_Q(ip)) {
503 error = xfs_acl_iaccess(ip, mask, NULL);
504 if (error != -1)
505 return -error;
506 }
507
508 return -EAGAIN;
509}
510
511STATIC int 487STATIC int
512xfs_vn_permission( 488xfs_vn_permission(
513 struct inode *inode, 489 struct inode *inode,
@@ -515,9 +491,6 @@ xfs_vn_permission(
515{ 491{
516 return generic_permission(inode, mask, xfs_check_acl); 492 return generic_permission(inode, mask, xfs_check_acl);
517} 493}
518#else
519#define xfs_vn_permission NULL
520#endif
521 494
522STATIC int 495STATIC int
523xfs_vn_getattr( 496xfs_vn_getattr(
@@ -707,8 +680,8 @@ xfs_vn_fiemap(
707 else 680 else
708 bm.bmv_length = BTOBB(length); 681 bm.bmv_length = BTOBB(length);
709 682
710 /* our formatter will tell xfs_getbmap when to stop. */ 683 /* We add one because in getbmap world count includes the header */
711 bm.bmv_count = MAXEXTNUM; 684 bm.bmv_count = fieinfo->fi_extents_max + 1;
712 bm.bmv_iflags = BMV_IF_PREALLOC; 685 bm.bmv_iflags = BMV_IF_PREALLOC;
713 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 686 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
714 bm.bmv_iflags |= BMV_IF_ATTRFORK; 687 bm.bmv_iflags |= BMV_IF_ATTRFORK;
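The bmv_count change above encodes the getbmap convention that the count includes a header slot, so holding fi_extents_max user extents needs one extra. A one-line sketch of that sizing (example_bmv_count is illustrative):

	static inline int example_bmv_count(unsigned int fi_extents_max)
	{
		return fi_extents_max + 1;	/* slot 0 is the getbmap header */
	}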
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index f65a53f8752f..6127e24062d0 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -24,7 +24,7 @@
24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. 25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
26 */ 26 */
27#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 27#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
28# define XFS_BIG_BLKNOS 1 28# define XFS_BIG_BLKNOS 1
29# define XFS_BIG_INUMS 1 29# define XFS_BIG_INUMS 1
30#else 30#else
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 9142192ccbe6..7078974a6eee 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 94d9a633d3d9..cb6e2cca214f 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -50,9 +50,11 @@ xfs_fs_quota_sync(
50{ 50{
51 struct xfs_mount *mp = XFS_M(sb); 51 struct xfs_mount *mp = XFS_M(sb);
52 52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
53 if (!XFS_IS_QUOTA_RUNNING(mp)) 55 if (!XFS_IS_QUOTA_RUNNING(mp))
54 return -ENOSYS; 56 return -ENOSYS;
55 return -xfs_sync_inodes(mp, SYNC_DELWRI); 57 return -xfs_sync_data(mp, 0);
56} 58}
57 59
58STATIC int 60STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bb685269f832..a220d36f789b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -43,7 +43,6 @@
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_fsops.h" 44#include "xfs_fsops.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_acl.h"
47#include "xfs_attr.h" 46#include "xfs_attr.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
49#include "xfs_utils.h" 48#include "xfs_utils.h"
@@ -405,6 +404,14 @@ xfs_parseargs(
405 return EINVAL; 404 return EINVAL;
406 } 405 }
407 406
407#ifndef CONFIG_XFS_QUOTA
408 if (XFS_IS_QUOTA_RUNNING(mp)) {
409 cmn_err(CE_WARN,
410 "XFS: quota support not available in this kernel.");
411 return EINVAL;
412 }
413#endif
414
408 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 415 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
409 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 416 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
410 cmn_err(CE_WARN, 417 cmn_err(CE_WARN,
@@ -609,7 +616,7 @@ xfs_max_file_offset(
609 */ 616 */
610 617
611#if BITS_PER_LONG == 32 618#if BITS_PER_LONG == 32
612# if defined(CONFIG_LBD) 619# if defined(CONFIG_LBDAF)
613 ASSERT(sizeof(sector_t) == 8); 620 ASSERT(sizeof(sector_t) == 8);
614 pagefactor = PAGE_CACHE_SIZE; 621 pagefactor = PAGE_CACHE_SIZE;
615 bitshift = BITS_PER_LONG; 622 bitshift = BITS_PER_LONG;
@@ -1063,7 +1070,18 @@ xfs_fs_put_super(
1063 int unmount_event_flags = 0; 1070 int unmount_event_flags = 0;
1064 1071
1065 xfs_syncd_stop(mp); 1072 xfs_syncd_stop(mp);
1066 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI); 1073
1074 if (!(sb->s_flags & MS_RDONLY)) {
1075 /*
1076 * XXX(hch): this should be SYNC_WAIT.
1077 *
1078 * Or more likely not needed at all because the VFS is already
1079 * calling ->sync_fs after shutting down all filestem
1080 * operations and just before calling ->put_super.
1081 */
1082 xfs_sync_data(mp, 0);
1083 xfs_sync_attr(mp, 0);
1084 }
1067 1085
1068#ifdef HAVE_DMAPI 1086#ifdef HAVE_DMAPI
1069 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1087 if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1098,21 +1116,11 @@ xfs_fs_put_super(
1098 xfs_freesb(mp); 1116 xfs_freesb(mp);
1099 xfs_icsb_destroy_counters(mp); 1117 xfs_icsb_destroy_counters(mp);
1100 xfs_close_devices(mp); 1118 xfs_close_devices(mp);
1101 xfs_qmops_put(mp);
1102 xfs_dmops_put(mp); 1119 xfs_dmops_put(mp);
1103 xfs_free_fsname(mp); 1120 xfs_free_fsname(mp);
1104 kfree(mp); 1121 kfree(mp);
1105} 1122}
1106 1123
1107STATIC void
1108xfs_fs_write_super(
1109 struct super_block *sb)
1110{
1111 if (!(sb->s_flags & MS_RDONLY))
1112 xfs_sync_fsdata(XFS_M(sb), 0);
1113 sb->s_dirt = 0;
1114}
1115
1116STATIC int 1124STATIC int
1117xfs_fs_sync_super( 1125xfs_fs_sync_super(
1118 struct super_block *sb, 1126 struct super_block *sb,
@@ -1137,7 +1145,6 @@ xfs_fs_sync_super(
1137 error = xfs_quiesce_data(mp); 1145 error = xfs_quiesce_data(mp);
1138 else 1146 else
1139 error = xfs_sync_fsdata(mp, 0); 1147 error = xfs_sync_fsdata(mp, 0);
1140 sb->s_dirt = 0;
1141 1148
1142 if (unlikely(laptop_mode)) { 1149 if (unlikely(laptop_mode)) {
1143 int prev_sync_seq = mp->m_sync_seq; 1150 int prev_sync_seq = mp->m_sync_seq;
@@ -1168,6 +1175,7 @@ xfs_fs_statfs(
1168{ 1175{
1169 struct xfs_mount *mp = XFS_M(dentry->d_sb); 1176 struct xfs_mount *mp = XFS_M(dentry->d_sb);
1170 xfs_sb_t *sbp = &mp->m_sb; 1177 xfs_sb_t *sbp = &mp->m_sb;
1178 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1171 __uint64_t fakeinos, id; 1179 __uint64_t fakeinos, id;
1172 xfs_extlen_t lsize; 1180 xfs_extlen_t lsize;
1173 1181
@@ -1196,7 +1204,10 @@ xfs_fs_statfs(
1196 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1204 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1197 spin_unlock(&mp->m_sb_lock); 1205 spin_unlock(&mp->m_sb_lock);
1198 1206
1199 XFS_QM_DQSTATVFS(XFS_I(dentry->d_inode), statp); 1207 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
1208 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
1209 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
1210 xfs_qm_statvfs(ip, statp);
1200 return 0; 1211 return 0;
1201} 1212}
1202 1213
@@ -1404,16 +1415,13 @@ xfs_fs_fill_super(
1404 error = xfs_dmops_get(mp); 1415 error = xfs_dmops_get(mp);
1405 if (error) 1416 if (error)
1406 goto out_free_fsname; 1417 goto out_free_fsname;
1407 error = xfs_qmops_get(mp);
1408 if (error)
1409 goto out_put_dmops;
1410 1418
1411 if (silent) 1419 if (silent)
1412 flags |= XFS_MFSI_QUIET; 1420 flags |= XFS_MFSI_QUIET;
1413 1421
1414 error = xfs_open_devices(mp); 1422 error = xfs_open_devices(mp);
1415 if (error) 1423 if (error)
1416 goto out_put_qmops; 1424 goto out_put_dmops;
1417 1425
1418 if (xfs_icsb_init_counters(mp)) 1426 if (xfs_icsb_init_counters(mp))
1419 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1427 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1443,7 +1451,6 @@ xfs_fs_fill_super(
1443 1451
1444 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); 1452 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1445 1453
1446 sb->s_dirt = 1;
1447 sb->s_magic = XFS_SB_MAGIC; 1454 sb->s_magic = XFS_SB_MAGIC;
1448 sb->s_blocksize = mp->m_sb.sb_blocksize; 1455 sb->s_blocksize = mp->m_sb.sb_blocksize;
1449 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1456 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1482,8 +1489,6 @@ xfs_fs_fill_super(
1482 out_destroy_counters: 1489 out_destroy_counters:
1483 xfs_icsb_destroy_counters(mp); 1490 xfs_icsb_destroy_counters(mp);
1484 xfs_close_devices(mp); 1491 xfs_close_devices(mp);
1485 out_put_qmops:
1486 xfs_qmops_put(mp);
1487 out_put_dmops: 1492 out_put_dmops:
1488 xfs_dmops_put(mp); 1493 xfs_dmops_put(mp);
1489 out_free_fsname: 1494 out_free_fsname:
@@ -1533,7 +1538,6 @@ static struct super_operations xfs_super_operations = {
1533 .write_inode = xfs_fs_write_inode, 1538 .write_inode = xfs_fs_write_inode,
1534 .clear_inode = xfs_fs_clear_inode, 1539 .clear_inode = xfs_fs_clear_inode,
1535 .put_super = xfs_fs_put_super, 1540 .put_super = xfs_fs_put_super,
1536 .write_super = xfs_fs_write_super,
1537 .sync_fs = xfs_fs_sync_super, 1541 .sync_fs = xfs_fs_sync_super,
1538 .freeze_fs = xfs_fs_freeze, 1542 .freeze_fs = xfs_fs_freeze,
1539 .statfs = xfs_fs_statfs, 1543 .statfs = xfs_fs_statfs,
@@ -1718,18 +1722,8 @@ xfs_init_zones(void)
1718 if (!xfs_ili_zone) 1722 if (!xfs_ili_zone)
1719 goto out_destroy_inode_zone; 1723 goto out_destroy_inode_zone;
1720 1724
1721#ifdef CONFIG_XFS_POSIX_ACL
1722 xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl");
1723 if (!xfs_acl_zone)
1724 goto out_destroy_ili_zone;
1725#endif
1726
1727 return 0; 1725 return 0;
1728 1726
1729#ifdef CONFIG_XFS_POSIX_ACL
1730 out_destroy_ili_zone:
1731#endif
1732 kmem_zone_destroy(xfs_ili_zone);
1733 out_destroy_inode_zone: 1727 out_destroy_inode_zone:
1734 kmem_zone_destroy(xfs_inode_zone); 1728 kmem_zone_destroy(xfs_inode_zone);
1735 out_destroy_efi_zone: 1729 out_destroy_efi_zone:
@@ -1763,9 +1757,6 @@ xfs_init_zones(void)
1763STATIC void 1757STATIC void
1764xfs_destroy_zones(void) 1758xfs_destroy_zones(void)
1765{ 1759{
1766#ifdef CONFIG_XFS_POSIX_ACL
1767 kmem_zone_destroy(xfs_acl_zone);
1768#endif
1769 kmem_zone_destroy(xfs_ili_zone); 1760 kmem_zone_destroy(xfs_ili_zone);
1770 kmem_zone_destroy(xfs_inode_zone); 1761 kmem_zone_destroy(xfs_inode_zone);
1771 kmem_zone_destroy(xfs_efi_zone); 1762 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f7ba76633c29..98ef624d9baf 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -43,166 +43,267 @@
43#include "xfs_buf_item.h" 43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_quota.h"
46 47
47#include <linux/kthread.h> 48#include <linux/kthread.h>
48#include <linux/freezer.h> 49#include <linux/freezer.h>
49 50
50/*
51 * Sync all the inodes in the given AG according to the
52 * direction given by the flags.
53 */
54STATIC int
55xfs_sync_inodes_ag(
56 xfs_mount_t *mp,
57 int ag,
58 int flags)
59{
60 xfs_perag_t *pag = &mp->m_perag[ag];
61 int nr_found;
62 uint32_t first_index = 0;
63 int error = 0;
64 int last_error = 0;
65 51
66 do { 52STATIC xfs_inode_t *
67 struct inode *inode; 53xfs_inode_ag_lookup(
68 xfs_inode_t *ip = NULL; 54 struct xfs_mount *mp,
69 int lock_flags = XFS_ILOCK_SHARED; 55 struct xfs_perag *pag,
56 uint32_t *first_index,
57 int tag)
58{
59 int nr_found;
60 struct xfs_inode *ip;
70 61
71 /* 62 /*
72 * use a gang lookup to find the next inode in the tree 63 * use a gang lookup to find the next inode in the tree
73 * as the tree is sparse and a gang lookup walks to find 64 * as the tree is sparse and a gang lookup walks to find
74 * the number of objects requested. 65 * the number of objects requested.
75 */ 66 */
76 read_lock(&pag->pag_ici_lock); 67 read_lock(&pag->pag_ici_lock);
68 if (tag == XFS_ICI_NO_TAG) {
77 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
78 (void**)&ip, first_index, 1); 70 (void **)&ip, *first_index, 1);
71 } else {
72 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
73 (void **)&ip, *first_index, 1, tag);
74 }
75 if (!nr_found)
76 goto unlock;
79 77
80 if (!nr_found) { 78 /*
81 read_unlock(&pag->pag_ici_lock); 79 * Update the index for the next lookup. Catch overflows
82 break; 80 * into the next AG range which can occur if we have inodes
83 } 81 * in the last block of the AG and we are currently
82 * pointing to the last inode.
83 */
84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
86 goto unlock;
84 87
85 /* 88 return ip;
86 * Update the index for the next lookup. Catch overflows
87 * into the next AG range which can occur if we have inodes
88 * in the last block of the AG and we are currently
89 * pointing to the last inode.
90 */
91 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
92 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
93 read_unlock(&pag->pag_ici_lock);
94 break;
95 }
96 89
97 /* nothing to sync during shutdown */ 90unlock:
98 if (XFS_FORCED_SHUTDOWN(mp)) { 91 read_unlock(&pag->pag_ici_lock);
99 read_unlock(&pag->pag_ici_lock); 92 return NULL;
100 return 0; 93}
101 }
102 94
103 /* 95STATIC int
104 * If we can't get a reference on the inode, it must be 96xfs_inode_ag_walk(
105 * in reclaim. Leave it for the reclaim code to flush. 97 struct xfs_mount *mp,
106 */ 98 xfs_agnumber_t ag,
107 inode = VFS_I(ip); 99 int (*execute)(struct xfs_inode *ip,
108 if (!igrab(inode)) { 100 struct xfs_perag *pag, int flags),
109 read_unlock(&pag->pag_ici_lock); 101 int flags,
110 continue; 102 int tag)
111 } 103{
112 read_unlock(&pag->pag_ici_lock); 104 struct xfs_perag *pag = &mp->m_perag[ag];
105 uint32_t first_index;
106 int last_error = 0;
107 int skipped;
113 108
114 /* avoid new or bad inodes */ 109restart:
115 if (is_bad_inode(inode) || 110 skipped = 0;
116 xfs_iflags_test(ip, XFS_INEW)) { 111 first_index = 0;
117 IRELE(ip); 112 do {
118 continue; 113 int error = 0;
119 } 114 xfs_inode_t *ip;
120 115
121 /* 116 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
122 * If we have to flush data or wait for I/O completion 117 if (!ip)
123 * we need to hold the iolock. 118 break;
124 */
125 if (flags & SYNC_DELWRI) {
126 if (VN_DIRTY(inode)) {
127 if (flags & SYNC_TRYLOCK) {
128 if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
129 lock_flags |= XFS_IOLOCK_SHARED;
130 } else {
131 xfs_ilock(ip, XFS_IOLOCK_SHARED);
132 lock_flags |= XFS_IOLOCK_SHARED;
133 }
134 if (lock_flags & XFS_IOLOCK_SHARED) {
135 error = xfs_flush_pages(ip, 0, -1,
136 (flags & SYNC_WAIT) ? 0
137 : XFS_B_ASYNC,
138 FI_NONE);
139 }
140 }
141 if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
142 xfs_ioend_wait(ip);
143 }
144 xfs_ilock(ip, XFS_ILOCK_SHARED);
145
146 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
147 if (flags & SYNC_WAIT) {
148 xfs_iflock(ip);
149 if (!xfs_inode_clean(ip))
150 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
151 else
152 xfs_ifunlock(ip);
153 } else if (xfs_iflock_nowait(ip)) {
154 if (!xfs_inode_clean(ip))
155 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
156 else
157 xfs_ifunlock(ip);
158 }
159 }
160 xfs_iput(ip, lock_flags);
161 119
120 error = execute(ip, pag, flags);
121 if (error == EAGAIN) {
122 skipped++;
123 continue;
124 }
162 if (error) 125 if (error)
163 last_error = error; 126 last_error = error;
164 /* 127 /*
165 * bail out if the filesystem is corrupted. 128 * bail out if the filesystem is corrupted.
166 */ 129 */
167 if (error == EFSCORRUPTED) 130 if (error == EFSCORRUPTED)
168 return XFS_ERROR(error); 131 break;
169 132
170 } while (nr_found); 133 } while (1);
171 134
135 if (skipped) {
136 delay(1);
137 goto restart;
138 }
139
140 xfs_put_perag(mp, pag);
172 return last_error; 141 return last_error;
173} 142}
174 143
175int 144int
176xfs_sync_inodes( 145xfs_inode_ag_iterator(
177 xfs_mount_t *mp, 146 struct xfs_mount *mp,
178 int flags) 147 int (*execute)(struct xfs_inode *ip,
148 struct xfs_perag *pag, int flags),
149 int flags,
150 int tag)
179{ 151{
180 int error; 152 int error = 0;
181 int last_error; 153 int last_error = 0;
182 int i; 154 xfs_agnumber_t ag;
183 int lflags = XFS_LOG_FORCE;
184 155
185 if (mp->m_flags & XFS_MOUNT_RDONLY) 156 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
186 return 0; 157 if (!mp->m_perag[ag].pag_ici_init)
187 error = 0; 158 continue;
188 last_error = 0; 159 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
160 if (error) {
161 last_error = error;
162 if (error == EFSCORRUPTED)
163 break;
164 }
165 }
166 return XFS_ERROR(last_error);
167}
168
169/* must be called with pag_ici_lock held and releases it */
170int
171xfs_sync_inode_valid(
172 struct xfs_inode *ip,
173 struct xfs_perag *pag)
174{
175 struct inode *inode = VFS_I(ip);
176
177 /* nothing to sync during shutdown */
178 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
179 read_unlock(&pag->pag_ici_lock);
180 return EFSCORRUPTED;
181 }
189 182
183 /*
184 * If we can't get a reference on the inode, it must be in reclaim.
185 * Leave it for the reclaim code to flush. Also avoid inodes that
186 * haven't been fully initialised.
187 */
188 if (!igrab(inode)) {
189 read_unlock(&pag->pag_ici_lock);
190 return ENOENT;
191 }
192 read_unlock(&pag->pag_ici_lock);
193
194 if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
195 IRELE(ip);
196 return ENOENT;
197 }
198
199 return 0;
200}
201
202STATIC int
203xfs_sync_inode_data(
204 struct xfs_inode *ip,
205 struct xfs_perag *pag,
206 int flags)
207{
208 struct inode *inode = VFS_I(ip);
209 struct address_space *mapping = inode->i_mapping;
210 int error = 0;
211
212 error = xfs_sync_inode_valid(ip, pag);
213 if (error)
214 return error;
215
216 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
217 goto out_wait;
218
219 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
220 if (flags & SYNC_TRYLOCK)
221 goto out_wait;
222 xfs_ilock(ip, XFS_IOLOCK_SHARED);
223 }
224
225 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
226 0 : XFS_B_ASYNC, FI_NONE);
227 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
228
229 out_wait:
190 if (flags & SYNC_WAIT) 230 if (flags & SYNC_WAIT)
191 lflags |= XFS_LOG_SYNC; 231 xfs_ioend_wait(ip);
232 IRELE(ip);
233 return error;
234}
192 235
193 for (i = 0; i < mp->m_sb.sb_agcount; i++) { 236STATIC int
194 if (!mp->m_perag[i].pag_ici_init) 237xfs_sync_inode_attr(
195 continue; 238 struct xfs_inode *ip,
196 error = xfs_sync_inodes_ag(mp, i, flags); 239 struct xfs_perag *pag,
197 if (error) 240 int flags)
198 last_error = error; 241{
199 if (error == EFSCORRUPTED) 242 int error = 0;
200 break; 243
244 error = xfs_sync_inode_valid(ip, pag);
245 if (error)
246 return error;
247
248 xfs_ilock(ip, XFS_ILOCK_SHARED);
249 if (xfs_inode_clean(ip))
250 goto out_unlock;
251 if (!xfs_iflock_nowait(ip)) {
252 if (!(flags & SYNC_WAIT))
253 goto out_unlock;
254 xfs_iflock(ip);
201 } 255 }
202 if (flags & SYNC_DELWRI)
203 xfs_log_force(mp, 0, lflags);
204 256
205 return XFS_ERROR(last_error); 257 if (xfs_inode_clean(ip)) {
258 xfs_ifunlock(ip);
259 goto out_unlock;
260 }
261
262 error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
263 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
264
265 out_unlock:
266 xfs_iunlock(ip, XFS_ILOCK_SHARED);
267 IRELE(ip);
268 return error;
269}
270
271/*
272 * Write out pagecache data for the whole filesystem.
273 */
274int
275xfs_sync_data(
276 struct xfs_mount *mp,
277 int flags)
278{
279 int error;
280
281 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
282
283 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
284 XFS_ICI_NO_TAG);
285 if (error)
286 return XFS_ERROR(error);
287
288 xfs_log_force(mp, 0,
289 (flags & SYNC_WAIT) ?
290 XFS_LOG_FORCE | XFS_LOG_SYNC :
291 XFS_LOG_FORCE);
292 return 0;
293}
294
295/*
296 * Write out inode metadata (attributes) for the whole filesystem.
297 */
298int
299xfs_sync_attr(
300 struct xfs_mount *mp,
301 int flags)
302{
303 ASSERT((flags & ~SYNC_WAIT) == 0);
304
305 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
306 XFS_ICI_NO_TAG);
206} 307}
207 308
208STATIC int 309STATIC int
@@ -252,7 +353,7 @@ xfs_sync_fsdata(
252 * If this is xfssyncd() then only sync the superblock if we can 353 * If this is xfssyncd() then only sync the superblock if we can
253 * lock it without sleeping and it is not pinned. 354 * lock it without sleeping and it is not pinned.
254 */ 355 */
255 if (flags & SYNC_BDFLUSH) { 356 if (flags & SYNC_TRYLOCK) {
256 ASSERT(!(flags & SYNC_WAIT)); 357 ASSERT(!(flags & SYNC_WAIT));
257 358
258 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 359 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
@@ -316,13 +417,13 @@ xfs_quiesce_data(
316 int error; 417 int error;
317 418
318 /* push non-blocking */ 419 /* push non-blocking */
319 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH); 420 xfs_sync_data(mp, 0);
320 XFS_QM_DQSYNC(mp, SYNC_BDFLUSH); 421 xfs_qm_sync(mp, SYNC_TRYLOCK);
321 xfs_filestream_flush(mp); 422 xfs_filestream_flush(mp);
322 423
323 /* push and block */ 424 /* push and block */
324 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT); 425 xfs_sync_data(mp, SYNC_WAIT);
325 XFS_QM_DQSYNC(mp, SYNC_WAIT); 426 xfs_qm_sync(mp, SYNC_WAIT);
326 427
327 /* write superblock and hoover up shutdown errors */ 428 /* write superblock and hoover up shutdown errors */
328 error = xfs_sync_fsdata(mp, 0); 429 error = xfs_sync_fsdata(mp, 0);
@@ -341,7 +442,7 @@ xfs_quiesce_fs(
341 int count = 0, pincount; 442 int count = 0, pincount;
342 443
343 xfs_flush_buftarg(mp->m_ddev_targp, 0); 444 xfs_flush_buftarg(mp->m_ddev_targp, 0);
344 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 445 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
345 446
346 /* 447 /*
347 * This loop must run at least twice. The first instance of the loop 448 * This loop must run at least twice. The first instance of the loop
@@ -350,7 +451,7 @@ xfs_quiesce_fs(
350 * logged before we can write the unmount record. 451 * logged before we can write the unmount record.
351 */ 452 */
352 do { 453 do {
353 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT); 454 xfs_sync_attr(mp, SYNC_WAIT);
354 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 455 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
355 if (!pincount) { 456 if (!pincount) {
356 delay(50); 457 delay(50);
@@ -433,8 +534,8 @@ xfs_flush_inodes_work(
433 void *arg) 534 void *arg)
434{ 535{
435 struct inode *inode = arg; 536 struct inode *inode = arg;
436 xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK); 537 xfs_sync_data(mp, SYNC_TRYLOCK);
437 xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT); 538 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
438 iput(inode); 539 iput(inode);
439} 540}
440 541
@@ -465,10 +566,10 @@ xfs_sync_worker(
465 566
466 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 567 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
467 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 568 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
468 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 569 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
469 /* dgc: errors ignored here */ 570 /* dgc: errors ignored here */
470 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH); 571 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
471 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH); 572 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
472 if (xfs_log_need_covered(mp)) 573 if (xfs_log_need_covered(mp))
473 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); 574 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
474 } 575 }
@@ -569,7 +670,7 @@ xfs_reclaim_inode(
569 xfs_ifunlock(ip); 670 xfs_ifunlock(ip);
570 xfs_iunlock(ip, XFS_ILOCK_EXCL); 671 xfs_iunlock(ip, XFS_ILOCK_EXCL);
571 } 672 }
572 return 1; 673 return -EAGAIN;
573 } 674 }
574 __xfs_iflags_set(ip, XFS_IRECLAIM); 675 __xfs_iflags_set(ip, XFS_IRECLAIM);
575 spin_unlock(&ip->i_flags_lock); 676 spin_unlock(&ip->i_flags_lock);
@@ -607,6 +708,16 @@ xfs_reclaim_inode(
607 return 0; 708 return 0;
608} 709}
609 710
711void
712__xfs_inode_set_reclaim_tag(
713 struct xfs_perag *pag,
714 struct xfs_inode *ip)
715{
716 radix_tree_tag_set(&pag->pag_ici_root,
717 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
718 XFS_ICI_RECLAIM_TAG);
719}
720
610/* 721/*
611 * We set the inode flag atomically with the radix tree tag. 722 * We set the inode flag atomically with the radix tree tag.
612 * Once we get tag lookups on the radix tree, this inode flag 723 * Once we get tag lookups on the radix tree, this inode flag
@@ -621,8 +732,7 @@ xfs_inode_set_reclaim_tag(
621 732
622 read_lock(&pag->pag_ici_lock); 733 read_lock(&pag->pag_ici_lock);
623 spin_lock(&ip->i_flags_lock); 734 spin_lock(&ip->i_flags_lock);
624 radix_tree_tag_set(&pag->pag_ici_root, 735 __xfs_inode_set_reclaim_tag(pag, ip);
625 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
626 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 736 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
627 spin_unlock(&ip->i_flags_lock); 737 spin_unlock(&ip->i_flags_lock);
628 read_unlock(&pag->pag_ici_lock); 738 read_unlock(&pag->pag_ici_lock);
@@ -654,101 +764,27 @@ xfs_inode_clear_reclaim_tag(
654 xfs_put_perag(mp, pag); 764 xfs_put_perag(mp, pag);
655} 765}
656 766
657 767STATIC int
658STATIC void 768xfs_reclaim_inode_now(
659xfs_reclaim_inodes_ag( 769 struct xfs_inode *ip,
660 xfs_mount_t *mp, 770 struct xfs_perag *pag,
661 int ag, 771 int flags)
662 int noblock,
663 int mode)
664{ 772{
665 xfs_inode_t *ip = NULL; 773 /* ignore if already under reclaim */
666 xfs_perag_t *pag = &mp->m_perag[ag]; 774 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
667 int nr_found;
668 uint32_t first_index;
669 int skipped;
670
671restart:
672 first_index = 0;
673 skipped = 0;
674 do {
675 /*
676 * use a gang lookup to find the next inode in the tree
677 * as the tree is sparse and a gang lookup walks to find
678 * the number of objects requested.
679 */
680 read_lock(&pag->pag_ici_lock);
681 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
682 (void**)&ip, first_index, 1,
683 XFS_ICI_RECLAIM_TAG);
684
685 if (!nr_found) {
686 read_unlock(&pag->pag_ici_lock);
687 break;
688 }
689
690 /*
691 * Update the index for the next lookup. Catch overflows
692 * into the next AG range which can occur if we have inodes
693 * in the last block of the AG and we are currently
694 * pointing to the last inode.
695 */
696 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
697 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
698 read_unlock(&pag->pag_ici_lock);
699 break;
700 }
701
702 /* ignore if already under reclaim */
703 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
704 read_unlock(&pag->pag_ici_lock);
705 continue;
706 }
707
708 if (noblock) {
709 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
710 read_unlock(&pag->pag_ici_lock);
711 continue;
712 }
713 if (xfs_ipincount(ip) ||
714 !xfs_iflock_nowait(ip)) {
715 xfs_iunlock(ip, XFS_ILOCK_EXCL);
716 read_unlock(&pag->pag_ici_lock);
717 continue;
718 }
719 }
720 read_unlock(&pag->pag_ici_lock); 775 read_unlock(&pag->pag_ici_lock);
721 776 return 0;
722 /*
723 * hmmm - this is an inode already in reclaim. Do
724 * we even bother catching it here?
725 */
726 if (xfs_reclaim_inode(ip, noblock, mode))
727 skipped++;
728 } while (nr_found);
729
730 if (skipped) {
731 delay(1);
732 goto restart;
733 } 777 }
734 return; 778 read_unlock(&pag->pag_ici_lock);
735 779
780 return xfs_reclaim_inode(ip, 0, flags);
736} 781}
737 782
738int 783int
739xfs_reclaim_inodes( 784xfs_reclaim_inodes(
740 xfs_mount_t *mp, 785 xfs_mount_t *mp,
741 int noblock,
742 int mode) 786 int mode)
743{ 787{
744 int i; 788 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
745 789 XFS_ICI_RECLAIM_TAG);
746 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
747 if (!mp->m_perag[i].pag_ici_init)
748 continue;
749 xfs_reclaim_inodes_ag(mp, i, noblock, mode);
750 }
751 return 0;
752} 790}
753
754
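The refactor above replaces the old monolithic per-AG loops with a generic xfs_inode_ag_iterator() driven by small execute() callbacks. A sketch of the callback contract as the new code defines it (example_execute is hypothetical): the lookup returns with pag_ici_lock held, xfs_sync_inode_valid() validates the inode and drops that lock, and the callback owns the inode reference it takes.

	STATIC int
	example_execute(
		struct xfs_inode	*ip,
		struct xfs_perag	*pag,
		int			flags)
	{
		int	error = xfs_sync_inode_valid(ip, pag);	/* drops pag_ici_lock */

		if (error)
			return error;
		/* ... per-inode work, mirroring xfs_sync_inode_data/attr ... */
		IRELE(ip);
		return 0;
	}

	/* usage: xfs_inode_ag_iterator(mp, example_execute, 0, XFS_ICI_NO_TAG); */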
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 308d5bf6dfbd..59120602588a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -29,17 +29,14 @@ typedef struct xfs_sync_work {
29 struct completion *w_completion; 29 struct completion *w_completion;
30} xfs_sync_work_t; 30} xfs_sync_work_t;
31 31
32#define SYNC_ATTR 0x0001 /* sync attributes */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_DELWRI 0x0002 /* look at delayed writes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
35#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
36#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
37#define SYNC_TRYLOCK 0x0020 /* only try to lock inodes */
38 34
39int xfs_syncd_init(struct xfs_mount *mp); 35int xfs_syncd_init(struct xfs_mount *mp);
40void xfs_syncd_stop(struct xfs_mount *mp); 36void xfs_syncd_stop(struct xfs_mount *mp);
41 37
42int xfs_sync_inodes(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags);
43int xfs_sync_fsdata(struct xfs_mount *mp, int flags); 40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
44 41
45int xfs_quiesce_data(struct xfs_mount *mp); 42int xfs_quiesce_data(struct xfs_mount *mp);
@@ -48,10 +45,17 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
48void xfs_flush_inodes(struct xfs_inode *ip); 45void xfs_flush_inodes(struct xfs_inode *ip);
49 46
50int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); 47int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
51int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode); 48int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
52 49
53void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
54void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); 52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
55void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
56 struct xfs_inode *ip); 54 struct xfs_inode *ip);
55
56int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
57int xfs_inode_ag_iterator(struct xfs_mount *mp,
58 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
59 int flags, int tag);
60
57#endif 61#endif
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 964621fde6ed..497c7fb75cc1 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -29,67 +29,6 @@
29#include <linux/xattr.h> 29#include <linux/xattr.h>
30 30
31 31
32/*
33 * ACL handling. Should eventually be moved into xfs_acl.c
34 */
35
36static int
37xfs_decode_acl(const char *name)
38{
39 if (strcmp(name, "posix_acl_access") == 0)
40 return _ACL_TYPE_ACCESS;
41 else if (strcmp(name, "posix_acl_default") == 0)
42 return _ACL_TYPE_DEFAULT;
43 return -EINVAL;
44}
45
46/*
47 * Get system extended attributes which at the moment only
48 * includes Posix ACLs.
49 */
50static int
51xfs_xattr_system_get(struct inode *inode, const char *name,
52 void *buffer, size_t size)
53{
54 int acl;
55
56 acl = xfs_decode_acl(name);
57 if (acl < 0)
58 return acl;
59
60 return xfs_acl_vget(inode, buffer, size, acl);
61}
62
63static int
64xfs_xattr_system_set(struct inode *inode, const char *name,
65 const void *value, size_t size, int flags)
66{
67 int acl;
68
69 acl = xfs_decode_acl(name);
70 if (acl < 0)
71 return acl;
72 if (flags & XATTR_CREATE)
73 return -EINVAL;
74
75 if (!value)
76 return xfs_acl_vremove(inode, acl);
77
78 return xfs_acl_vset(inode, (void *)value, size, acl);
79}
80
81static struct xattr_handler xfs_xattr_system_handler = {
82 .prefix = XATTR_SYSTEM_PREFIX,
83 .get = xfs_xattr_system_get,
84 .set = xfs_xattr_system_set,
85};
86
87
88/*
89 * Real xattr handling. The only difference between the namespaces is
90 * a flag passed to the low-level attr code.
91 */
92
93static int 32static int
94__xfs_xattr_get(struct inode *inode, const char *name, 33__xfs_xattr_get(struct inode *inode, const char *name,
95 void *value, size_t size, int xflags) 34 void *value, size_t size, int xflags)
@@ -199,7 +138,9 @@ struct xattr_handler *xfs_xattr_handlers[] = {
199 &xfs_xattr_user_handler, 138 &xfs_xattr_user_handler,
200 &xfs_xattr_trusted_handler, 139 &xfs_xattr_trusted_handler,
201 &xfs_xattr_security_handler, 140 &xfs_xattr_security_handler,
141#ifdef CONFIG_XFS_POSIX_ACL
202 &xfs_xattr_system_handler, 142 &xfs_xattr_system_handler,
143#endif
203 NULL 144 NULL
204}; 145};
205 146
@@ -310,7 +251,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
310 /* 251 /*
311 * Then add the two synthetic ACL attributes. 252 * Then add the two synthetic ACL attributes.
312 */ 253 */
313 if (xfs_acl_vhasacl_access(inode)) { 254 if (posix_acl_access_exists(inode)) {
314 error = list_one_attr(POSIX_ACL_XATTR_ACCESS, 255 error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
315 strlen(POSIX_ACL_XATTR_ACCESS) + 1, 256 strlen(POSIX_ACL_XATTR_ACCESS) + 1,
316 data, size, &context.count); 257 data, size, &context.count);
@@ -318,7 +259,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
318 return error; 259 return error;
319 } 260 }
320 261
321 if (xfs_acl_vhasacl_default(inode)) { 262 if (posix_acl_default_exists(inode)) {
322 error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, 263 error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
323 strlen(POSIX_ACL_XATTR_DEFAULT) + 1, 264 strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
324 data, size, &context.count); 265 data, size, &context.count);
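
With CONFIG_XFS_POSIX_ACL now gating the system-namespace handler, the xfs_xattr_handlers[] table gains a conditional slot. A minimal userspace sketch of the same shape, a NULL-terminated table of prefix handlers with one entry compiled in conditionally; WANT_POSIX_ACL stands in for the kernel config symbol and every name here is invented for illustration:

	#include <stdio.h>
	#include <string.h>

	struct xattr_handler {
		const char *prefix;
		int (*get)(const char *name, void *buf, size_t size);
	};

	static int user_get(const char *name, void *buf, size_t size)
	{
		(void)buf; (void)size;
		printf("user.%s\n", name);
		return 0;
	}

	#ifdef WANT_POSIX_ACL		/* stand-in for CONFIG_XFS_POSIX_ACL */
	static int system_get(const char *name, void *buf, size_t size)
	{
		(void)buf; (void)size;
		printf("system.%s (ACL)\n", name);
		return 0;
	}
	static const struct xattr_handler system_handler = { "system.", system_get };
	#endif

	static const struct xattr_handler user_handler = { "user.", user_get };

	/* NULL-terminated, like xfs_xattr_handlers[] above */
	static const struct xattr_handler *handlers[] = {
		&user_handler,
	#ifdef WANT_POSIX_ACL
		&system_handler,
	#endif
		NULL
	};

	int main(void)
	{
		const char *name = "user.comment";

		for (const struct xattr_handler **h = handlers; *h; h++) {
			size_t n = strlen((*h)->prefix);
			if (!strncmp(name, (*h)->prefix, n))
				return (*h)->get(name + n, NULL, 0);
		}
		return -1;	/* no handler matched; -EOPNOTSUPP in the kernel */
	}
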
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e4babcc63423..2f3f2229eaaf 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
@@ -1194,7 +1193,9 @@ void
1194xfs_qm_dqrele( 1193xfs_qm_dqrele(
1195 xfs_dquot_t *dqp) 1194 xfs_dquot_t *dqp)
1196{ 1195{
1197 ASSERT(dqp); 1196 if (!dqp)
1197 return;
1198
1198 xfs_dqtrace_entry(dqp, "DQRELE"); 1199 xfs_dqtrace_entry(dqp, "DQRELE");
1199 1200
1200 xfs_dqlock(dqp); 1201 xfs_dqlock(dqp);
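
The ASSERT(dqp) in xfs_qm_dqrele() becomes an early return, so a NULL dquot is now a harmless no-op in the kfree() style; the NULL-filtering wrapper xfs_qm_dqrele_null, deleted later in this patch, is thereby made redundant. A tiny sketch of the pattern, names invented:

	#include <stdlib.h>

	struct dquot { int refcount; };

	/* Release a reference; silently accept NULL so callers need no guard. */
	static void dq_release(struct dquot *dq)
	{
		if (!dq)
			return;
		if (--dq->refcount == 0)
			free(dq);
	}

	int main(void)
	{
		dq_release(NULL);	/* no-op, no crash */
		return 0;
	}
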
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index de0f402ddb4c..6533ead9b889 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -181,7 +181,6 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
181extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, 181extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
182 xfs_dqid_t, uint, uint, xfs_dquot_t **); 182 xfs_dqid_t, uint, uint, xfs_dquot_t **);
183extern void xfs_qm_dqput(xfs_dquot_t *); 183extern void xfs_qm_dqput(xfs_dquot_t *);
184extern void xfs_qm_dqrele(xfs_dquot_t *);
185extern void xfs_dqlock(xfs_dquot_t *); 184extern void xfs_dqlock(xfs_dquot_t *);
186extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); 185extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
187extern void xfs_dqunlock(xfs_dquot_t *); 186extern void xfs_dqunlock(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1728f6a7c4f5..d0d4a9a0bbd7 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_priv.h" 47#include "xfs_trans_priv.h"
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 5b6695049e00..45b1bfef7388 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_bmap.h" 43#include "xfs_bmap.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
@@ -287,11 +286,13 @@ xfs_qm_rele_quotafs_ref(
287 * Just destroy the quotainfo structure. 286 * Just destroy the quotainfo structure.
288 */ 287 */
289void 288void
290xfs_qm_unmount_quotadestroy( 289xfs_qm_unmount(
291 xfs_mount_t *mp) 290 struct xfs_mount *mp)
292{ 291{
293 if (mp->m_quotainfo) 292 if (mp->m_quotainfo) {
293 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
294 xfs_qm_destroy_quotainfo(mp); 294 xfs_qm_destroy_quotainfo(mp);
295 }
295} 296}
296 297
297 298
@@ -385,8 +386,13 @@ xfs_qm_mount_quotas(
385 if (error) { 386 if (error) {
386 xfs_fs_cmn_err(CE_WARN, mp, 387 xfs_fs_cmn_err(CE_WARN, mp,
387 "Failed to initialize disk quotas."); 388 "Failed to initialize disk quotas.");
389 return;
388 } 390 }
389 return; 391
392#ifdef QUOTADEBUG
393 if (XFS_IS_QUOTA_ON(mp))
394 xfs_qm_internalqcheck(mp);
395#endif
390} 396}
391 397
392/* 398/*
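
xfs_qm_mount_quotas() now bails out of the failure branch directly, so the QUOTADEBUG self-check added at the end runs only on success. The early-return shape, reduced to a hedged standalone sketch with invented names:

	#include <stdio.h>

	static int init_disk_quotas(void) { return 0; /* 0 = success */ }
	static void debug_selfcheck(void) { puts("quota self-check"); }

	static void mount_quotas(void)
	{
		if (init_disk_quotas() != 0) {
			fprintf(stderr, "Failed to initialize disk quotas.\n");
			return;	/* early exit, no fallthrough past the error */
		}
	#ifdef QUOTADEBUG
		debug_selfcheck();	/* success path only */
	#endif
	}

	int main(void)
	{
		mount_quotas();
		return 0;
	}
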
@@ -774,12 +780,11 @@ xfs_qm_dqattach_grouphint(
774 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON 780 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
775 * into account. 781 * into account.
776 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. 782 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
777 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
778 * Inode may get unlocked and relocked in here, and the caller must deal with 783 * Inode may get unlocked and relocked in here, and the caller must deal with
779 * the consequences. 784 * the consequences.
780 */ 785 */
781int 786int
782xfs_qm_dqattach( 787xfs_qm_dqattach_locked(
783 xfs_inode_t *ip, 788 xfs_inode_t *ip,
784 uint flags) 789 uint flags)
785{ 790{
@@ -787,17 +792,14 @@ xfs_qm_dqattach(
787 uint nquotas = 0; 792 uint nquotas = 0;
788 int error = 0; 793 int error = 0;
789 794
790 if ((! XFS_IS_QUOTA_ON(mp)) || 795 if (!XFS_IS_QUOTA_RUNNING(mp) ||
791 (! XFS_NOT_DQATTACHED(mp, ip)) || 796 !XFS_IS_QUOTA_ON(mp) ||
792 (ip->i_ino == mp->m_sb.sb_uquotino) || 797 !XFS_NOT_DQATTACHED(mp, ip) ||
793 (ip->i_ino == mp->m_sb.sb_gquotino)) 798 ip->i_ino == mp->m_sb.sb_uquotino ||
799 ip->i_ino == mp->m_sb.sb_gquotino)
794 return 0; 800 return 0;
795 801
796 ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || 802 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
797 xfs_isilocked(ip, XFS_ILOCK_EXCL));
798
799 if (! (flags & XFS_QMOPT_ILOCKED))
800 xfs_ilock(ip, XFS_ILOCK_EXCL);
801 803
802 if (XFS_IS_UQUOTA_ON(mp)) { 804 if (XFS_IS_UQUOTA_ON(mp)) {
803 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, 805 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
@@ -849,8 +851,7 @@ xfs_qm_dqattach(
849 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); 851 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
850 } 852 }
851 853
852 done: 854 done:
853
854#ifdef QUOTADEBUG 855#ifdef QUOTADEBUG
855 if (! error) { 856 if (! error) {
856 if (XFS_IS_UQUOTA_ON(mp)) 857 if (XFS_IS_UQUOTA_ON(mp))
@@ -858,15 +859,22 @@ xfs_qm_dqattach(
858 if (XFS_IS_OQUOTA_ON(mp)) 859 if (XFS_IS_OQUOTA_ON(mp))
859 ASSERT(ip->i_gdquot); 860 ASSERT(ip->i_gdquot);
860 } 861 }
862 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
861#endif 863#endif
864 return error;
865}
862 866
863 if (! (flags & XFS_QMOPT_ILOCKED)) 867int
864 xfs_iunlock(ip, XFS_ILOCK_EXCL); 868xfs_qm_dqattach(
869 struct xfs_inode *ip,
870 uint flags)
871{
872 int error;
873
874 xfs_ilock(ip, XFS_ILOCK_EXCL);
875 error = xfs_qm_dqattach_locked(ip, flags);
876 xfs_iunlock(ip, XFS_ILOCK_EXCL);
865 877
866#ifdef QUOTADEBUG
867 else
868 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
869#endif
870 return error; 878 return error;
871} 879}
872 880
@@ -896,11 +904,6 @@ xfs_qm_dqdetach(
896 } 904 }
897} 905}
898 906
899/*
900 * This is called to sync quotas. We can be told to use non-blocking
901 * semantics by either the SYNC_BDFLUSH flag or the absence of the
902 * SYNC_WAIT flag.
903 */
904int 907int
905xfs_qm_sync( 908xfs_qm_sync(
906 xfs_mount_t *mp, 909 xfs_mount_t *mp,
@@ -909,17 +912,13 @@ xfs_qm_sync(
909 int recl, restarts; 912 int recl, restarts;
910 xfs_dquot_t *dqp; 913 xfs_dquot_t *dqp;
911 uint flush_flags; 914 uint flush_flags;
912 boolean_t nowait;
913 int error; 915 int error;
914 916
915 if (! XFS_IS_QUOTA_ON(mp)) 917 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
916 return 0; 918 return 0;
917 919
920 flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
918 restarts = 0; 921 restarts = 0;
919 /*
920 * We won't block unless we are asked to.
921 */
922 nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
923 922
924 again: 923 again:
925 xfs_qm_mplist_lock(mp); 924 xfs_qm_mplist_lock(mp);
@@ -939,18 +938,10 @@ xfs_qm_sync(
939 * don't 'seem' to be dirty. i.e. don't acquire dqlock. 938 * don't 'seem' to be dirty. i.e. don't acquire dqlock.
940 * This is very similar to what xfs_sync does with inodes. 939 * This is very similar to what xfs_sync does with inodes.
941 */ 940 */
942 if (flags & SYNC_BDFLUSH) { 941 if (flags & SYNC_TRYLOCK) {
943 if (! XFS_DQ_IS_DIRTY(dqp)) 942 if (!XFS_DQ_IS_DIRTY(dqp))
944 continue; 943 continue;
945 } 944 if (!xfs_qm_dqlock_nowait(dqp))
946
947 if (nowait) {
948 /*
949 * Try to acquire the dquot lock. We are NOT out of
950 * lock order, but we just don't want to wait for this
951 * lock, unless somebody wanted us to.
952 */
953 if (! xfs_qm_dqlock_nowait(dqp))
954 continue; 945 continue;
955 } else { 946 } else {
956 xfs_dqlock(dqp); 947 xfs_dqlock(dqp);
@@ -967,7 +958,7 @@ xfs_qm_sync(
967 /* XXX a sentinel would be better */ 958 /* XXX a sentinel would be better */
968 recl = XFS_QI_MPLRECLAIMS(mp); 959 recl = XFS_QI_MPLRECLAIMS(mp);
969 if (!xfs_dqflock_nowait(dqp)) { 960 if (!xfs_dqflock_nowait(dqp)) {
970 if (nowait) { 961 if (flags & SYNC_TRYLOCK) {
971 xfs_dqunlock(dqp); 962 xfs_dqunlock(dqp);
972 continue; 963 continue;
973 } 964 }
@@ -985,7 +976,6 @@ xfs_qm_sync(
985 * Let go of the mplist lock. We don't want to hold it 976 * Let go of the mplist lock. We don't want to hold it
986 * across a disk write 977 * across a disk write
987 */ 978 */
988 flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
989 xfs_qm_mplist_unlock(mp); 979 xfs_qm_mplist_unlock(mp);
990 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); 980 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
991 error = xfs_qm_dqflush(dqp, flush_flags); 981 error = xfs_qm_dqflush(dqp, flush_flags);
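
The rewritten loop keys non-blocking behaviour off a single SYNC_TRYLOCK test instead of the old SYNC_BDFLUSH/nowait pair, and hoists the flush_flags computation out of the loop: an unlocked dirty check first, then trylock-or-skip so a non-blocking sync never stalls on a busy dquot. A stripped-down sketch using pthreads in place of the dquot locks, illustrative only:

	#include <pthread.h>
	#include <stdio.h>

	#define SYNC_TRYLOCK 0x01
	#define SYNC_WAIT    0x02

	struct item {
		pthread_mutex_t lock;
		int dirty;
	};

	static void sync_items(struct item *items, int n, int flags)
	{
		for (int i = 0; i < n; i++) {
			struct item *ip = &items[i];

			if (flags & SYNC_TRYLOCK) {
				if (!ip->dirty)
					continue;	/* cheap unlocked check first */
				if (pthread_mutex_trylock(&ip->lock))
					continue;	/* busy: skip, don't block */
			} else {
				pthread_mutex_lock(&ip->lock);
			}
			if (ip->dirty) {
				printf("flushing item %d (%s)\n", i,
				       (flags & SYNC_WAIT) ? "sync" : "delwri");
				ip->dirty = 0;
			}
			pthread_mutex_unlock(&ip->lock);
		}
	}

	int main(void)
	{
		struct item it = { PTHREAD_MUTEX_INITIALIZER, 1 };
		sync_items(&it, 1, SYNC_TRYLOCK);
		return 0;
	}
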
@@ -2319,20 +2309,20 @@ xfs_qm_write_sb_changes(
2319 */ 2309 */
2320int 2310int
2321xfs_qm_vop_dqalloc( 2311xfs_qm_vop_dqalloc(
2322 xfs_mount_t *mp, 2312 struct xfs_inode *ip,
2323 xfs_inode_t *ip, 2313 uid_t uid,
2324 uid_t uid, 2314 gid_t gid,
2325 gid_t gid, 2315 prid_t prid,
2326 prid_t prid, 2316 uint flags,
2327 uint flags, 2317 struct xfs_dquot **O_udqpp,
2328 xfs_dquot_t **O_udqpp, 2318 struct xfs_dquot **O_gdqpp)
2329 xfs_dquot_t **O_gdqpp)
2330{ 2319{
2331 int error; 2320 struct xfs_mount *mp = ip->i_mount;
2332 xfs_dquot_t *uq, *gq; 2321 struct xfs_dquot *uq, *gq;
2333 uint lockflags; 2322 int error;
2323 uint lockflags;
2334 2324
2335 if (!XFS_IS_QUOTA_ON(mp)) 2325 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
2336 return 0; 2326 return 0;
2337 2327
2338 lockflags = XFS_ILOCK_EXCL; 2328 lockflags = XFS_ILOCK_EXCL;
@@ -2346,8 +2336,8 @@ xfs_qm_vop_dqalloc(
2346 * if necessary. The dquot(s) will not be locked. 2336 * if necessary. The dquot(s) will not be locked.
2347 */ 2337 */
2348 if (XFS_NOT_DQATTACHED(mp, ip)) { 2338 if (XFS_NOT_DQATTACHED(mp, ip)) {
2349 if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC | 2339 error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
2350 XFS_QMOPT_ILOCKED))) { 2340 if (error) {
2351 xfs_iunlock(ip, lockflags); 2341 xfs_iunlock(ip, lockflags);
2352 return error; 2342 return error;
2353 } 2343 }
@@ -2469,6 +2459,7 @@ xfs_qm_vop_chown(
2469 uint bfield = XFS_IS_REALTIME_INODE(ip) ? 2459 uint bfield = XFS_IS_REALTIME_INODE(ip) ?
2470 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; 2460 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
2471 2461
2462
2472 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2463 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2473 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); 2464 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
2474 2465
@@ -2508,13 +2499,13 @@ xfs_qm_vop_chown_reserve(
2508 xfs_dquot_t *gdqp, 2499 xfs_dquot_t *gdqp,
2509 uint flags) 2500 uint flags)
2510{ 2501{
2511 int error; 2502 xfs_mount_t *mp = ip->i_mount;
2512 xfs_mount_t *mp;
2513 uint delblks, blkflags, prjflags = 0; 2503 uint delblks, blkflags, prjflags = 0;
2514 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 2504 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
2505 int error;
2506
2515 2507
2516 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2508 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2517 mp = ip->i_mount;
2518 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 2509 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
2519 2510
2520 delblks = ip->i_delayed_blks; 2511 delblks = ip->i_delayed_blks;
@@ -2582,28 +2573,23 @@ xfs_qm_vop_chown_reserve(
2582 2573
2583int 2574int
2584xfs_qm_vop_rename_dqattach( 2575xfs_qm_vop_rename_dqattach(
2585 xfs_inode_t **i_tab) 2576 struct xfs_inode **i_tab)
2586{ 2577{
2587 xfs_inode_t *ip; 2578 struct xfs_mount *mp = i_tab[0]->i_mount;
2588 int i; 2579 int i;
2589 int error;
2590 2580
2591 ip = i_tab[0]; 2581 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
2592
2593 if (! XFS_IS_QUOTA_ON(ip->i_mount))
2594 return 0; 2582 return 0;
2595 2583
2596 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { 2584 for (i = 0; (i < 4 && i_tab[i]); i++) {
2597 error = xfs_qm_dqattach(ip, 0); 2585 struct xfs_inode *ip = i_tab[i];
2598 if (error) 2586 int error;
2599 return error; 2587
2600 }
2601 for (i = 1; (i < 4 && i_tab[i]); i++) {
2602 /* 2588 /*
2603 * Watch out for duplicate entries in the table. 2589 * Watch out for duplicate entries in the table.
2604 */ 2590 */
2605 if ((ip = i_tab[i]) != i_tab[i-1]) { 2591 if (i == 0 || ip != i_tab[i-1]) {
2606 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { 2592 if (XFS_NOT_DQATTACHED(mp, ip)) {
2607 error = xfs_qm_dqattach(ip, 0); 2593 error = xfs_qm_dqattach(ip, 0);
2608 if (error) 2594 if (error)
2609 return error; 2595 return error;
@@ -2614,17 +2600,19 @@ xfs_qm_vop_rename_dqattach(
2614} 2600}
2615 2601
2616void 2602void
2617xfs_qm_vop_dqattach_and_dqmod_newinode( 2603xfs_qm_vop_create_dqattach(
2618 xfs_trans_t *tp, 2604 struct xfs_trans *tp,
2619 xfs_inode_t *ip, 2605 struct xfs_inode *ip,
2620 xfs_dquot_t *udqp, 2606 struct xfs_dquot *udqp,
2621 xfs_dquot_t *gdqp) 2607 struct xfs_dquot *gdqp)
2622{ 2608{
2623 if (!XFS_IS_QUOTA_ON(tp->t_mountp)) 2609 struct xfs_mount *mp = tp->t_mountp;
2610
2611 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
2624 return; 2612 return;
2625 2613
2626 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2614 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2627 ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); 2615 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
2628 2616
2629 if (udqp) { 2617 if (udqp) {
2630 xfs_dqlock(udqp); 2618 xfs_dqlock(udqp);
@@ -2632,7 +2620,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
2632 xfs_dqunlock(udqp); 2620 xfs_dqunlock(udqp);
2633 ASSERT(ip->i_udquot == NULL); 2621 ASSERT(ip->i_udquot == NULL);
2634 ip->i_udquot = udqp; 2622 ip->i_udquot = udqp;
2635 ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp)); 2623 ASSERT(XFS_IS_UQUOTA_ON(mp));
2636 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); 2624 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
2637 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); 2625 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
2638 } 2626 }
@@ -2642,8 +2630,8 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
2642 xfs_dqunlock(gdqp); 2630 xfs_dqunlock(gdqp);
2643 ASSERT(ip->i_gdquot == NULL); 2631 ASSERT(ip->i_gdquot == NULL);
2644 ip->i_gdquot = gdqp; 2632 ip->i_gdquot = gdqp;
2645 ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp)); 2633 ASSERT(XFS_IS_OQUOTA_ON(mp));
2646 ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ? 2634 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2647 ip->i_d.di_gid : ip->i_d.di_projid) == 2635 ip->i_d.di_gid : ip->i_d.di_projid) ==
2648 be32_to_cpu(gdqp->q_core.d_id)); 2636 be32_to_cpu(gdqp->q_core.d_id));
2649 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2637 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
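
The recurring refactor in this file is the split of xfs_qm_dqattach() into xfs_qm_dqattach_locked(), which asserts that the caller already holds the inode lock, plus a thin wrapper that takes and drops the lock itself; the old XFS_QMOPT_ILOCKED flag and its conditional locking disappear. A generic sketch of that split, with pthreads and a manual 'locked' flag standing in for the XFS inode lock and xfs_isilocked():

	#include <assert.h>
	#include <pthread.h>

	struct inode {
		pthread_mutex_t ilock;
		int locked;	/* stand-in for lockdep/xfs_isilocked() */
		int attached;
	};

	/* Caller must already hold ip->ilock. */
	static int dqattach_locked(struct inode *ip)
	{
		assert(ip->locked);
		ip->attached = 1;
		return 0;
	}

	/* Convenience wrapper for callers that don't hold the lock. */
	static int dqattach(struct inode *ip)
	{
		int error;

		pthread_mutex_lock(&ip->ilock);
		ip->locked = 1;
		error = dqattach_locked(ip);
		ip->locked = 0;
		pthread_mutex_unlock(&ip->ilock);
		return error;
	}

	int main(void)
	{
		struct inode ino = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
		return dqattach(&ino);
	}
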
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index a371954cae1b..495564b8af38 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -127,8 +127,6 @@ typedef struct xfs_quotainfo {
127} xfs_quotainfo_t; 127} xfs_quotainfo_t;
128 128
129 129
130extern xfs_dqtrxops_t xfs_trans_dquot_ops;
131
132extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 130extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
133extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 131extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
134 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 132 xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
@@ -159,17 +157,11 @@ typedef struct xfs_dquot_acct {
159#define XFS_QM_RTBWARNLIMIT 5 157#define XFS_QM_RTBWARNLIMIT 5
160 158
161extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 159extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
162extern void xfs_qm_mount_quotas(xfs_mount_t *);
163extern int xfs_qm_quotacheck(xfs_mount_t *); 160extern int xfs_qm_quotacheck(xfs_mount_t *);
164extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
165extern void xfs_qm_unmount_quotas(xfs_mount_t *);
166extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 161extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
167extern int xfs_qm_sync(xfs_mount_t *, int);
168 162
169/* dquot stuff */ 163/* dquot stuff */
170extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); 164extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
171extern int xfs_qm_dqattach(xfs_inode_t *, uint);
172extern void xfs_qm_dqdetach(xfs_inode_t *);
173extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); 165extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
174extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 166extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
175 167
@@ -183,19 +175,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
183extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
184extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
185 177
186/* vop stuff */
187extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
188 uid_t, gid_t, prid_t, uint,
189 xfs_dquot_t **, xfs_dquot_t **);
190extern void xfs_qm_vop_dqattach_and_dqmod_newinode(
191 xfs_trans_t *, xfs_inode_t *,
192 xfs_dquot_t *, xfs_dquot_t *);
193extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **);
194extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
195 xfs_dquot_t **, xfs_dquot_t *);
196extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
197 xfs_dquot_t *, xfs_dquot_t *, uint);
198
199/* list stuff */ 178/* list stuff */
200extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); 179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
201extern void xfs_qm_freelist_unlink(xfs_dquot_t *); 180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 63037c689a4b..a5346630dfae 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -42,7 +42,6 @@
42#include "xfs_rtalloc.h" 42#include "xfs_rtalloc.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_qm.h" 47#include "xfs_qm.h"
@@ -84,7 +83,7 @@ xfs_fill_statvfs_from_dquot(
84 * return a statvfs of the project, not the entire filesystem. 83 * return a statvfs of the project, not the entire filesystem.
85 * This makes such trees appear as if they are filesystems in themselves. 84 * This makes such trees appear as if they are filesystems in themselves.
86 */ 85 */
87STATIC void 86void
88xfs_qm_statvfs( 87xfs_qm_statvfs(
89 xfs_inode_t *ip, 88 xfs_inode_t *ip,
90 struct kstatfs *statp) 89 struct kstatfs *statp)
@@ -92,20 +91,13 @@ xfs_qm_statvfs(
92 xfs_mount_t *mp = ip->i_mount; 91 xfs_mount_t *mp = ip->i_mount;
93 xfs_dquot_t *dqp; 92 xfs_dquot_t *dqp;
94 93
95 if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
96 !((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
97 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
98 return;
99
100 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 94 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
101 xfs_disk_dquot_t *dp = &dqp->q_core; 95 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
102
103 xfs_fill_statvfs_from_dquot(statp, dp);
104 xfs_qm_dqput(dqp); 96 xfs_qm_dqput(dqp);
105 } 97 }
106} 98}
107 99
108STATIC int 100int
109xfs_qm_newmount( 101xfs_qm_newmount(
110 xfs_mount_t *mp, 102 xfs_mount_t *mp,
111 uint *needquotamount, 103 uint *needquotamount,
@@ -114,9 +106,6 @@ xfs_qm_newmount(
114 uint quotaondisk; 106 uint quotaondisk;
115 uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; 107 uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
116 108
117 *quotaflags = 0;
118 *needquotamount = B_FALSE;
119
120 quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && 109 quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
121 (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); 110 (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
122 111
@@ -179,66 +168,6 @@ xfs_qm_newmount(
179 return 0; 168 return 0;
180} 169}
181 170
182STATIC int
183xfs_qm_endmount(
184 xfs_mount_t *mp,
185 uint needquotamount,
186 uint quotaflags)
187{
188 if (needquotamount) {
189 ASSERT(mp->m_qflags == 0);
190 mp->m_qflags = quotaflags;
191 xfs_qm_mount_quotas(mp);
192 }
193
194#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
195 if (! (XFS_IS_QUOTA_ON(mp)))
196 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
197 else
198 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
199#endif
200
201#ifdef QUOTADEBUG
202 if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
203 cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
204#endif
205
206 return 0;
207}
208
209STATIC void
210xfs_qm_dqrele_null(
211 xfs_dquot_t *dq)
212{
213 /*
214 * Called from XFS, where we always check first for a NULL dquot.
215 */
216 if (!dq)
217 return;
218 xfs_qm_dqrele(dq);
219}
220
221
222struct xfs_qmops xfs_qmcore_xfs = {
223 .xfs_qminit = xfs_qm_newmount,
224 .xfs_qmdone = xfs_qm_unmount_quotadestroy,
225 .xfs_qmmount = xfs_qm_endmount,
226 .xfs_qmunmount = xfs_qm_unmount_quotas,
227 .xfs_dqrele = xfs_qm_dqrele_null,
228 .xfs_dqattach = xfs_qm_dqattach,
229 .xfs_dqdetach = xfs_qm_dqdetach,
230 .xfs_dqpurgeall = xfs_qm_dqpurge_all,
231 .xfs_dqvopalloc = xfs_qm_vop_dqalloc,
232 .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode,
233 .xfs_dqvoprename = xfs_qm_vop_rename_dqattach,
234 .xfs_dqvopchown = xfs_qm_vop_chown,
235 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
236 .xfs_dqstatvfs = xfs_qm_statvfs,
237 .xfs_dqsync = xfs_qm_sync,
238 .xfs_dqtrxops = &xfs_trans_dquot_ops,
239};
240EXPORT_SYMBOL(xfs_qmcore_xfs);
241
242void __init 171void __init
243xfs_qm_init(void) 172xfs_qm_init(void)
244{ 173{
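
The deleted xfs_qmcore_xfs table was a vector of function pointers through which the rest of XFS reached the quota code; with that module boundary gone, callers invoke the xfs_qm_* functions directly behind a cheap is-running test. A compressed sketch of the before/after, with invented names:

	#include <stdio.h>

	static int quotas_running = 1;

	static int qm_dqattach(int ino) { printf("attach %d\n", ino); return 0; }

	/* Old style: an ops vector the core dereferenced on every call. */
	struct qmops {
		int (*dqattach)(int ino);
	};
	static const struct qmops qmcore = { .dqattach = qm_dqattach };

	static int attach_via_ops(int ino)
	{
		return qmcore.dqattach ? qmcore.dqattach(ino) : 0;
	}

	/* New style: direct call, gated by a cheap state test. */
	static int attach_direct(int ino)
	{
		if (!quotas_running)
			return 0;
		return qm_dqattach(ino);
	}

	int main(void)
	{
		attach_via_ops(1);
		return attach_direct(2);
	}
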
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 709f5f545cf5..21b08c0396a1 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -42,7 +42,6 @@
42#include "xfs_rtalloc.h" 42#include "xfs_rtalloc.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_qm.h" 47#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c7b66f6506ce..4e4276b956e8 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -45,7 +45,6 @@
45#include "xfs_rtalloc.h" 45#include "xfs_rtalloc.h"
46#include "xfs_error.h" 46#include "xfs_error.h"
47#include "xfs_rw.h" 47#include "xfs_rw.h"
48#include "xfs_acl.h"
49#include "xfs_attr.h" 48#include "xfs_attr.h"
50#include "xfs_buf_item.h" 49#include "xfs_buf_item.h"
51#include "xfs_utils.h" 50#include "xfs_utils.h"
@@ -847,105 +846,55 @@ xfs_qm_export_flags(
847} 846}
848 847
849 848
850/* 849STATIC int
851 * Release all the dquots on the inodes in an AG. 850xfs_dqrele_inode(
852 */ 851 struct xfs_inode *ip,
853STATIC void 852 struct xfs_perag *pag,
854xfs_qm_dqrele_inodes_ag( 853 int flags)
855 xfs_mount_t *mp,
856 int ag,
857 uint flags)
858{ 854{
859 xfs_inode_t *ip = NULL; 855 int error;
860 xfs_perag_t *pag = &mp->m_perag[ag];
861 int first_index = 0;
862 int nr_found;
863
864 do {
865 /*
866 * use a gang lookup to find the next inode in the tree
867 * as the tree is sparse and a gang lookup walks to find
868 * the number of objects requested.
869 */
870 read_lock(&pag->pag_ici_lock);
871 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
872 (void**)&ip, first_index, 1);
873
874 if (!nr_found) {
875 read_unlock(&pag->pag_ici_lock);
876 break;
877 }
878
879 /*
880 * Update the index for the next lookup. Catch overflows
881 * into the next AG range which can occur if we have inodes
882 * in the last block of the AG and we are currently
883 * pointing to the last inode.
884 */
885 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
886 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
887 read_unlock(&pag->pag_ici_lock);
888 break;
889 }
890
891 /* skip quota inodes */
892 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
893 ASSERT(ip->i_udquot == NULL);
894 ASSERT(ip->i_gdquot == NULL);
895 read_unlock(&pag->pag_ici_lock);
896 continue;
897 }
898 856
899 /* 857 /* skip quota inodes */
900 * If we can't get a reference on the inode, it must be 858 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) {
901 * in reclaim. Leave it for the reclaim code to flush. 859 ASSERT(ip->i_udquot == NULL);
902 */ 860 ASSERT(ip->i_gdquot == NULL);
903 if (!igrab(VFS_I(ip))) {
904 read_unlock(&pag->pag_ici_lock);
905 continue;
906 }
907 read_unlock(&pag->pag_ici_lock); 861 read_unlock(&pag->pag_ici_lock);
862 return 0;
863 }
908 864
909 /* avoid new inodes though we shouldn't find any here */ 865 error = xfs_sync_inode_valid(ip, pag);
910 if (xfs_iflags_test(ip, XFS_INEW)) { 866 if (error)
911 IRELE(ip); 867 return error;
912 continue;
913 }
914 868
915 xfs_ilock(ip, XFS_ILOCK_EXCL); 869 xfs_ilock(ip, XFS_ILOCK_EXCL);
916 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 870 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
917 xfs_qm_dqrele(ip->i_udquot); 871 xfs_qm_dqrele(ip->i_udquot);
918 ip->i_udquot = NULL; 872 ip->i_udquot = NULL;
919 } 873 }
920 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && 874 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
921 ip->i_gdquot) { 875 xfs_qm_dqrele(ip->i_gdquot);
922 xfs_qm_dqrele(ip->i_gdquot); 876 ip->i_gdquot = NULL;
923 ip->i_gdquot = NULL; 877 }
924 } 878 xfs_iput(ip, XFS_ILOCK_EXCL);
925 xfs_iput(ip, XFS_ILOCK_EXCL); 879 IRELE(ip);
926 880
927 } while (nr_found); 881 return 0;
928} 882}
929 883
884
930/* 885/*
931 * Go thru all the inodes in the file system, releasing their dquots. 886 * Go thru all the inodes in the file system, releasing their dquots.
887 *
932 * Note that the mount structure gets modified to indicate that quotas are off 888 * Note that the mount structure gets modified to indicate that quotas are off
933 * AFTER this, in the case of quotaoff. This also gets called from 889 * AFTER this, in the case of quotaoff.
934 * xfs_rootumount.
935 */ 890 */
936void 891void
937xfs_qm_dqrele_all_inodes( 892xfs_qm_dqrele_all_inodes(
938 struct xfs_mount *mp, 893 struct xfs_mount *mp,
939 uint flags) 894 uint flags)
940{ 895{
941 int i;
942
943 ASSERT(mp->m_quotainfo); 896 ASSERT(mp->m_quotainfo);
944 for (i = 0; i < mp->m_sb.sb_agcount; i++) { 897 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
945 if (!mp->m_perag[i].pag_ici_init)
946 continue;
947 xfs_qm_dqrele_inodes_ag(mp, i, flags);
948 }
949} 898}
950 899
951/*------------------------------------------------------------------------*/ 900/*------------------------------------------------------------------------*/
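
The open-coded per-AG radix-tree walk in xfs_qm_dqrele_inodes_ag() collapses into a per-inode callback, xfs_dqrele_inode(), handed to the new generic xfs_inode_ag_iterator() declared in xfs_sync.h earlier in this patch. A stripped-down sketch of the callback-iterator shape, with a plain array standing in for the per-AG inode trees:

	#include <stdio.h>

	struct inode { int ino; int udquot; };

	/* Generic walk: apply 'execute' to every inode, stop on first error. */
	static int inode_iterator(struct inode *inodes, int n,
				  int (*execute)(struct inode *ip, int flags),
				  int flags)
	{
		for (int i = 0; i < n; i++) {
			int error = execute(&inodes[i], flags);
			if (error)
				return error;
		}
		return 0;
	}

	/* Per-inode work, analogous to xfs_dqrele_inode(). */
	static int dqrele_inode(struct inode *ip, int flags)
	{
		(void)flags;
		if (ip->udquot) {
			printf("releasing dquot of inode %d\n", ip->ino);
			ip->udquot = 0;
		}
		return 0;
	}

	int main(void)
	{
		struct inode inodes[] = { { 1, 1 }, { 2, 0 }, { 3, 1 } };
		return inode_iterator(inodes, 3, dqrele_inode, 0);
	}
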
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 447173bcf96d..97ac9640be98 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -42,7 +42,6 @@
42#include "xfs_rtalloc.h" 42#include "xfs_rtalloc.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_priv.h" 47#include "xfs_trans_priv.h"
@@ -111,7 +110,7 @@ xfs_trans_log_dquot(
111 * Carry forward whatever is left of the quota blk reservation to 110 * Carry forward whatever is left of the quota blk reservation to
112 * the spanky new transaction 111 * the spanky new transaction
113 */ 112 */
114STATIC void 113void
115xfs_trans_dup_dqinfo( 114xfs_trans_dup_dqinfo(
116 xfs_trans_t *otp, 115 xfs_trans_t *otp,
117 xfs_trans_t *ntp) 116 xfs_trans_t *ntp)
@@ -167,19 +166,17 @@ xfs_trans_dup_dqinfo(
167/* 166/*
168 * Wrap around mod_dquot to account for both user and group quotas. 167 * Wrap around mod_dquot to account for both user and group quotas.
169 */ 168 */
170STATIC void 169void
171xfs_trans_mod_dquot_byino( 170xfs_trans_mod_dquot_byino(
172 xfs_trans_t *tp, 171 xfs_trans_t *tp,
173 xfs_inode_t *ip, 172 xfs_inode_t *ip,
174 uint field, 173 uint field,
175 long delta) 174 long delta)
176{ 175{
177 xfs_mount_t *mp; 176 xfs_mount_t *mp = tp->t_mountp;
178
179 ASSERT(tp);
180 mp = tp->t_mountp;
181 177
182 if (!XFS_IS_QUOTA_ON(mp) || 178 if (!XFS_IS_QUOTA_RUNNING(mp) ||
179 !XFS_IS_QUOTA_ON(mp) ||
183 ip->i_ino == mp->m_sb.sb_uquotino || 180 ip->i_ino == mp->m_sb.sb_uquotino ||
184 ip->i_ino == mp->m_sb.sb_gquotino) 181 ip->i_ino == mp->m_sb.sb_gquotino)
185 return; 182 return;
@@ -229,6 +226,7 @@ xfs_trans_mod_dquot(
229 xfs_dqtrx_t *qtrx; 226 xfs_dqtrx_t *qtrx;
230 227
231 ASSERT(tp); 228 ASSERT(tp);
229 ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
232 qtrx = NULL; 230 qtrx = NULL;
233 231
234 if (tp->t_dqinfo == NULL) 232 if (tp->t_dqinfo == NULL)
@@ -346,7 +344,7 @@ xfs_trans_dqlockedjoin(
346 * Unreserve just the reservations done by this transaction. 344 * Unreserve just the reservations done by this transaction.
347 * dquot is still left locked at exit. 345 * dquot is still left locked at exit.
348 */ 346 */
349STATIC void 347void
350xfs_trans_apply_dquot_deltas( 348xfs_trans_apply_dquot_deltas(
351 xfs_trans_t *tp) 349 xfs_trans_t *tp)
352{ 350{
@@ -357,7 +355,7 @@ xfs_trans_apply_dquot_deltas(
357 long totalbdelta; 355 long totalbdelta;
358 long totalrtbdelta; 356 long totalrtbdelta;
359 357
360 if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY)) 358 if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
361 return; 359 return;
362 360
363 ASSERT(tp->t_dqinfo); 361 ASSERT(tp->t_dqinfo);
@@ -531,7 +529,7 @@ xfs_trans_apply_dquot_deltas(
531 * we simply throw those away, since that's the expected behavior 529 * we simply throw those away, since that's the expected behavior
532 * when a transaction is curtailed without a commit. 530 * when a transaction is curtailed without a commit.
533 */ 531 */
534STATIC void 532void
535xfs_trans_unreserve_and_mod_dquots( 533xfs_trans_unreserve_and_mod_dquots(
536 xfs_trans_t *tp) 534 xfs_trans_t *tp)
537{ 535{
@@ -768,7 +766,7 @@ xfs_trans_reserve_quota_bydquots(
768{ 766{
769 int resvd = 0, error; 767 int resvd = 0, error;
770 768
771 if (!XFS_IS_QUOTA_ON(mp)) 769 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
772 return 0; 770 return 0;
773 771
774 if (tp && tp->t_dqinfo == NULL) 772 if (tp && tp->t_dqinfo == NULL)
@@ -811,18 +809,17 @@ xfs_trans_reserve_quota_bydquots(
811 * This doesn't change the actual usage, just the reservation. 809 * This doesn't change the actual usage, just the reservation.
812 * The inode sent in is locked. 810 * The inode sent in is locked.
813 */ 811 */
814STATIC int 812int
815xfs_trans_reserve_quota_nblks( 813xfs_trans_reserve_quota_nblks(
816 xfs_trans_t *tp, 814 struct xfs_trans *tp,
817 xfs_mount_t *mp, 815 struct xfs_inode *ip,
818 xfs_inode_t *ip, 816 long nblks,
819 long nblks, 817 long ninos,
820 long ninos, 818 uint flags)
821 uint flags)
822{ 819{
823 int error; 820 struct xfs_mount *mp = ip->i_mount;
824 821
825 if (!XFS_IS_QUOTA_ON(mp)) 822 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
826 return 0; 823 return 0;
827 if (XFS_IS_PQUOTA_ON(mp)) 824 if (XFS_IS_PQUOTA_ON(mp))
828 flags |= XFS_QMOPT_ENOSPC; 825 flags |= XFS_QMOPT_ENOSPC;
@@ -831,7 +828,6 @@ xfs_trans_reserve_quota_nblks(
831 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); 828 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
832 829
833 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 830 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
834 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
835 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == 831 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
836 XFS_TRANS_DQ_RES_RTBLKS || 832 XFS_TRANS_DQ_RES_RTBLKS ||
837 (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == 833 (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -840,11 +836,9 @@ xfs_trans_reserve_quota_nblks(
840 /* 836 /*
841 * Reserve nblks against these dquots, with trans as the mediator. 837 * Reserve nblks against these dquots, with trans as the mediator.
842 */ 838 */
843 error = xfs_trans_reserve_quota_bydquots(tp, mp, 839 return xfs_trans_reserve_quota_bydquots(tp, mp,
844 ip->i_udquot, ip->i_gdquot, 840 ip->i_udquot, ip->i_gdquot,
845 nblks, ninos, 841 nblks, ninos, flags);
846 flags);
847 return error;
848} 842}
849 843
850/* 844/*
@@ -895,25 +889,15 @@ STATIC void
895xfs_trans_alloc_dqinfo( 889xfs_trans_alloc_dqinfo(
896 xfs_trans_t *tp) 890 xfs_trans_t *tp)
897{ 891{
898 (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); 892 tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
899} 893}
900 894
901STATIC void 895void
902xfs_trans_free_dqinfo( 896xfs_trans_free_dqinfo(
903 xfs_trans_t *tp) 897 xfs_trans_t *tp)
904{ 898{
905 if (!tp->t_dqinfo) 899 if (!tp->t_dqinfo)
906 return; 900 return;
907 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo); 901 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
908 (tp)->t_dqinfo = NULL; 902 tp->t_dqinfo = NULL;
909} 903}
910
911xfs_dqtrxops_t xfs_trans_dquot_ops = {
912 .qo_dup_dqinfo = xfs_trans_dup_dqinfo,
913 .qo_free_dqinfo = xfs_trans_free_dqinfo,
914 .qo_mod_dquot_byino = xfs_trans_mod_dquot_byino,
915 .qo_apply_dquot_deltas = xfs_trans_apply_dquot_deltas,
916 .qo_reserve_quota_nblks = xfs_trans_reserve_quota_nblks,
917 .qo_reserve_quota_bydquots = xfs_trans_reserve_quota_bydquots,
918 .qo_unreserve_and_mod_dquots = xfs_trans_unreserve_and_mod_dquots,
919};
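
The guard pattern repeated across these quota hunks is two-staged: a cheap XFS_IS_QUOTA_RUNNING() mount-flag test short-circuits before the finer-grained XFS_IS_QUOTA_ON() accounting checks, so paths on quota-less mounts pay almost nothing. A small standalone sketch of the idea, flag names abbreviated and invented:

	#include <stdio.h>

	#define QUOTA_RUNNING 0x1	/* subsystem initialised on this mount */
	#define QUOTA_UQ_ON   0x2	/* user-quota accounting enabled */

	struct mount { unsigned qflags; };

	static int reserve_quota(struct mount *mp, long nblks)
	{
		/* Cheapest test first; short-circuits everything when
		 * quotas were never set up on this mount. */
		if (!(mp->qflags & QUOTA_RUNNING) || !(mp->qflags & QUOTA_UQ_ON))
			return 0;

		printf("reserving %ld blocks against the user dquot\n", nblks);
		return 0;
	}

	int main(void)
	{
		struct mount mp = { QUOTA_RUNNING | QUOTA_UQ_ON };
		return reserve_quota(&mp, 8);
	}
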
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
deleted file mode 100644
index a8cdd73999a4..000000000000
--- a/fs/xfs/xfs_acl.c
+++ /dev/null
@@ -1,874 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_inum.h"
23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_bmap_btree.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_ialloc_btree.h"
28#include "xfs_dir2_sf.h"
29#include "xfs_attr_sf.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_btree.h"
33#include "xfs_acl.h"
34#include "xfs_attr.h"
35#include "xfs_vnodeops.h"
36
37#include <linux/capability.h>
38#include <linux/posix_acl_xattr.h>
39
40STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
41STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
42STATIC void xfs_acl_get_endian(xfs_acl_t *);
43STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
44STATIC int xfs_acl_invalid(xfs_acl_t *);
45STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
46STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
47STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
48STATIC int xfs_acl_allow_set(struct inode *, int);
49
50kmem_zone_t *xfs_acl_zone;
51
52
53/*
54 * Test for existence of access ACL attribute as efficiently as possible.
55 */
56int
57xfs_acl_vhasacl_access(
58 struct inode *vp)
59{
60 int error;
61
62 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
63 return (error == 0);
64}
65
66/*
67 * Test for existence of default ACL attribute as efficiently as possible.
68 */
69int
70xfs_acl_vhasacl_default(
71 struct inode *vp)
72{
73 int error;
74
75 if (!S_ISDIR(vp->i_mode))
76 return 0;
77 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
78 return (error == 0);
79}
80
81/*
82 * Convert from extended attribute representation to in-memory for XFS.
83 */
84STATIC int
85posix_acl_xattr_to_xfs(
86 posix_acl_xattr_header *src,
87 size_t size,
88 xfs_acl_t *dest)
89{
90 posix_acl_xattr_entry *src_entry;
91 xfs_acl_entry_t *dest_entry;
92 int n;
93
94 if (!src || !dest)
95 return EINVAL;
96
97 if (size < sizeof(posix_acl_xattr_header))
98 return EINVAL;
99
100 if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
101 return EOPNOTSUPP;
102
103 memset(dest, 0, sizeof(xfs_acl_t));
104 dest->acl_cnt = posix_acl_xattr_count(size);
105 if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
106 return EINVAL;
107
108 /*
109 * acl_set_file(3) may request that we set default ACLs with
110 * zero length -- defend (gracefully) against that here.
111 */
112 if (!dest->acl_cnt)
113 return 0;
114
115 src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
116 dest_entry = &dest->acl_entry[0];
117
118 for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
119 dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
120 if (_ACL_PERM_INVALID(dest_entry->ae_perm))
121 return EINVAL;
122 dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag);
123 switch(dest_entry->ae_tag) {
124 case ACL_USER:
125 case ACL_GROUP:
126 dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
127 break;
128 case ACL_USER_OBJ:
129 case ACL_GROUP_OBJ:
130 case ACL_MASK:
131 case ACL_OTHER:
132 dest_entry->ae_id = ACL_UNDEFINED_ID;
133 break;
134 default:
135 return EINVAL;
136 }
137 }
138 if (xfs_acl_invalid(dest))
139 return EINVAL;
140
141 return 0;
142}
143
144/*
145 * Comparison function called from xfs_sort().
146 * Primary key is ae_tag, secondary key is ae_id.
147 */
148STATIC int
149xfs_acl_entry_compare(
150 const void *va,
151 const void *vb)
152{
153 xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
154 *b = (xfs_acl_entry_t *)vb;
155
156 if (a->ae_tag == b->ae_tag)
157 return (a->ae_id - b->ae_id);
158 return (a->ae_tag - b->ae_tag);
159}
160
161/*
162 * Convert from in-memory XFS to extended attribute representation.
163 */
164STATIC int
165posix_acl_xfs_to_xattr(
166 xfs_acl_t *src,
167 posix_acl_xattr_header *dest,
168 size_t size)
169{
170 int n;
171 size_t new_size = posix_acl_xattr_size(src->acl_cnt);
172 posix_acl_xattr_entry *dest_entry;
173 xfs_acl_entry_t *src_entry;
174
175 if (size < new_size)
176 return -ERANGE;
177
178 /* Need to sort src XFS ACL by <ae_tag,ae_id> */
179 xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
180 xfs_acl_entry_compare);
181
182 dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
183 dest_entry = &dest->a_entries[0];
184 src_entry = &src->acl_entry[0];
185 for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
186 dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
187 if (_ACL_PERM_INVALID(src_entry->ae_perm))
188 return -EINVAL;
189 dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag);
190 switch (src_entry->ae_tag) {
191 case ACL_USER:
192 case ACL_GROUP:
193 dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
194 break;
195 case ACL_USER_OBJ:
196 case ACL_GROUP_OBJ:
197 case ACL_MASK:
198 case ACL_OTHER:
199 dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
200 break;
201 default:
202 return -EINVAL;
203 }
204 }
205 return new_size;
206}
207
208int
209xfs_acl_vget(
210 struct inode *vp,
211 void *acl,
212 size_t size,
213 int kind)
214{
215 int error;
216 xfs_acl_t *xfs_acl = NULL;
217 posix_acl_xattr_header *ext_acl = acl;
218 int flags = 0;
219
220 if(size) {
221 if (!(_ACL_ALLOC(xfs_acl))) {
222 error = ENOMEM;
223 goto out;
224 }
225 memset(xfs_acl, 0, sizeof(xfs_acl_t));
226 } else
227 flags = ATTR_KERNOVAL;
228
229 xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
230 if (error)
231 goto out;
232
233 if (!size) {
234 error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
235 } else {
236 if (xfs_acl_invalid(xfs_acl)) {
237 error = EINVAL;
238 goto out;
239 }
240 if (kind == _ACL_TYPE_ACCESS)
241 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
242 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
243 }
244out:
245 if(xfs_acl)
246 _ACL_FREE(xfs_acl);
247 return -error;
248}
249
250int
251xfs_acl_vremove(
252 struct inode *vp,
253 int kind)
254{
255 int error;
256
257 error = xfs_acl_allow_set(vp, kind);
258 if (!error) {
259 error = xfs_attr_remove(XFS_I(vp),
260 kind == _ACL_TYPE_DEFAULT?
261 SGI_ACL_DEFAULT: SGI_ACL_FILE,
262 ATTR_ROOT);
263 if (error == ENOATTR)
264 error = 0; /* 'scool */
265 }
266 return -error;
267}
268
269int
270xfs_acl_vset(
271 struct inode *vp,
272 void *acl,
273 size_t size,
274 int kind)
275{
276 posix_acl_xattr_header *ext_acl = acl;
277 xfs_acl_t *xfs_acl;
278 int error;
279 int basicperms = 0; /* more than std unix perms? */
280
281 if (!acl)
282 return -EINVAL;
283
284 if (!(_ACL_ALLOC(xfs_acl)))
285 return -ENOMEM;
286
287 error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
288 if (error) {
289 _ACL_FREE(xfs_acl);
290 return -error;
291 }
292 if (!xfs_acl->acl_cnt) {
293 _ACL_FREE(xfs_acl);
294 return 0;
295 }
296
297 error = xfs_acl_allow_set(vp, kind);
298
299 /* Incoming ACL exists, set file mode based on its value */
300 if (!error && kind == _ACL_TYPE_ACCESS)
301 error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
302
303 if (error)
304 goto out;
305
306 /*
307 * If we have more than std unix permissions, set up the actual attr.
308 * Otherwise, delete any existing attr. This prevents us from
309 * having actual attrs for permissions that can be stored in the
310 * standard permission bits.
311 */
312 if (!basicperms) {
313 xfs_acl_set_attr(vp, xfs_acl, kind, &error);
314 } else {
315 error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
316 }
317
318out:
319 _ACL_FREE(xfs_acl);
320 return -error;
321}
322
323int
324xfs_acl_iaccess(
325 xfs_inode_t *ip,
326 mode_t mode,
327 cred_t *cr)
328{
329 xfs_acl_t *acl;
330 int rval;
331 struct xfs_name acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE};
332
333 if (!(_ACL_ALLOC(acl)))
334 return -1;
335
336 /* If the file has no ACL return -1. */
337 rval = sizeof(xfs_acl_t);
338 if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) {
339 _ACL_FREE(acl);
340 return -1;
341 }
342 xfs_acl_get_endian(acl);
343
344 /* If the file has an empty ACL return -1. */
345 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
346 _ACL_FREE(acl);
347 return -1;
348 }
349
350 /* Synchronize ACL with mode bits */
351 xfs_acl_sync_mode(ip->i_d.di_mode, acl);
352
353 rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
354 _ACL_FREE(acl);
355 return rval;
356}
357
358STATIC int
359xfs_acl_allow_set(
360 struct inode *vp,
361 int kind)
362{
363 if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
364 return EPERM;
365 if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode))
366 return ENOTDIR;
367 if (vp->i_sb->s_flags & MS_RDONLY)
368 return EROFS;
369 if (XFS_I(vp)->i_d.di_uid != current_fsuid() && !capable(CAP_FOWNER))
370 return EPERM;
371 return 0;
372}
373
374/*
375 * Note: cr is only used here for the capability check if the ACL test fails.
 376 * It is not used to find out the credential's uid or groups etc., as was
377 * done in IRIX. It is assumed that the uid and groups for the current
378 * thread are taken from "current" instead of the cr parameter.
379 */
380STATIC int
381xfs_acl_access(
382 uid_t fuid,
383 gid_t fgid,
384 xfs_acl_t *fap,
385 mode_t md,
386 cred_t *cr)
387{
388 xfs_acl_entry_t matched;
389 int i, allows;
390 int maskallows = -1; /* true, but not 1, either */
391 int seen_userobj = 0;
392
393 matched.ae_tag = 0; /* Invalid type */
394 matched.ae_perm = 0;
395
396 for (i = 0; i < fap->acl_cnt; i++) {
397 /*
398 * Break out if we've got a user_obj entry or
399 * a user entry and the mask (and have processed USER_OBJ)
400 */
401 if (matched.ae_tag == ACL_USER_OBJ)
402 break;
403 if (matched.ae_tag == ACL_USER) {
404 if (maskallows != -1 && seen_userobj)
405 break;
406 if (fap->acl_entry[i].ae_tag != ACL_MASK &&
407 fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
408 continue;
409 }
410 /* True if this entry allows the requested access */
411 allows = ((fap->acl_entry[i].ae_perm & md) == md);
412
413 switch (fap->acl_entry[i].ae_tag) {
414 case ACL_USER_OBJ:
415 seen_userobj = 1;
416 if (fuid != current_fsuid())
417 continue;
418 matched.ae_tag = ACL_USER_OBJ;
419 matched.ae_perm = allows;
420 break;
421 case ACL_USER:
422 if (fap->acl_entry[i].ae_id != current_fsuid())
423 continue;
424 matched.ae_tag = ACL_USER;
425 matched.ae_perm = allows;
426 break;
427 case ACL_GROUP_OBJ:
428 if ((matched.ae_tag == ACL_GROUP_OBJ ||
429 matched.ae_tag == ACL_GROUP) && !allows)
430 continue;
431 if (!in_group_p(fgid))
432 continue;
433 matched.ae_tag = ACL_GROUP_OBJ;
434 matched.ae_perm = allows;
435 break;
436 case ACL_GROUP:
437 if ((matched.ae_tag == ACL_GROUP_OBJ ||
438 matched.ae_tag == ACL_GROUP) && !allows)
439 continue;
440 if (!in_group_p(fap->acl_entry[i].ae_id))
441 continue;
442 matched.ae_tag = ACL_GROUP;
443 matched.ae_perm = allows;
444 break;
445 case ACL_MASK:
446 maskallows = allows;
447 break;
448 case ACL_OTHER:
449 if (matched.ae_tag != 0)
450 continue;
451 matched.ae_tag = ACL_OTHER;
452 matched.ae_perm = allows;
453 break;
454 }
455 }
456 /*
457 * First possibility is that no matched entry allows access.
458 * The capability to override DAC may exist, so check for it.
459 */
460 switch (matched.ae_tag) {
461 case ACL_OTHER:
462 case ACL_USER_OBJ:
463 if (matched.ae_perm)
464 return 0;
465 break;
466 case ACL_USER:
467 case ACL_GROUP_OBJ:
468 case ACL_GROUP:
469 if (maskallows && matched.ae_perm)
470 return 0;
471 break;
472 case 0:
473 break;
474 }
475
476 /* EACCES tells generic_permission to check for capability overrides */
477 return EACCES;
478}
479
480/*
481 * ACL validity checker.
482 * This acl validation routine checks each ACL entry read in makes sense.
483 */
484STATIC int
485xfs_acl_invalid(
486 xfs_acl_t *aclp)
487{
488 xfs_acl_entry_t *entry, *e;
489 int user = 0, group = 0, other = 0, mask = 0;
490 int mask_required = 0;
491 int i, j;
492
493 if (!aclp)
494 goto acl_invalid;
495
496 if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
497 goto acl_invalid;
498
499 for (i = 0; i < aclp->acl_cnt; i++) {
500 entry = &aclp->acl_entry[i];
501 switch (entry->ae_tag) {
502 case ACL_USER_OBJ:
503 if (user++)
504 goto acl_invalid;
505 break;
506 case ACL_GROUP_OBJ:
507 if (group++)
508 goto acl_invalid;
509 break;
510 case ACL_OTHER:
511 if (other++)
512 goto acl_invalid;
513 break;
514 case ACL_USER:
515 case ACL_GROUP:
516 for (j = i + 1; j < aclp->acl_cnt; j++) {
517 e = &aclp->acl_entry[j];
518 if (e->ae_id == entry->ae_id &&
519 e->ae_tag == entry->ae_tag)
520 goto acl_invalid;
521 }
522 mask_required++;
523 break;
524 case ACL_MASK:
525 if (mask++)
526 goto acl_invalid;
527 break;
528 default:
529 goto acl_invalid;
530 }
531 }
532 if (!user || !group || !other || (mask_required && !mask))
533 goto acl_invalid;
534 else
535 return 0;
536acl_invalid:
537 return EINVAL;
538}
539
540/*
541 * Do ACL endian conversion.
542 */
543STATIC void
544xfs_acl_get_endian(
545 xfs_acl_t *aclp)
546{
547 xfs_acl_entry_t *ace, *end;
548
549 INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
550 end = &aclp->acl_entry[0]+aclp->acl_cnt;
551 for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
552 INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
553 INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
554 INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
555 }
556}
557
558/*
559 * Get the ACL from the EA and do endian conversion.
560 */
561STATIC void
562xfs_acl_get_attr(
563 struct inode *vp,
564 xfs_acl_t *aclp,
565 int kind,
566 int flags,
567 int *error)
568{
569 int len = sizeof(xfs_acl_t);
570
571 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
572 flags |= ATTR_ROOT;
573 *error = xfs_attr_get(XFS_I(vp),
574 kind == _ACL_TYPE_ACCESS ?
575 SGI_ACL_FILE : SGI_ACL_DEFAULT,
576 (char *)aclp, &len, flags);
577 if (*error || (flags & ATTR_KERNOVAL))
578 return;
579 xfs_acl_get_endian(aclp);
580}
581
582/*
583 * Set the EA with the ACL and do endian conversion.
584 */
585STATIC void
586xfs_acl_set_attr(
587 struct inode *vp,
588 xfs_acl_t *aclp,
589 int kind,
590 int *error)
591{
592 xfs_acl_entry_t *ace, *newace, *end;
593 xfs_acl_t *newacl;
594 int len;
595
596 if (!(_ACL_ALLOC(newacl))) {
597 *error = ENOMEM;
598 return;
599 }
600
601 len = sizeof(xfs_acl_t) -
602 (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
603 end = &aclp->acl_entry[0]+aclp->acl_cnt;
604 for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
605 ace < end;
606 ace++, newace++) {
607 INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
608 INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
609 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
610 }
611 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
612 *error = xfs_attr_set(XFS_I(vp),
613 kind == _ACL_TYPE_ACCESS ?
614 SGI_ACL_FILE: SGI_ACL_DEFAULT,
615 (char *)newacl, len, ATTR_ROOT);
616 _ACL_FREE(newacl);
617}
618
619int
620xfs_acl_vtoacl(
621 struct inode *vp,
622 xfs_acl_t *access_acl,
623 xfs_acl_t *default_acl)
624{
625 int error = 0;
626
627 if (access_acl) {
628 /*
629 * Get the Access ACL and the mode. If either cannot
630 * be obtained for some reason, invalidate the access ACL.
631 */
632 xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
633 if (error)
634 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
635 else /* We have a good ACL and the file mode, synchronize. */
636 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
637 }
638
639 if (default_acl) {
640 xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
641 if (error)
642 default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
643 }
644 return error;
645}
646
647/*
648 * This function retrieves the parent directory's acl, processes it
649 * and lets the child inherit the acl(s) that it should.
650 */
651int
652xfs_acl_inherit(
653 struct inode *vp,
654 mode_t mode,
655 xfs_acl_t *pdaclp)
656{
657 xfs_acl_t *cacl;
658 int error = 0;
659 int basicperms = 0;
660
661 /*
662 * If the parent does not have a default ACL, or it's an
663 * invalid ACL, we're done.
664 */
665 if (!vp)
666 return 0;
667 if (!pdaclp || xfs_acl_invalid(pdaclp))
668 return 0;
669
670 /*
671 * Copy the default ACL of the containing directory to
672 * the access ACL of the new file and use the mode that
673 * was passed in to set up the correct initial values for
674 * the u::,g::[m::], and o:: entries. This is what makes
 675 * umask() "work" with ACLs.
676 */
677
678 if (!(_ACL_ALLOC(cacl)))
679 return ENOMEM;
680
681 memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
682 xfs_acl_filter_mode(mode, cacl);
683 error = xfs_acl_setmode(vp, cacl, &basicperms);
684 if (error)
685 goto out_error;
686
687 /*
688 * Set the Default and Access ACL on the file. The mode is already
689 * set on the file, so we don't need to worry about that.
690 *
691 * If the new file is a directory, its default ACL is a copy of
692 * the containing directory's default ACL.
693 */
694 if (S_ISDIR(vp->i_mode))
695 xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
696 if (!error && !basicperms)
697 xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
698out_error:
699 _ACL_FREE(cacl);
700 return error;
701}
702
703/*
704 * Set up the correct mode on the file based on the supplied ACL. This
705 * makes sure that the mode on the file reflects the state of the
706 * u::,g::[m::], and o:: entries in the ACL. Since the mode is where
707 * the ACL is going to get the permissions for these entries, we must
708 * synchronize the mode whenever we set the ACL on a file.
709 */
710STATIC int
711xfs_acl_setmode(
712 struct inode *vp,
713 xfs_acl_t *acl,
714 int *basicperms)
715{
716 struct iattr iattr;
717 xfs_acl_entry_t *ap;
718 xfs_acl_entry_t *gap = NULL;
719 int i, nomask = 1;
720
721 *basicperms = 1;
722
723 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
724 return 0;
725
726 /*
727 * Copy the u::, g::, o::, and m:: bits from the ACL into the
728 * mode. The m:: bits take precedence over the g:: bits.
729 */
730 iattr.ia_valid = ATTR_MODE;
731 iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
732 iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
733 ap = acl->acl_entry;
734 for (i = 0; i < acl->acl_cnt; ++i) {
735 switch (ap->ae_tag) {
736 case ACL_USER_OBJ:
737 iattr.ia_mode |= ap->ae_perm << 6;
738 break;
739 case ACL_GROUP_OBJ:
740 gap = ap;
741 break;
742 case ACL_MASK: /* more than just standard modes */
743 nomask = 0;
744 iattr.ia_mode |= ap->ae_perm << 3;
745 *basicperms = 0;
746 break;
747 case ACL_OTHER:
748 iattr.ia_mode |= ap->ae_perm;
749 break;
750 default: /* more than just standard modes */
751 *basicperms = 0;
752 break;
753 }
754 ap++;
755 }
756
757 /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
758 if (gap && nomask)
759 iattr.ia_mode |= gap->ae_perm << 3;
760
761 return xfs_setattr(XFS_I(vp), &iattr, 0);
762}
763
764/*
765 * The permissions for the special ACL entries (u::, g::[m::], o::) are
766 * actually stored in the file mode (if there is both a group and a mask,
767 * the group is stored in the ACL entry and the mask is stored on the file).
768 * This allows the mode to remain automatically in sync with the ACL without
769 * the need for a call-back to the ACL system at every point where the mode
770 * could change. This function takes the permissions from the specified mode
771 * and places it in the supplied ACL.
772 *
773 * This implementation draws its validity from the fact that, when the ACL
774 * was assigned, the mode was copied from the ACL.
775 * If the mode did not change, therefore, the mode remains exactly what was
776 * taken from the special ACL entries at assignment.
777 * If a subsequent chmod() was done, the POSIX spec says that the change in
778 * mode must cause an update to the ACL seen at user level and used for
779 * access checks. Before and after a mode change, therefore, the file mode
780 * most accurately reflects what the special ACL entries should permit/deny.
781 *
782 * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
783 * the existing mode bits will override whatever is in the
784 * ACL. Similarly, if there is a pre-existing ACL that was
785 * never in sync with its mode (owing to a bug in 6.5 and
786 * before), it will now magically (or mystically) be
787 * synchronized. This could cause slight astonishment, but
788 * it is better than inconsistent permissions.
789 *
790 * The supplied ACL is a template that may contain any combination
791 * of special entries. These are treated as place holders when we fill
792 * out the ACL. This routine does not add or remove special entries, it
793 * simply unites each special entry with its associated set of permissions.
794 */
-STATIC void
-xfs_acl_sync_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries.  POSIX1003.1eD16 requires that the MASK
-	 * be set instead of the GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm = (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm = (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm = mode & 0x7;
-			break;
-		default:
-			break;
-		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm = (mode >> 3) & 0x7;
-}
-
-/*
- * When inheriting an Access ACL from a directory Default ACL,
- * the ACL bits are set to the intersection of the ACL default
- * permission bits and the file permission bits in mode.  If there
- * are no permission bits on the file then we must not grant them
- * in the ACL.  This is what makes umask() work with ACLs.
- */
-STATIC void
-xfs_acl_filter_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries.  POSIX1003.1eD16 requires that the MASK
-	 * be merged with the GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm &= (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm &= (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm &= mode & 0x7;
-			break;
-		default:
-			break;
-		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm &= (mode >> 3) & 0x7;
-}
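A hedged user-space illustration of the intersection rule (again not part of the patch; values are made up): with a default ACL entry of d:g::rwx and a file created under umask 027, the inherited group permission is ANDed down to what the creation mode allows.

#include <stdio.h>

int main(void)
{
	unsigned int default_group = 07;	/* d:g::rwx */
	unsigned int mode = 0666 & ~027;	/* open(..., 0666) under umask 027 -> 0640 */
	unsigned int inherited = default_group & ((mode >> 3) & 07);

	printf("inherited group perm = %o\n", inherited);	/* 4 -> r-- */
	return 0;
}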
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 642f1db4def4..947b150df8ed 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -18,81 +18,44 @@
 #ifndef __XFS_ACL_H__
 #define __XFS_ACL_H__
 
-/*
- * Access Control Lists
- */
-typedef __uint16_t	xfs_acl_perm_t;
-typedef __int32_t	xfs_acl_tag_t;
-typedef __int32_t	xfs_acl_id_t;
+struct inode;
+struct posix_acl;
+struct xfs_inode;
 
 #define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
 
-typedef struct xfs_acl_entry {
-	xfs_acl_tag_t	ae_tag;
-	xfs_acl_id_t	ae_id;
-	xfs_acl_perm_t	ae_perm;
-} xfs_acl_entry_t;
-
-typedef struct xfs_acl {
-	__int32_t	acl_cnt;
-	xfs_acl_entry_t	acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_t;
+/* On-disk XFS access control list structure */
+struct xfs_acl {
+	__be32		acl_cnt;
+	struct xfs_acl_entry {
+		__be32	ae_tag;
+		__be32	ae_id;
+		__be16	ae_perm;
+	} acl_entry[XFS_ACL_MAX_ENTRIES];
+};
 
 /* On-disk XFS extended attribute names */
 #define SGI_ACL_FILE		"SGI_ACL_FILE"
 #define SGI_ACL_DEFAULT		"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
-#define _ACL_TYPE_ACCESS	1
-#define _ACL_TYPE_DEFAULT	2
-
 #ifdef CONFIG_XFS_POSIX_ACL
-
-struct vattr;
-struct xfs_inode;
-
-extern struct kmem_zone *xfs_acl_zone;
-#define xfs_acl_zone_init(zone, name)	\
-	(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
-#define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
-
-extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
-extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct inode *);
-extern int xfs_acl_vhasacl_default(struct inode *);
-extern int xfs_acl_vset(struct inode *, void *, size_t, int);
-extern int xfs_acl_vget(struct inode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct inode *, int);
-
-#define _ACL_PERM_INVALID(perm)	((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
-
-#define _ACL_INHERIT(c,m,d)	(xfs_acl_inherit(c,m,d))
-#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
-#define _ACL_GET_DEFAULT(pv,pd)	(xfs_acl_vtoacl(pv,NULL,pd) == 0)
-#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
-#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
-
-#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
-#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
-
+extern int xfs_check_acl(struct inode *inode, int mask);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
+extern int xfs_acl_chmod(struct inode *inode);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
+
+extern struct xattr_handler xfs_xattr_system_handler;
 #else
-#define xfs_acl_zone_init(zone,name)
-#define xfs_acl_zone_destroy(zone)
-#define xfs_acl_vset(v,p,sz,t)		(-EOPNOTSUPP)
-#define xfs_acl_vget(v,p,sz,t)		(-EOPNOTSUPP)
-#define xfs_acl_vremove(v,t)		(-EOPNOTSUPP)
-#define xfs_acl_vhasacl_access(v)	(0)
-#define xfs_acl_vhasacl_default(v)	(0)
-#define _ACL_ALLOC(a)			(1)	/* successfully allocate nothing */
-#define _ACL_FREE(a)			((void)0)
-#define _ACL_INHERIT(c,m,d)		(0)
-#define _ACL_GET_ACCESS(pv,pa)		(0)
-#define _ACL_GET_DEFAULT(pv,pd)		(0)
-#define _ACL_ACCESS_EXISTS		(NULL)
-#define _ACL_DEFAULT_EXISTS		(NULL)
-#endif
-
+# define xfs_check_acl			NULL
+# define xfs_get_acl(inode, type)	NULL
+# define xfs_inherit_acl(inode, default_acl) 0
+# define xfs_acl_chmod(inode)		0
+# define posix_acl_access_exists(inode)	0
+# define posix_acl_default_exists(inode) 0
+#endif	/* CONFIG_XFS_POSIX_ACL */
 #endif	/* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index c8641f713caa..f24b50b68d03 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -212,6 +212,8 @@ typedef struct xfs_perag
 /*
  * tags for inode radix tree
  */
+#define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
+					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
 
 #define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 53d5e70d1360..0902249354a0 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -73,28 +73,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 
 #endif	/* __KERNEL__ */
 
-/* do we need conversion? */
-#define ARCH_NOCONVERT 1
-#ifdef XFS_NATIVE_HOST
-# define ARCH_CONVERT	ARCH_NOCONVERT
-#else
-# define ARCH_CONVERT	0
-#endif
-
-/* generic swapping macros */
-
-#ifndef HAVE_SWABMACROS
-#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
-#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
-#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
-#endif
-
-#define INT_SWAP(type, var) \
-	((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
-	((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
-	((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
-	(var))))
-
 /*
  * get and set integers from potentially unaligned locations
  */
@@ -107,16 +85,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 	((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
 }
 
-/* does not return a value */
-#define INT_SET(reference,arch,valueref) \
-	(__builtin_constant_p(valueref) ? \
-	(void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
-	(void)( \
-		((reference) = (valueref)), \
-		( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
-	) \
-	)
-
 /*
  * In directories inode numbers are stored as unaligned arrays of unsigned
  * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 5fde1654b430..4ece1906bd41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -45,7 +45,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
-#include "xfs_acl.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 
@@ -249,8 +248,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	/*
 	 * Attach the dquots to the inode.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
-		return (error);
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
 
 	/*
 	 * If the inode doesn't have an attribute fork, add one.
@@ -311,7 +311,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
+	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
 			rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			       XFS_QMOPT_RES_REGBLKS);
 	if (error) {
@@ -501,8 +501,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	/*
 	 * Attach the dquots to the inode.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
-		return (error);
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
 
 	/*
 	 * Start our first transaction of the day.
@@ -2009,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 		blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 		error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-				     blkcnt, XFS_BUF_LOCK, &bp);
+				     blkcnt,
+				     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+				     &bp);
 		if (error)
 			return(error);
 
@@ -2140,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
-				       blkcnt, XFS_BUF_LOCK);
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
+				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ca7c6005a487..8ee5b5a76a2a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2691,7 +2691,7 @@ xfs_bmap_rtalloc(
 		 * Adjust the disk quota also. This was reserved
 		 * earlier.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 	} else {
@@ -2995,7 +2995,7 @@ xfs_bmap_btalloc(
 		 * Adjust the disk quota also. This was reserved
 		 * earlier.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
 					XFS_TRANS_DQ_BCOUNT,
 			(long) args.len);
@@ -3066,7 +3066,7 @@ xfs_bmap_btree_to_extents(
 		return error;
 	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
 	ip->i_d.di_nblocks--;
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, cbp);
 	if (cur->bc_bufs[0] == cbp)
 		cur->bc_bufs[0] = NULL;
@@ -3386,7 +3386,7 @@ xfs_bmap_del_extent(
 	 * Adjust quota data.
 	 */
 	if (qfield)
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
 
 	/*
 	 * Account for change in delayed indirect blocks.
@@ -3523,7 +3523,7 @@ xfs_bmap_extents_to_btree(
 	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
 	cur->bc_private.b.allocated++;
 	ip->i_d.di_nblocks++;
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
 	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
 	/*
 	 * Fill in the child block.
@@ -3690,7 +3690,7 @@ xfs_bmap_local_to_extents(
 		XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork);
 		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
 		ip->i_d.di_nblocks = 1;
-		XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
+		xfs_trans_mod_dquot_byino(tp, ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
 		flags |= xfs_ilog_fext(whichfork);
 	} else {
@@ -4048,7 +4048,7 @@ xfs_bmap_add_attrfork(
 			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
 		goto error0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
 			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			XFS_QMOPT_RES_REGBLKS);
 	if (error) {
@@ -4983,10 +4983,11 @@ xfs_bmapi(
 				 * adjusted later.  We return if we haven't
 				 * allocated blocks already inside this loop.
 				 */
-				if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS(
-						mp, NULL, ip, (long)alen, 0,
-						rt ? XFS_QMOPT_RES_RTBLKS :
-						     XFS_QMOPT_RES_REGBLKS))) {
+				error = xfs_trans_reserve_quota_nblks(
+						NULL, ip, (long)alen, 0,
+						rt ? XFS_QMOPT_RES_RTBLKS :
+						     XFS_QMOPT_RES_REGBLKS);
+				if (error) {
 					if (n == 0) {
 						*nmap = 0;
 						ASSERT(cur == NULL);
@@ -5035,8 +5036,8 @@ xfs_bmapi(
 			if (XFS_IS_QUOTA_ON(mp))
 				/* unreserve the blocks now */
 				(void)
-				XFS_TRANS_UNRESERVE_QUOTA_NBLKS(
-					mp, NULL, ip,
+				xfs_trans_unreserve_quota_nblks(
+					NULL, ip,
 					(long)alen, 0, rt ?
 					XFS_QMOPT_RES_RTBLKS :
 					XFS_QMOPT_RES_REGBLKS);
@@ -5691,14 +5692,14 @@ xfs_bunmapi(
 				do_div(rtexts, mp->m_sb.sb_rextsize);
 				xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
 						(int64_t)rtexts, rsvd);
-				(void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
-					NULL, ip, -((long)del.br_blockcount), 0,
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_RTBLKS);
 			} else {
 				xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
 						(int64_t)del.br_blockcount, rsvd);
-				(void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
-					NULL, ip, -((long)del.br_blockcount), 0,
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_REGBLKS);
 			}
 			ip->i_delayed_blks -= del.br_blockcount;
@@ -6008,7 +6009,7 @@ xfs_getbmap(
 	 */
 	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 	if (!map)
 		goto out_unlock_ilock;
 
@@ -6085,6 +6086,7 @@ xfs_getbmap(
 		break;
 	}
 
+	kmem_free(out);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0760d352586f..5c1ade06578e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -590,7 +590,7 @@ xfs_bmbt_alloc_block(
 	cur->bc_private.b.allocated++;
 	cur->bc_private.b.ip->i_d.di_nblocks++;
 	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-	XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
 
 	new->l = cpu_to_be64(args.fsbno);
@@ -618,7 +618,7 @@ xfs_bmbt_free_block(
 	ip->i_d.di_nblocks--;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, bp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df99574829..26717388acf5 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
 			xfs_buftrace("SBTREE ERROR", bp);
-		XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
-				 cur->bc_mp);
+		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57a5075..2847bbc1c534 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
 }
 
 /*
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
 	int		off;
 
 	if (nbuf == 1)
-		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
 	else
-		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
 	dabuf->dirty = 0;
 #ifdef XFS_DABUF_DEBUG
 	dabuf->ra = ra;
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec6d951..bb1d58eb3982 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
 	    !(args->op_flags & XFS_DA_OP_CILOOKUP))
 		return EEXIST;
 
-	args->value = kmem_alloc(len, KM_MAYFAIL);
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
 	if (!args->value)
 		return ENOMEM;
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6c87c8f304ef..edf8bdf4141f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -542,10 +542,8 @@ xfs_filestream_associate(
 	 * waiting for the lock because someone else is waiting on the lock we
 	 * hold and we cannot drop that as we are in a transaction here.
 	 *
-	 * Lucky for us, this inversion is rarely a problem because it's a
-	 * directory inode that we are trying to lock here and that means the
-	 * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is
-	 * used. i.e. freeze, remount-ro, quotasync or unmount.
+	 * Lucky for us, this inversion is not a problem because it's a
+	 * directory inode that we are trying to lock here.
 	 *
 	 * So, if we can't get the iolock without sleeping then just give up
 	 */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f7c06fac8229..c4ea51b55dce 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -239,10 +239,13 @@ typedef struct xfs_fsop_resblks {
  * Minimum and maximum sizes needed for growth checks
  */
 #define XFS_MIN_AG_BLOCKS	64
-#define XFS_MIN_LOG_BLOCKS	512
-#define XFS_MAX_LOG_BLOCKS	(64 * 1024)
-#define XFS_MIN_LOG_BYTES	(256 * 1024)
-#define XFS_MAX_LOG_BYTES	(128 * 1024 * 1024)
+#define XFS_MIN_LOG_BLOCKS	512ULL
+#define XFS_MAX_LOG_BLOCKS	(1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES	(10 * 1024 * 1024ULL)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+	((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 
 /*
  * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
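For scale, a worked check of the new ceiling (a sketch, not part of the patch): the old limit was 128 MiB, while the new XFS_MAX_LOG_BYTES comes out to 2^31 minus XFS_MIN_LOG_BYTES.

#include <stdio.h>

int main(void)
{
	unsigned long long min_bytes = 10 * 1024 * 1024ULL;	/* XFS_MIN_LOG_BYTES */
	unsigned long long max_bytes = (2 * 1024 * 1024 * 1024ULL) - min_bytes;

	printf("%llu\n", max_bytes);	/* 2136997888, safely below 2147483648 */
	return 0;
}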
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451bb4848..2d0b3e1da9e6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
 	if (nagcount > oagcount) {
+		void *new_perag, *old_perag;
+
 		xfs_filestream_flush(mp);
+
+		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+					KM_MAYFAIL);
+		if (!new_perag)
+			return XFS_ERROR(ENOMEM);
+
 		down_write(&mp->m_peraglock);
-		mp->m_perag = kmem_realloc(mp->m_perag,
-			sizeof(xfs_perag_t) * nagcount,
-			sizeof(xfs_perag_t) * oagcount,
-			KM_SLEEP);
-		memset(&mp->m_perag[oagcount], 0,
-			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
+		old_perag = mp->m_perag;
+		mp->m_perag = new_perag;
+
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
 		nagimax = xfs_initialize_perag(mp, nagcount);
 		up_write(&mp->m_peraglock);
+
+		kmem_free(old_perag);
 	}
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 89b81eedce6a..ecbf8b4d2e2e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
+#include "xfs_acl.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -63,6 +64,10 @@ xfs_inode_alloc(
 	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
 	if (!ip)
 		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
 
 	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -104,17 +109,6 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
-	/*
-	 * Now initialise the VFS inode. We do this after the xfs_inode
-	 * initialisation as internal failures will result in ->destroy_inode
-	 * being called and that will pass down through the reclaim path and
-	 * free the XFS inode. This path requires the XFS inode to already be
-	 * initialised. Hence if this call fails, the xfs_inode has already
-	 * been freed and we should not reference it at all in the error
-	 * handling.
-	 */
-	if (!inode_init_always(mp->m_super, VFS_I(ip)))
-		return NULL;
 
 	/* prevent anyone from using this yet */
 	VFS_I(ip)->i_state = I_NEW|I_LOCK;
@@ -122,6 +116,71 @@ xfs_inode_alloc(
 
 	return ip;
 }
 
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+#ifdef XFS_INODE_TRACE
+	ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+
+	if (ip->i_itemp) {
+		/*
+		 * Only if we are shutting down the fs will we see an
+		 * inode still in the AIL. If it is there, we should remove
+		 * it to prevent a use-after-free from occurring.
+		 */
+		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
+		struct xfs_ail	*ailp = lip->li_ailp;
+
+		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+				XFS_FORCED_SHUTDOWN(ip->i_mount));
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			spin_lock(&ailp->xa_lock);
+			if (lip->li_flags & XFS_LI_IN_AIL)
+				xfs_trans_ail_delete(ailp, lip);
+			else
+				spin_unlock(&ailp->xa_lock);
+		}
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
 /*
  * Check the validity of the inode we just found in the cache
  */
@@ -132,80 +191,82 @@ xfs_iget_cache_hit(
 	int			flags,
 	int			lock_flags) __releases(pag->pag_ici_lock)
 {
+	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = EAGAIN;
+	int			error;
+
+	spin_lock(&ip->i_flags_lock);
 
 	/*
-	 * If INEW is set this inode is being set up
-	 * If IRECLAIM is set this inode is being torn down
-	 * Pause and try again.
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimable state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
 	 */
-	if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
 		goto out_error;
 	}
 
-	/* If IRECLAIMABLE is set, we've torn down the vfs inode part */
-	if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
-		/*
-		 * If lookup is racing with unlink, then we should return an
-		 * error immediately so we don't remove it from the reclaim
-		 * list and potentially leak the inode.
-		 */
-		if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-			error = ENOENT;
-			goto out_error;
-		}
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
 
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
 		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 		/*
-		 * We need to re-initialise the VFS inode as it has been
-		 * 'freed' by the VFS. Do this here so we can deal with
-		 * errors cleanly, then tag it so it can be set up correctly
-		 * later.
+		 * We need to set XFS_INEW atomically with clearing the
+		 * reclaimable tag so that we do have an indicator of the
+		 * inode still being initialized.
 		 */
-		if (!inode_init_always(mp->m_super, VFS_I(ip))) {
-			error = ENOMEM;
-			goto out_error;
-		}
+		ip->i_flags |= XFS_INEW;
+		ip->i_flags &= ~XFS_IRECLAIMABLE;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
 
-		/*
-		 * We must set the XFS_INEW flag before clearing the
-		 * XFS_IRECLAIMABLE flag so that if a racing lookup does
-		 * not find the XFS_IRECLAIMABLE above but has the igrab()
-		 * below succeed we can safely check XFS_INEW to detect
-		 * that this inode is still being initialised.
-		 */
-		xfs_iflags_set(ip, XFS_INEW);
-		xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+		spin_unlock(&ip->i_flags_lock);
+		read_unlock(&pag->pag_ici_lock);
 
-		/* clear the radix tree reclaim flag as well. */
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-	} else if (!igrab(VFS_I(ip))) {
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			read_lock(&pag->pag_ici_lock);
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~XFS_INEW;
+			ip->i_flags |= XFS_IRECLAIMABLE;
+			__xfs_inode_set_reclaim_tag(pag, ip);
+			goto out_error;
+		}
+		inode->i_state = I_LOCK|I_NEW;
+	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
-		XFS_STATS_INC(xs_ig_frecycle);
-		goto out_error;
-	} else if (xfs_iflags_test(ip, XFS_INEW)) {
-		/*
-		 * We are racing with another cache hit that is
-		 * currently recycling this inode out of the XFS_IRECLAIMABLE
-		 * state. Wait for the initialisation to complete before
-		 * continuing.
-		 */
-		wait_on_inode(VFS_I(ip));
-	}
+		if (!igrab(inode)) {
+			error = EAGAIN;
+			goto out_error;
+		}
 
-	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		iput(VFS_I(ip));
-		goto out_error;
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		read_unlock(&pag->pag_ici_lock);
 	}
 
-	/* We've got a live one. */
-	read_unlock(&pag->pag_ici_lock);
-
 	if (lock_flags != 0)
 		xfs_ilock(ip, lock_flags);
 
@@ -215,6 +276,7 @@ xfs_iget_cache_hit(
 	return 0;
 
 out_error:
+	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
 	return error;
 }
@@ -298,7 +360,8 @@ out_preload_end:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 out_destroy:
-	xfs_destroy_inode(ip);
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
 	return error;
 }
 
@@ -500,68 +563,10 @@ xfs_ireclaim(
 	 * ilock one but will still hold the iolock.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	/*
-	 * Release dquots (and their references) if any.
-	 */
-	XFS_QM_DQDETACH(ip->i_mount, ip);
+	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-		break;
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-#ifdef XFS_INODE_TRACE
-	ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-	ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-	ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ktrace_free(ip->i_dir_trace);
-#endif
-	if (ip->i_itemp) {
-		/*
-		 * Only if we are shutting down the fs will we see an
-		 * inode still in the AIL. If it is there, we should remove
-		 * it to prevent a use-after-free from occurring.
-		 */
-		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
-		struct xfs_ail	*ailp = lip->li_ailp;
-
-		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-				XFS_FORCED_SHUTDOWN(ip->i_mount));
-		if (lip->li_flags & XFS_LI_IN_AIL) {
-			spin_lock(&ailp->xa_lock);
-			if (lip->li_flags & XFS_LI_IN_AIL)
-				xfs_trans_ail_delete(ailp, lip);
-			else
-				spin_unlock(&ailp->xa_lock);
-		}
-		xfs_inode_item_destroy(ip);
-		ip->i_itemp = NULL;
-	}
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(completion_done(&ip->i_flush));
-	kmem_zone_free(xfs_inode_zone, ip);
+	xfs_inode_free(ip);
 }
 
 /*
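The cache-hit rework above boils down to a small state decision. A toy model of the four outcomes (illustrative only; hypothetical names, not the kernel API):

enum hit_action { HIT_RETRY, HIT_ENOENT, HIT_RECYCLE, HIT_GRAB };

#define F_NEW		0x1	/* stands in for XFS_INEW */
#define F_RECLAIM	0x2	/* stands in for XFS_IRECLAIM */
#define F_RECLAIMABLE	0x4	/* stands in for XFS_IRECLAIMABLE */

static enum hit_action classify_hit(unsigned int flags, int mode, int creating)
{
	if (flags & (F_NEW | F_RECLAIM))
		return HIT_RETRY;	/* racing setup or teardown: EAGAIN */
	if (mode == 0 && !creating)
		return HIT_ENOENT;	/* racing unlink: fail the lookup */
	if (flags & F_RECLAIMABLE)
		return HIT_RECYCLE;	/* VFS inode torn down: reinitialise */
	return HIT_GRAB;		/* live inode: igrab() and go */
}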
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 123b20c8cbf2..da428b3fe0f5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,7 +49,6 @@
 #include "xfs_utils.h"
 #include "xfs_dir2_trace.h"
 #include "xfs_quota.h"
-#include "xfs_acl.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 
@@ -344,6 +343,16 @@ xfs_iformat(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
+	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		     !ip->i_mount->m_rtdev_targp)) {
+		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, has realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
 	switch (ip->i_d.di_mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f879c1bc4b96..65f24a3cc992 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,6 +18,7 @@
 #ifndef	__XFS_INODE_H__
 #define	__XFS_INODE_H__
 
+struct posix_acl;
 struct xfs_dinode;
 struct xfs_inode;
 
@@ -309,23 +310,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 }
 
 /*
- * Get rid of a partially initialized inode.
- *
- * We have to go through destroy_inode to make sure allocations
- * from init_inode_always like the security data are undone.
- *
- * We mark the inode bad so that it takes the short cut in
- * the reclaim path instead of going through the flush path
- * which doesn't make sense for an inode that has never seen the
- * light of day.
- */
-static inline void xfs_destroy_inode(struct xfs_inode *ip)
-{
-	make_bad_inode(VFS_I(ip));
-	return destroy_inode(VFS_I(ip));
-}
-
-/*
  * i_flags helper functions
  */
 static inline void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5aaa2d7ec155..67ae5555a30a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -385,7 +384,7 @@ xfs_iomap_write_direct(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-	error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -444,8 +443,7 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error_out;
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
-			qblocks, 0, quota_flag);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 	if (error)
 		goto error1;
 
@@ -495,7 +493,7 @@ xfs_iomap_write_direct(
 
 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 	xfs_bmap_cancel(&free_list);
-	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+	xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -582,7 +580,7 @@ xfs_iomap_write_delay(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-	error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -684,7 +682,8 @@ xfs_iomap_write_allocate(
 	/*
 	 * Make sure that the dquots are there.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04ede0b..9dbdff3ea484 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-	ASSERT(spin_is_locked(&log->l_icloglock));
+	assert_spin_locked(&log->l_icloglock);
 
 	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		xlog_state_switch_iclogs(log, iclog, 0);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7ba450116d4f..47da2fb45377 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1975,16 +1975,30 @@ xlog_recover_do_reg_buffer(
 	error = 0;
 	if (buf_f->blf_flags &
 	   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+		if (item->ri_buf[i].i_addr == NULL) {
+			cmn_err(CE_ALERT,
+				"XFS: NULL dquot in %s.", __func__);
+			goto next;
+		}
+		if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) {
+			cmn_err(CE_ALERT,
+				"XFS: dquot too small (%d) in %s.",
+				item->ri_buf[i].i_len, __func__);
+			goto next;
+		}
 		error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
 					item->ri_buf[i].i_addr,
 					-1, 0, XFS_QMOPT_DOWARN,
 					"dquot_buf_recover");
+		if (error)
+			goto next;
 	}
-	if (!error)
-		memcpy(xfs_buf_offset(bp,
+
+	memcpy(xfs_buf_offset(bp,
 		(uint)bit << XFS_BLI_SHIFT),	/* dest */
 		item->ri_buf[i].i_addr,		/* source */
 		nbits<<XFS_BLI_SHIFT);		/* length */
+ next:
 	i++;
 	bit += nbits;
 }
@@ -2615,7 +2629,19 @@ xlog_recover_do_dquot_trans(
 		return (0);
 
 	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-	ASSERT(recddq);
+
+	if (item->ri_buf[1].i_addr == NULL) {
+		cmn_err(CE_ALERT,
+			"XFS: NULL dquot in %s.", __func__);
+		return XFS_ERROR(EIO);
+	}
+	if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) {
+		cmn_err(CE_ALERT,
+			"XFS: dquot too small (%d) in %s.",
+			item->ri_buf[1].i_len, __func__);
+		return XFS_ERROR(EIO);
+	}
+
 	/*
 	 * This type of quotas was turned off, so ignore this record.
 	 */
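The pattern added in these two hunks generalises: a region recovered from the log is untrusted until both its pointer and its length have been checked against the structure it claims to contain. A compact sketch of that guard (hypothetical types and names; not kernel code):

#include <stddef.h>

struct recovered_region {
	void		*addr;	/* may be NULL on a torn log record */
	unsigned int	len;	/* bytes actually recovered */
};

/* returns nonzero only if the region can hold an object of 'need' bytes */
static int region_holds(const struct recovered_region *r, size_t need)
{
	return r->addr != NULL && r->len >= need;
}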
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 65a99725d0cc..5c6f092659c1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -960,6 +960,53 @@ xfs_check_sizes(xfs_mount_t *mp)
 }
 
 /*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+	struct xfs_mount	*mp)
+{
+	int			error;
+	struct xfs_trans	*tp;
+
+	mp->m_qflags = 0;
+
+	/*
+	 * It is OK to look at sb_qflags here in mount path,
+	 * without m_sb_lock.
+	 */
+	if (mp->m_sb.sb_qflags == 0)
+		return 0;
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags = 0;
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * If the fs is readonly, let the incore superblock run
+	 * with quotas off but don't flush the update out to disk
+	 */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return 0;
+
+#ifdef QUOTADEBUG
+	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_mount_reset_sbqflags: Superblock update failed!");
+		return error;
+	}
+
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
  *	- if we're a 32-bit kernel, do a size check on the superblock
@@ -976,7 +1023,8 @@ xfs_mountfs(
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
-	uint		quotamount, quotaflags;
+	uint		quotamount = 0;
+	uint		quotaflags = 0;
 	int		error = 0;
 
 	xfs_mount_common(mp, sbp);
@@ -1210,9 +1258,28 @@ xfs_mountfs(
 	/*
 	 * Initialise the XFS quota management subsystem for this mount
 	 */
-	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
-	if (error)
-		goto out_rtunmount;
+	if (XFS_IS_QUOTA_RUNNING(mp)) {
+		error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
+		if (error)
+			goto out_rtunmount;
+	} else {
+		ASSERT(!XFS_IS_QUOTA_ON(mp));
+
+		/*
+		 * If a file system had quotas running earlier, but decided to
+		 * mount without -o uquota/pquota/gquota options, revoke the
+		 * quotachecked license.
+		 */
+		if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+			cmn_err(CE_NOTE,
+				"XFS: resetting qflags for filesystem %s",
+				mp->m_fsname);
+
+			error = xfs_mount_reset_sbqflags(mp);
+			if (error)
+				return error;
+		}
+	}
 
 	/*
 	 * Finish recovering the file system.  This part needed to be
@@ -1228,9 +1295,19 @@ xfs_mountfs(
 	/*
 	 * Complete the quota initialisation, post-log-replay component.
 	 */
-	error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
-	if (error)
-		goto out_rtunmount;
+	if (quotamount) {
+		ASSERT(mp->m_qflags == 0);
+		mp->m_qflags = quotaflags;
+
+		xfs_qm_mount_quotas(mp);
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
+	else
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
+#endif
 
 	/*
 	 * Now we are mounted, reserve a small amount of unused space for
@@ -1279,12 +1356,7 @@ xfs_unmountfs(
 	__uint64_t		resblks;
 	int			error;
 
-	/*
-	 * Release dquot that rootinode, rbmino and rsumino might be holding,
-	 * and release the quota inodes.
-	 */
-	XFS_QM_UNMOUNT(mp);
-
+	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
 
@@ -1299,12 +1371,9 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
-
-	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
 
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
+	xfs_qm_unmount(mp);
 
 	/*
 	 * Flush out the log synchronously so that we know for sure
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d6a64392f983..a5122382afde 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -64,6 +64,8 @@ struct xfs_swapext;
 struct xfs_mru_cache;
 struct xfs_nameops;
 struct xfs_ail;
+struct xfs_quotainfo;
+
 
 /*
  * Prototypes and functions for the Data Migration subsystem.
@@ -107,86 +109,6 @@ typedef struct xfs_dmops {
 	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
 
 
-/*
- * Prototypes and functions for the Quota Management subsystem.
- */
-
-struct xfs_dquot;
-struct xfs_dqtrxops;
-struct xfs_quotainfo;
-
-typedef int	(*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int	(*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
-typedef void	(*xfs_qmunmount_t)(struct xfs_mount *);
-typedef void	(*xfs_qmdone_t)(struct xfs_mount *);
-typedef void	(*xfs_dqrele_t)(struct xfs_dquot *);
-typedef int	(*xfs_dqattach_t)(struct xfs_inode *, uint);
-typedef void	(*xfs_dqdetach_t)(struct xfs_inode *);
-typedef int	(*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
-typedef int	(*xfs_dqvopalloc_t)(struct xfs_mount *,
-			struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-			struct xfs_dquot **, struct xfs_dquot **);
-typedef void	(*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot *, struct xfs_dquot *);
-typedef int	(*xfs_dqvoprename_t)(struct xfs_inode **);
-typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
-			struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot **, struct xfs_dquot *);
-typedef int	(*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot *, struct xfs_dquot *, uint);
-typedef void	(*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
-typedef int	(*xfs_dqsync_t)(struct xfs_mount *, int flags);
-
-typedef struct xfs_qmops {
-	xfs_qminit_t		xfs_qminit;
-	xfs_qmdone_t		xfs_qmdone;
-	xfs_qmmount_t		xfs_qmmount;
-	xfs_qmunmount_t		xfs_qmunmount;
-	xfs_dqrele_t		xfs_dqrele;
-	xfs_dqattach_t		xfs_dqattach;
-	xfs_dqdetach_t		xfs_dqdetach;
-	xfs_dqpurgeall_t	xfs_dqpurgeall;
-	xfs_dqvopalloc_t	xfs_dqvopalloc;
-	xfs_dqvopcreate_t	xfs_dqvopcreate;
-	xfs_dqvoprename_t	xfs_dqvoprename;
-	xfs_dqvopchown_t	xfs_dqvopchown;
-	xfs_dqvopchownresv_t	xfs_dqvopchownresv;
-	xfs_dqstatvfs_t		xfs_dqstatvfs;
-	xfs_dqsync_t		xfs_dqsync;
-	struct xfs_dqtrxops	*xfs_dqtrxops;
-} xfs_qmops_t;
-
-#define XFS_QM_INIT(mp, mnt, fl) \
-	(*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl) \
-	(*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
-#define XFS_QM_UNMOUNT(mp) \
-	(*(mp)->m_qm_ops->xfs_qmunmount)(mp)
-#define XFS_QM_DONE(mp) \
-	(*(mp)->m_qm_ops->xfs_qmdone)(mp)
-#define XFS_QM_DQRELE(mp, dq) \
-	(*(mp)->m_qm_ops->xfs_dqrele)(dq)
-#define XFS_QM_DQATTACH(mp, ip, fl) \
-	(*(mp)->m_qm_ops->xfs_dqattach)(ip, fl)
-#define XFS_QM_DQDETACH(mp, ip) \
-	(*(mp)->m_qm_ops->xfs_dqdetach)(ip)
-#define XFS_QM_DQPURGEALL(mp, fl) \
-	(*(mp)->m_qm_ops->xfs_dqpurgeall)(mp, fl)
-#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \
-	(*(mp)->m_qm_ops->xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2)
-#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
-	(*(mp)->m_qm_ops->xfs_dqvopcreate)(tp, ip, dq1, dq2)
-#define XFS_QM_DQVOPRENAME(mp, ip) \
-	(*(mp)->m_qm_ops->xfs_dqvoprename)(ip)
-#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
-	(*(mp)->m_qm_ops->xfs_dqvopchown)(tp, ip, dqp, dq)
-#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
-	(*(mp)->m_qm_ops->xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
-#define XFS_QM_DQSTATVFS(ip, statp) \
-	(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
-#define XFS_QM_DQSYNC(mp, flags) \
-	(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-
 #ifdef HAVE_PERCPU_SB
 
 /*
@@ -510,8 +432,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
-extern int	xfs_qmops_get(struct xfs_mount *);
-extern void	xfs_qmops_put(struct xfs_mount *);
 
 extern struct xfs_dmops xfs_dmcore_xfs;
 
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
deleted file mode 100644
index e101790ea8e7..000000000000
--- a/fs/xfs/xfs_qmops.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_error.h"
-
-
-STATIC struct xfs_dquot *
-xfs_dqvopchown_default(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	struct xfs_dquot	**dqp,
-	struct xfs_dquot	*dq)
-{
-	return NULL;
-}
-
-/*
- * Clear the quotaflags in memory and in the superblock.
- */
-int
-xfs_mount_reset_sbqflags(xfs_mount_t *mp)
-{
-	int			error;
-	xfs_trans_t		*tp;
-
-	mp->m_qflags = 0;
-	/*
-	 * It is OK to look at sb_qflags here in mount path,
-	 * without m_sb_lock.
-	 */
-	if (mp->m_sb.sb_qflags == 0)
-		return 0;
-	spin_lock(&mp->m_sb_lock);
-	mp->m_sb.sb_qflags = 0;
-	spin_unlock(&mp->m_sb_lock);
-
-	/*
-	 * if the fs is readonly, let the incore superblock run
-	 * with quotas off but don't flush the update out to disk
-	 */
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return 0;
-#ifdef QUOTADEBUG
-	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
-#endif
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				      XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_mount_reset_sbqflags: Superblock update failed!");
-		return error;
-	}
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
-	error = xfs_trans_commit(tp, 0);
-	return error;
-}
-
-STATIC int
-xfs_noquota_init(
-	xfs_mount_t	*mp,
-	uint		*needquotamount,
-	uint		*quotaflags)
-{
-	int	error = 0;
-
-	*quotaflags = 0;
-	*needquotamount = B_FALSE;
-
-	ASSERT(!XFS_IS_QUOTA_ON(mp));
-
-	/*
-	 * If a file system had quotas running earlier, but decided to
-	 * mount without -o uquota/pquota/gquota options, revoke the
-	 * quotachecked license.
-	 */
-	if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
-		cmn_err(CE_NOTE,
-			"XFS resetting qflags for filesystem %s",
-			mp->m_fsname);
-
-		error = xfs_mount_reset_sbqflags(mp);
-	}
-	return error;
-}
-
-static struct xfs_qmops xfs_qmcore_stub = {
-	.xfs_qminit		= (xfs_qminit_t) xfs_noquota_init,
-	.xfs_qmdone		= (xfs_qmdone_t) fs_noerr,
-	.xfs_qmmount		= (xfs_qmmount_t) fs_noerr,
-	.xfs_qmunmount		= (xfs_qmunmount_t) fs_noerr,
-	.xfs_dqrele		= (xfs_dqrele_t) fs_noerr,
-	.xfs_dqattach		= (xfs_dqattach_t) fs_noerr,
-	.xfs_dqdetach		= (xfs_dqdetach_t) fs_noerr,
-	.xfs_dqpurgeall		= (xfs_dqpurgeall_t) fs_noerr,
-	.xfs_dqvopalloc		= (xfs_dqvopalloc_t) fs_noerr,
-	.xfs_dqvopcreate	= (xfs_dqvopcreate_t) fs_noerr,
-	.xfs_dqvoprename	= (xfs_dqvoprename_t) fs_noerr,
-	.xfs_dqvopchown		= xfs_dqvopchown_default,
-	.xfs_dqvopchownresv	= (xfs_dqvopchownresv_t) fs_noerr,
-	.xfs_dqstatvfs		= (xfs_dqstatvfs_t) fs_noval,
-	.xfs_dqsync		= (xfs_dqsync_t) fs_noerr,
-};
-
-int
-xfs_qmops_get(struct xfs_mount *mp)
-{
-	if (XFS_IS_QUOTA_RUNNING(mp)) {
-#ifdef CONFIG_XFS_QUOTA
-		mp->m_qm_ops = &xfs_qmcore_xfs;
-#else
-		cmn_err(CE_WARN,
139 "XFS: qouta support not available in this kernel.");
-                return EINVAL;
-#endif
-        } else {
-                mp->m_qm_ops = &xfs_qmcore_stub;
-        }
-
-        return 0;
-}
-
-void
-xfs_qmops_put(struct xfs_mount *mp)
-{
-}
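
Note the trick the deleted stub table relied on: generic no-op helpers (fs_noerr, fs_noval) cast to each operation's function-pointer type. A reduced illustration of that pattern, continuing the hypothetical types from the sketch above (calling through a cast, incompatible function-pointer type is technically undefined behaviour in ISO C, one more reason to prefer the #ifdef stubs that replace it):

        /* Sketch: one no-op helper cast to many operation signatures. */
        static int fs_noerr(void) { return 0; }

        typedef int (*qm_mount_t)(struct mount_ctx *);
        typedef int (*qm_sync_t)(struct mount_ctx *, int);

        static const struct qm_ops stub_ops = {
                .qm_mount = (qm_mount_t)fs_noerr,  /* always "succeeds" */
                .qm_sync  = (qm_sync_t)fs_noerr,
        };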
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index f5d1202dde25..3ec91ac74c2a 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -197,7 +197,6 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_UMOUNTING     0x0000100 /* filesys is being unmounted */
 #define XFS_QMOPT_DOLOG         0x0000200 /* log buf changes (in quotacheck) */
 #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_ILOCKED       0x0000800 /* inode is already locked (excl) */
 #define XFS_QMOPT_DQREPAIR      0x0001000 /* repair dquot if damaged */
 #define XFS_QMOPT_GQUOTA        0x0002000 /* group dquot requested */
 #define XFS_QMOPT_ENOSPC        0x0004000 /* enospc instead of edquot (prj) */
@@ -302,69 +301,79 @@ typedef struct xfs_dqtrx {
         long            qt_delrtb_delta;  /* delayed RT blk count changes */
 } xfs_dqtrx_t;
 
-/*
- * Dquot transaction functions, used if quota is enabled.
- */
-typedef void    (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *);
-typedef void    (*qo_mod_dquot_byino_t)(struct xfs_trans *,
-                        struct xfs_inode *, uint, long);
-typedef void    (*qo_free_dqinfo_t)(struct xfs_trans *);
-typedef void    (*qo_apply_dquot_deltas_t)(struct xfs_trans *);
-typedef void    (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *);
-typedef int     (*qo_reserve_quota_nblks_t)(
-                        struct xfs_trans *, struct xfs_mount *,
-                        struct xfs_inode *, long, long, uint);
-typedef int     (*qo_reserve_quota_bydquots_t)(
-                        struct xfs_trans *, struct xfs_mount *,
-                        struct xfs_dquot *, struct xfs_dquot *,
-                        long, long, uint);
-typedef struct xfs_dqtrxops {
-        qo_dup_dqinfo_t                 qo_dup_dqinfo;
-        qo_free_dqinfo_t                qo_free_dqinfo;
-        qo_mod_dquot_byino_t            qo_mod_dquot_byino;
-        qo_apply_dquot_deltas_t         qo_apply_dquot_deltas;
-        qo_reserve_quota_nblks_t        qo_reserve_quota_nblks;
-        qo_reserve_quota_bydquots_t     qo_reserve_quota_bydquots;
-        qo_unreserve_and_mod_dquots_t   qo_unreserve_and_mod_dquots;
-} xfs_dqtrxops_t;
-
-#define XFS_DQTRXOP(mp, tp, op, args...) \
-        ((mp)->m_qm_ops->xfs_dqtrxops ? \
-        ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : 0)
-
-#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \
-        ((mp)->m_qm_ops->xfs_dqtrxops ? \
-        ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : (void)0)
-
-#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \
-        XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp)
-#define XFS_TRANS_FREE_DQINFO(mp, tp) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo)
-#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta)
-#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas)
-#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \
-        XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl)
-#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \
-        XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl)
-#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots)
-
-#define XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, flags) \
-        XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), -(ninos), flags)
-#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
-        XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \
-                                f | XFS_QMOPT_RES_REGBLKS)
-#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
-        XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
+                uint, long);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+                struct xfs_inode *, long, long, uint);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+                struct xfs_mount *, struct xfs_dquot *,
+                struct xfs_dquot *, long, long, uint);
+
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
+                struct xfs_dquot **, struct xfs_dquot **);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
+                struct xfs_dquot *, struct xfs_dquot *);
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
+                struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
+extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
+                struct xfs_dquot *, struct xfs_dquot *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
+extern int xfs_qm_sync(struct xfs_mount *, int);
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
+extern void xfs_qm_unmount(struct xfs_mount *);
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+
+#else
+static inline int
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
+                uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+{
+        *udqp = NULL;
+        *gdqp = NULL;
+        return 0;
+}
+#define xfs_trans_dup_dqinfo(tp, tp2)
+#define xfs_trans_free_dqinfo(tp)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_apply_dquot_deltas(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)  (0)
+#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)  (0)
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_rename_dqattach(it)          (0)
+#define xfs_qm_vop_chown(tp, ip, old, new)      (NULL)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl)  (0)
+#define xfs_qm_dqattach(ip, fl)                 (0)
+#define xfs_qm_dqattach_locked(ip, fl)          (0)
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d)
+#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_sync(mp, fl)                     (0)
+#define xfs_qm_newmount(mp, a, b)               (0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)               (0)
+#endif /* CONFIG_XFS_QUOTA */
+
+#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+        xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
+        xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
                                 f | XFS_QMOPT_RES_REGBLKS)
 
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
-extern struct xfs_qmops xfs_qmcore_xfs;
-
 #endif /* __KERNEL__ */
-
 #endif /* __XFS_QUOTA_H__ */
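
The shape of the change above is worth spelling out: an ops vector consulted at run time becomes a compile-time switch. A minimal sketch of the pattern, using a hypothetical subsystem name:

        #ifdef CONFIG_FOO
        extern int foo_reserve(struct foo_ctx *cp, long nblks);
        #else
        /*
         * Compile-out stubs: statement-like calls expand to nothing,
         * value-like calls expand to a constant 0 ("success").
         */
        #define foo_reserve(cp, nblks)  (0)
        #endif

With CONFIG_XFS_QUOTA disabled, the stubs above let callers keep writing plain error checks such as "error = xfs_qm_dqattach(ip, 0);" and have the compiler discard the dead branch entirely.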
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 58f85e9cd11d..b81deea0ce19 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -166,7 +166,8 @@ xfs_rename(
         /*
          * Attach the dquots to the inodes
          */
-        if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
+        error = xfs_qm_vop_rename_dqattach(inodes);
+        if (error) {
                 xfs_trans_cancel(tp, cancel_flags);
                 goto std_return;
         }
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 36f3a21c54d2..fea68615ed23 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -41,7 +41,6 @@
 #include "xfs_ialloc.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
-#include "xfs_acl.h"
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8570b826fedd..66b849358e62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -297,7 +297,7 @@ xfs_trans_dup(
         tp->t_rtx_res = tp->t_rtx_res_used;
         ntp->t_pflags = tp->t_pflags;
 
-        XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
+        xfs_trans_dup_dqinfo(tp, ntp);
 
         atomic_inc(&tp->t_mountp->m_active_trans);
         return ntp;
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas(
         xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
                           offsetof(xfs_dsb_t, sb_frextents) +
                           sizeof(sbp->sb_frextents) - 1);
-
-        tp->t_mountp->m_super->s_dirt = 1;
 }
 
 /*
@@ -831,7 +829,7 @@ shut_us_down:
                  * means is that we have some (non-persistent) quota
                  * reservations that need to be unreserved.
                  */
-                XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+                xfs_trans_unreserve_and_mod_dquots(tp);
                 if (tp->t_ticket) {
                         commit_lsn = xfs_log_done(mp, tp->t_ticket,
                                                         NULL, log_flags);
@@ -850,10 +848,9 @@ shut_us_down:
         /*
          * If we need to update the superblock, then do it now.
          */
-        if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+        if (tp->t_flags & XFS_TRANS_SB_DIRTY)
                 xfs_trans_apply_sb_deltas(tp);
-        }
-        XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);
+        xfs_trans_apply_dquot_deltas(tp);
 
         /*
          * Ask each log item how many log_vector entries it will
@@ -1058,7 +1055,7 @@ xfs_trans_uncommit(
         }
 
         xfs_trans_unreserve_and_mod_sb(tp);
-        XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
+        xfs_trans_unreserve_and_mod_dquots(tp);
 
         xfs_trans_free_items(tp, flags);
         xfs_trans_free_busy(tp);
@@ -1183,7 +1180,7 @@ xfs_trans_cancel(
         }
 #endif
         xfs_trans_unreserve_and_mod_sb(tp);
-        XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+        xfs_trans_unreserve_and_mod_dquots(tp);
 
         if (tp->t_ticket) {
                 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
@@ -1213,7 +1210,7 @@ xfs_trans_free(
         xfs_trans_t     *tp)
 {
         atomic_dec(&tp->t_mountp->m_active_trans);
-        XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+        xfs_trans_free_dqinfo(tp);
         kmem_zone_free(xfs_trans_zone, tp);
 }
 
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 79b9e5ea5359..4d88616bde91 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -166,7 +166,7 @@ xfs_dir_ialloc(
                 xfs_buf_relse(ialloc_context);
                 if (dqinfo) {
                         tp->t_dqinfo = dqinfo;
-                        XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+                        xfs_trans_free_dqinfo(tp);
                 }
                 *tpp = ntp;
                 *ipp = NULL;
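
A recurring cleanup in the xfs_vnodeops.c hunks below is also worth noting: the old code folded the call into the condition, the new code separates them, matching the style that checkpatch.pl enforces ("do not use assignment in if condition"). Using a function that actually appears in this diff:

        /* Before: assignment hidden inside the if condition. */
        if ((error = xfs_qm_dqattach(ip, 0)))
                return error;

        /* After: the call reads as a statement, the check stands alone. */
        error = xfs_qm_dqattach(ip, 0);
        if (error)
                return error;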
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 19cf90a9c762..492d75bae2bf 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -42,6 +42,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_rw.h"
 #include "xfs_error.h"
@@ -118,7 +119,7 @@ xfs_setattr(
                  */
                 ASSERT(udqp == NULL);
                 ASSERT(gdqp == NULL);
-                code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
+                code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
                                          qflags, &udqp, &gdqp);
                 if (code)
                         return code;
@@ -180,10 +181,11 @@ xfs_setattr(
                  * Do a quota reservation only if uid/gid is actually
                  * going to change.
                  */
-                if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-                    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
+                if (XFS_IS_QUOTA_RUNNING(mp) &&
+                    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+                     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
                         ASSERT(tp);
-                        code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+                        code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                 capable(CAP_FOWNER) ?
                                                 XFS_QMOPT_FORCE_RES : 0);
                         if (code)       /* out of quota */
@@ -217,7 +219,7 @@ xfs_setattr(
         /*
          * Make sure that the dquots are attached to the inode.
          */
-        code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+        code = xfs_qm_dqattach_locked(ip, 0);
         if (code)
                 goto error_return;
 
@@ -351,21 +353,21 @@ xfs_setattr(
                  * in the transaction.
                  */
                 if (iuid != uid) {
-                        if (XFS_IS_UQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
                                 ASSERT(mask & ATTR_UID);
                                 ASSERT(udqp);
-                                olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot1 = xfs_qm_vop_chown(tp, ip,
                                                         &ip->i_udquot, udqp);
                         }
                         ip->i_d.di_uid = uid;
                         inode->i_uid = uid;
                 }
                 if (igid != gid) {
-                        if (XFS_IS_GQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
                                 ASSERT(mask & ATTR_GID);
                                 ASSERT(gdqp);
-                                olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot2 = xfs_qm_vop_chown(tp, ip,
                                                         &ip->i_gdquot, gdqp);
                         }
                         ip->i_d.di_gid = gid;
@@ -461,13 +463,25 @@ xfs_setattr(
         /*
          * Release any dquot(s) the inode had kept before chown.
          */
-        XFS_QM_DQRELE(mp, olddquot1);
-        XFS_QM_DQRELE(mp, olddquot2);
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(olddquot1);
+        xfs_qm_dqrele(olddquot2);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
-        if (code) {
+        if (code)
                 return code;
+
+        /*
+         * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+         *           update. We could avoid this with linked transactions
+         *           and passing down the transaction pointer all the way
+         *           to attr_set. No previous user of the generic
+         *           Posix ACL code seems to care about this issue either.
+         */
+        if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+                code = -xfs_acl_chmod(inode);
+                if (code)
+                        return XFS_ERROR(code);
         }
 
         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
@@ -482,8 +496,8 @@ xfs_setattr(
                 commit_flags |= XFS_TRANS_ABORT;
                 /* FALLTHROUGH */
  error_return:
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
         if (tp) {
                 xfs_trans_cancel(tp, commit_flags);
         }
@@ -524,7 +538,9 @@ xfs_readlink_bmap(
                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-                bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+                bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+                                        XBF_LOCK | XBF_MAPPED |
+                                        XBF_DONT_BLOCK);
                 error = XFS_BUF_GETERROR(bp);
                 if (error) {
                         xfs_ioerror_alert("xfs_readlink",
@@ -739,7 +755,8 @@ xfs_free_eofblocks(
         /*
          * Attach the dquots to the inode up front.
          */
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return error;
 
         /*
@@ -1181,7 +1198,8 @@ xfs_inactive(
 
         ASSERT(ip->i_d.di_nlink == 0);
 
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return VN_INACTIVE_CACHE;
 
         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
@@ -1307,7 +1325,7 @@ xfs_inactive(
         /*
          * Credit the quota account(s). The inode is gone.
          */
-        XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
         /*
          * Just ignore errors at this point. There is nothing we can
@@ -1323,11 +1341,11 @@ xfs_inactive(
                 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                         "xfs_trans_commit() returned error %d", error);
         }
+
         /*
          * Release the dquots held by inode, if any.
          */
-        XFS_QM_DQDETACH(mp, ip);
-
+        xfs_qm_dqdetach(ip);
         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
  out:
@@ -1427,8 +1445,7 @@ xfs_create(
         /*
          * Make sure that we have allocated dquot(s) on disk.
          */
-        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(), current_fsgid(), prid,
+        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
         if (error)
                 goto std_return;
@@ -1489,7 +1506,7 @@ xfs_create(
         /*
          * Reserve disk quota and the inode.
          */
-        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
         if (error)
                 goto out_trans_cancel;
 
@@ -1561,7 +1578,7 @@ xfs_create(
          * These ids of the inode couldn't have changed since the new
          * inode has been locked ever since it was created.
          */
-        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
         /*
          * xfs_trans_commit normally decrements the vnode ref count
@@ -1580,8 +1597,8 @@ xfs_create(
                 goto out_dqrele;
         }
 
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         *ipp = ip;
 
@@ -1602,8 +1619,8 @@ xfs_create(
  out_trans_cancel:
         xfs_trans_cancel(tp, cancel_flags);
  out_dqrele:
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         if (unlock_dp_on_error)
                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1837,11 +1854,11 @@ xfs_remove(
                 return error;
         }
 
-        error = XFS_QM_DQATTACH(mp, dp, 0);
+        error = xfs_qm_dqattach(dp, 0);
         if (error)
                 goto std_return;
 
-        error = XFS_QM_DQATTACH(mp, ip, 0);
+        error = xfs_qm_dqattach(ip, 0);
         if (error)
                 goto std_return;
 
@@ -2028,11 +2045,11 @@ xfs_link(
 
         /* Return through std_return after this point. */
 
-        error = XFS_QM_DQATTACH(mp, sip, 0);
+        error = xfs_qm_dqattach(sip, 0);
         if (error)
                 goto std_return;
 
-        error = XFS_QM_DQATTACH(mp, tdp, 0);
+        error = xfs_qm_dqattach(tdp, 0);
         if (error)
                 goto std_return;
 
@@ -2205,8 +2222,7 @@ xfs_symlink(
         /*
          * Make sure that we have allocated dquot(s) on disk.
          */
-        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(), current_fsgid(), prid,
+        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
         if (error)
                 goto std_return;
@@ -2248,7 +2264,7 @@ xfs_symlink(
         /*
          * Reserve disk quota : blocks and inode.
          */
-        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
         if (error)
                 goto error_return;
 
@@ -2288,7 +2304,7 @@ xfs_symlink(
         /*
          * Also attach the dquot(s) to it, if applicable.
          */
-        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
         if (resblks)
                 resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -2376,8 +2392,8 @@ xfs_symlink(
                 goto error2;
         }
         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         /* Fall through to std_return with error = 0 or errno from
          * xfs_trans_commit */
@@ -2401,8 +2417,8 @@ std_return:
         cancel_flags |= XFS_TRANS_ABORT;
  error_return:
         xfs_trans_cancel(tp, cancel_flags);
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         if (unlock_dp_on_error)
                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -2541,7 +2557,8 @@ xfs_alloc_file_space(
         if (XFS_FORCED_SHUTDOWN(mp))
                 return XFS_ERROR(EIO);
 
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return error;
 
         if (len <= 0)
@@ -2628,8 +2645,8 @@ retry:
                         break;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
-                error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
-                                                      qblocks, 0, quota_flag);
+                error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+                                                      0, quota_flag);
                 if (error)
                         goto error1;
 
@@ -2688,7 +2705,7 @@ dmapi_enospc_check:
 
 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
         xfs_bmap_cancel(&free_list);
-        XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+        xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1: /* Just cancel transaction */
         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -2827,7 +2844,8 @@ xfs_free_file_space(
 
         xfs_itrace_entry(ip);
 
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return error;
 
         error = 0;
@@ -2953,9 +2971,9 @@ xfs_free_file_space(
                         break;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
-                error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
-                                ip->i_udquot, ip->i_gdquot, resblks, 0,
-                                XFS_QMOPT_RES_REGBLKS);
+                error = xfs_trans_reserve_quota(tp, mp,
+                                ip->i_udquot, ip->i_gdquot,
+                                resblks, 0, XFS_QMOPT_RES_REGBLKS);
                 if (error)
                         goto error1;
 
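
One detail in the xfs_setattr hunk above deserves a comment: xfs_acl_chmod() follows the usual Linux convention of returning a negative errno, while XFS internally passes positive error codes around, hence the negation before the XFS_ERROR() wrapper:

        /* Negate the Linux-style return into XFS's positive-errno world. */
        code = -xfs_acl_chmod(inode);
        if (code)
                return XFS_ERROR(code);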
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 04373c6c61ff..a9e102de71a1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
 #define XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK         0x04    /* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL          0x08    /* Don't call xfs_acl_chmod */
 
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_fsync(struct xfs_inode *ip);
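
The new XFS_ATTR_NOACL flag closes a recursion hazard: xfs_setattr now calls xfs_acl_chmod on mode changes, so a caller that is itself applying an ACL needs a way to opt out. A hedged sketch of such a caller (hypothetical helper name; the real call site lives in the ACL code):

        /* Apply the mode derived from an ACL without re-entering ACL code. */
        static int set_mode_from_acl(struct xfs_inode *ip, umode_t mode)
        {
                struct iattr iattr;

                iattr.ia_valid = ATTR_MODE;
                iattr.ia_mode = mode;
                return xfs_setattr(ip, &iattr, XFS_ATTR_NOACL);
        }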